ipfw: Add per-cpu table support.
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79
80 #include <net/ipfw/ip_fw2.h>
81
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85         if (fw_debug > 0) \
86                 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)       ((void)0)
90 #endif
91
92 /*
93  * Description about per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * Following graph shows how operation on per-CPU rule list is
99  * performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the current rule's duplication
138  *    to the other CPUs could safely be read-only accessed through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of relocating the rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
153
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states we do not create anymore.  This is done to
170  * avoid consuming too much memory, but also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks we do not create anymore.  This is done to
203  * avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers, track counter RB tree is
206  * shared between CPUs, track RB tree is per-cpu.  States generated by
207  * 'limit' option are linked to the track in addition to the per-cpu
208  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
243
244 #define IPFW_AUTOINC_STEP_MIN   1
245 #define IPFW_AUTOINC_STEP_MAX   1000
246 #define IPFW_AUTOINC_STEP_DEF   100
247
248 #define IPFW_TABLE_MAX_DEF      64
249
250 #define IPFW_DEFAULT_RULE       65535   /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET        31      /* set number for the default rule */
252
253 #define MATCH_REVERSE           0
254 #define MATCH_FORWARD           1
255 #define MATCH_NONE              2
256 #define MATCH_UNKNOWN           3
257
258 #define IPFW_STATE_TCPFLAGS     (TH_SYN | TH_FIN | TH_RST)
259 #define IPFW_STATE_TCPSTATES    (IPFW_STATE_TCPFLAGS |  \
260                                  (IPFW_STATE_TCPFLAGS << 8))
261
262 #define BOTH_SYN                (TH_SYN | (TH_SYN << 8))
263 #define BOTH_FIN                (TH_FIN | (TH_FIN << 8))
264 #define BOTH_RST                (TH_RST | (TH_RST << 8))
265 /* TH_ACK here means FIN was ACKed. */
266 #define BOTH_FINACK             (TH_ACK | (TH_ACK << 8))
267
268 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP &&        \
269                                  (((s)->st_state & BOTH_RST) ||         \
270                                   ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
271
272 #define O_ANCHOR                O_NOP
273
274 struct netmsg_ipfw {
275         struct netmsg_base      base;
276         const struct ipfw_ioc_rule *ioc_rule;
277         struct ip_fw            *next_rule;
278         struct ip_fw            *prev_rule;
279         struct ip_fw            *sibling;
280         uint32_t                rule_flags;
281 };
282
283 struct netmsg_del {
284         struct netmsg_base      base;
285         struct ip_fw            *start_rule;
286         struct ip_fw            *prev_rule;
287         uint16_t                rulenum;
288         uint8_t                 from_set;
289         uint8_t                 to_set;
290 };
291
292 struct netmsg_zent {
293         struct netmsg_base      base;
294         struct ip_fw            *start_rule;
295         uint16_t                rulenum;
296         uint16_t                log_only;
297 };
298
299 struct netmsg_cpstate {
300         struct netmsg_base      base;
301         struct ipfw_ioc_state   *ioc_state;
302         int                     state_cntmax;
303         int                     state_cnt;
304 };
305
306 struct netmsg_tblent {
307         struct netmsg_base      base;
308         struct sockaddr         *key;
309         struct sockaddr         *netmask;
310         struct ipfw_tblent      *sibling;
311         int                     tableid;
312 };
313
314 struct netmsg_tblflush {
315         struct netmsg_base      base;
316         int                     tableid;
317         int                     destroy;
318 };
319
320 struct netmsg_tblexp {
321         struct netmsg_base      base;
322         time_t                  expire;
323         int                     tableid;
324         int                     cnt;
325         int                     expcnt;
326         struct radix_node_head  *rnh;
327 };
328
329 struct ipfw_table_cp {
330         struct ipfw_ioc_tblent  *te;
331         int                     te_idx;
332         int                     te_cnt;
333 };
334
335 struct ipfw_addrs {
336         uint32_t                addr1;
337         uint32_t                addr2;
338 };
339
340 struct ipfw_ports {
341         uint16_t                port1;
342         uint16_t                port2;
343 };
344
/*
 * Lookup key shared by states and tracks: the address pair, port pair
 * and protocol of a session.  The unions let both addresses (ports)
 * be accessed as a single 64-bit (32-bit) word, e.g. through the
 * tc_addrs/t_addrs/st_addrs accessor macros below.
 */
345 struct ipfw_key {
346         union {
347                 struct ipfw_addrs addrs;
348                 uint64_t        value;  /* both addresses as one word */
349         } addr_u;
350         union {
351                 struct ipfw_ports ports;
352                 uint32_t        value;  /* both ports as one word */
353         } port_u;
354         uint8_t                 proto;
355         uint8_t                 swap;   /* IPFW_KEY_SWAP_ */
356         uint16_t                rsvd2;  /* reserved; presumably pads to 8-byte multiple */
357 };
358
359 #define IPFW_KEY_SWAP_ADDRS     0x1
360 #define IPFW_KEY_SWAP_PORTS     0x2
361 #define IPFW_KEY_SWAP_ALL       (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
362
363 struct ipfw_trkcnt {
364         RB_ENTRY(ipfw_trkcnt)   tc_rblink;
365         struct ipfw_key         tc_key;
366         uintptr_t               tc_ruleid;
367         int                     tc_refs;
368         int                     tc_count;
369         time_t                  tc_expire;      /* userland get-only */
370         uint16_t                tc_rulenum;     /* userland get-only */
371 } __cachealign;
372
373 #define tc_addrs                tc_key.addr_u.value
374 #define tc_ports                tc_key.port_u.value
375 #define tc_proto                tc_key.proto
376 #define tc_saddr                tc_key.addr_u.addrs.addr1
377 #define tc_daddr                tc_key.addr_u.addrs.addr2
378 #define tc_sport                tc_key.port_u.ports.port1
379 #define tc_dport                tc_key.port_u.ports.port2
380
381 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
382
383 struct ipfw_state;
384
385 struct ipfw_track {
386         RB_ENTRY(ipfw_track)    t_rblink;
387         struct ipfw_key         t_key;
388         struct ip_fw            *t_rule;
389         time_t                  t_lastexp;
390         LIST_HEAD(, ipfw_state) t_state_list;
391         time_t                  t_expire;
392         volatile int            *t_count;
393         struct ipfw_trkcnt      *t_trkcnt;
394         TAILQ_ENTRY(ipfw_track) t_link;
395 };
396
397 #define t_addrs                 t_key.addr_u.value
398 #define t_ports                 t_key.port_u.value
399 #define t_proto                 t_key.proto
400 #define t_saddr                 t_key.addr_u.addrs.addr1
401 #define t_daddr                 t_key.addr_u.addrs.addr2
402 #define t_sport                 t_key.port_u.ports.port1
403 #define t_dport                 t_key.port_u.ports.port2
404
405 RB_HEAD(ipfw_track_tree, ipfw_track);
406 TAILQ_HEAD(ipfw_track_list, ipfw_track);
407
408 struct ipfw_state {
409         RB_ENTRY(ipfw_state)    st_rblink;
410         struct ipfw_key         st_key;
411
412         time_t                  st_expire;      /* expire time */
413         struct ip_fw            *st_rule;
414
415         uint64_t                st_pcnt;        /* packets */
416         uint64_t                st_bcnt;        /* bytes */
417
418         /*
419          * st_state:
420          * State of this rule, typically a combination of TCP flags.
421          *
422          * st_ack_fwd/st_ack_rev:
423          * Most recent ACKs in forward and reverse direction.  They
424          * are used to generate keepalives.
425          */
426         uint32_t                st_state;
427         uint32_t                st_ack_fwd;
428         uint32_t                st_seq_fwd;
429         uint32_t                st_ack_rev;
430         uint32_t                st_seq_rev;
431
432         uint16_t                st_flags;       /* IPFW_STATE_F_ */
433         uint16_t                st_type;        /* O_KEEP_STATE/O_LIMIT */
434         struct ipfw_track       *st_track;
435
436         LIST_ENTRY(ipfw_state)  st_trklink;
437         TAILQ_ENTRY(ipfw_state) st_link;
438 };
439
440 #define st_addrs                st_key.addr_u.value
441 #define st_ports                st_key.port_u.value
442 #define st_proto                st_key.proto
443 #define st_swap                 st_key.swap
444
445 #define IPFW_STATE_F_ACKFWD     0x0001
446 #define IPFW_STATE_F_SEQFWD     0x0002
447 #define IPFW_STATE_F_ACKREV     0x0004
448 #define IPFW_STATE_F_SEQREV     0x0008
449
450 TAILQ_HEAD(ipfw_state_list, ipfw_state);
451 RB_HEAD(ipfw_state_tree, ipfw_state);
452
/*
 * Table entry, kept in a per-table radix tree
 * (ipfw_context.ipfw_tables[]), keyed by te_key.
 * te_sibling links this entry's duplicate on the next CPU,
 * presumably mirroring the ip_fw.sibling scheme described above --
 * confirm against the table add/copy code.
 */
453 struct ipfw_tblent {
454         struct radix_node       te_nodes[2];    /* radix tree linkage */
455         struct sockaddr_in      te_key;
456         u_long                  te_use;         /* use counter (name-based; verify) */
457         time_t                  te_lastuse;     /* last-use timestamp (name-based; verify) */
458         struct ipfw_tblent      *te_sibling;    /* same entry on next CPU */
459         volatile int            te_expired;
460 };
461
462 struct ipfw_context {
463         struct ip_fw            *ipfw_layer3_chain;     /* rules for layer3 */
464         struct ip_fw            *ipfw_default_rule;     /* default rule */
465         uint64_t                ipfw_norule_counter;    /* ipfw_log(NULL) stat*/
466
467         /*
468          * ipfw_set_disable contains one bit per set value (0..31).
469          * If the bit is set, all rules with the corresponding set
470  * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
471          * default rule and CANNOT be disabled.
472          */
473         uint32_t                ipfw_set_disable;
474
475         uint8_t                 ipfw_flags;     /* IPFW_FLAG_ */
476
477         struct ipfw_state_tree  ipfw_state_tree;
478         struct ipfw_state_list  ipfw_state_list;
479         int                     ipfw_state_loosecnt;
480         int                     ipfw_state_cnt;
481
482         union {
483                 struct ipfw_state state;
484                 struct ipfw_track track;
485                 struct ipfw_trkcnt trkcnt;
486         } ipfw_tmpkey;
487
488         struct ipfw_track_tree  ipfw_track_tree;
489         struct ipfw_track_list  ipfw_track_list;
490         struct ipfw_trkcnt      *ipfw_trkcnt_spare;
491
492         struct callout          ipfw_stateto_ch;
493         time_t                  ipfw_state_lastexp;
494         struct netmsg_base      ipfw_stateexp_nm;
495         struct netmsg_base      ipfw_stateexp_more;
496         struct ipfw_state       ipfw_stateexp_anch;
497
498         struct callout          ipfw_trackto_ch;
499         time_t                  ipfw_track_lastexp;
500         struct netmsg_base      ipfw_trackexp_nm;
501         struct netmsg_base      ipfw_trackexp_more;
502         struct ipfw_track       ipfw_trackexp_anch;
503
504         struct callout          ipfw_keepalive_ch;
505         struct netmsg_base      ipfw_keepalive_nm;
506         struct netmsg_base      ipfw_keepalive_more;
507         struct ipfw_state       ipfw_keepalive_anch;
508
509         /*
510          * Statistics
511          */
512         u_long                  ipfw_sts_reap;
513         u_long                  ipfw_sts_reapfailed;
514         u_long                  ipfw_sts_overflow;
515         u_long                  ipfw_sts_nomem;
516         u_long                  ipfw_sts_tcprecycled;
517
518         u_long                  ipfw_tks_nomem;
519         u_long                  ipfw_tks_reap;
520         u_long                  ipfw_tks_reapfailed;
521         u_long                  ipfw_tks_overflow;
522         u_long                  ipfw_tks_cntnomem;
523
524         /* Last field */
525         struct radix_node_head  *ipfw_tables[];
526 };
527
528 #define IPFW_FLAG_KEEPALIVE     0x01
529 #define IPFW_FLAG_STATEEXP      0x02
530 #define IPFW_FLAG_TRACKEXP      0x04
531 #define IPFW_FLAG_STATEREAP     0x08
532 #define IPFW_FLAG_TRACKREAP     0x10
533
534 #define ipfw_state_tmpkey       ipfw_tmpkey.state
535 #define ipfw_track_tmpkey       ipfw_tmpkey.track
536 #define ipfw_trkcnt_tmpkey      ipfw_tmpkey.trkcnt
537
538 struct ipfw_global {
539         int                     ipfw_state_loosecnt;    /* cache aligned */
540         time_t                  ipfw_state_globexp __cachealign;
541
542         struct lwkt_token       ipfw_trkcnt_token __cachealign;
543         struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
544         int                     ipfw_trkcnt_cnt;
545         time_t                  ipfw_track_globexp;
546
547 #ifdef KLD_MODULE
548         /*
549          * Module can not be unloaded, if there are references to
550          * certain rules of ipfw(4), e.g. dummynet(4)
551          */
552         int                     ipfw_refcnt __cachealign;
553 #endif
554 } __cachealign;
555
556 static struct ipfw_context      *ipfw_ctx[MAXCPU];
557
558 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
559
560 /*
561  * Following two global variables are accessed and updated only
562  * in netisr0.
563  */
564 static uint32_t static_count;   /* # of static rules */
565 static uint32_t static_ioc_len; /* bytes of static rules */
566
567 /*
568  * If 1, then ipfw static rules are being flushed,
569  * ipfw_chk() will skip to the default rule.
570  */
571 static int ipfw_flushing;
572
573 static int fw_verbose;
574 static int verbose_limit;
575
576 static int fw_debug;
577 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
578
579 static int      ipfw_table_max = IPFW_TABLE_MAX_DEF;
580
581 static int      ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
582 static int      ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
583
584 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
585
586 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
587 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
588     "Firewall statistics");
589
590 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
591     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
592 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
593     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
594     "Rule number autincrement step");
595 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
596     &fw_one_pass, 0,
597     "Only do a single pass through ipfw when using dummynet(4)");
598 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
599     &fw_debug, 0, "Enable printing of debug ip_fw statements");
600 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
601     &fw_verbose, 0, "Log matches to ipfw rules");
602 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
603     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
604 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
605     &ipfw_table_max, 0, "Max # of tables");
606
607 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
608 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
609 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
610 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
611 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
612 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
613
614 /*
615  * Timeouts for various events in handling states.
616  *
617  * NOTE:
618  * 1 == 0~1 second.
619  * 2 == 1~2 second(s).
620  *
621  * We use 2 seconds for FIN lifetime, so that the states will not be
622  * ripped prematurely.
623  */
624 static uint32_t dyn_ack_lifetime = 300;
625 static uint32_t dyn_syn_lifetime = 20;
626 static uint32_t dyn_finwait_lifetime = 20;
627 static uint32_t dyn_fin_lifetime = 2;
628 static uint32_t dyn_rst_lifetime = 2;
629 static uint32_t dyn_udp_lifetime = 10;
630 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
631
632 /*
633  * Keepalives are sent if dyn_keepalive is set. They are sent every
634  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
635  * seconds of lifetime of a rule.
636  */
637 static uint32_t dyn_keepalive_interval = 20;
638 static uint32_t dyn_keepalive_period = 5;
639 static uint32_t dyn_keepalive = 1;      /* do send keepalives */
640
641 static struct ipfw_global       ipfw_gd;
642 static int      ipfw_state_loosecnt_updthr;
643 static int      ipfw_state_max = 4096;  /* max # of states */
644 static int      ipfw_track_max = 4096;  /* max # of tracks */
645
646 static int      ipfw_state_headroom;    /* setup at module load time */
647 static int      ipfw_state_reap_min = 8;
648 static int      ipfw_state_expire_max = 32;
649 static int      ipfw_state_scan_max = 256;
650 static int      ipfw_keepalive_max = 8;
651 static int      ipfw_track_reap_max = 4;
652 static int      ipfw_track_expire_max = 16;
653 static int      ipfw_track_scan_max = 128;
654
655 /* Compat */
656 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
657     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
658     "Number of states and tracks");
659 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
660     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
661     "Max number of states and tracks");
662
663 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
664     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
665     "Number of states");
666 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
667     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
668     "Max number of states");
669 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
670     &ipfw_state_headroom, 0, "headroom for state reap");
671 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
672     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
673 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
674     &ipfw_track_max, 0, "Max number of tracks");
675 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
676     &static_count, 0, "Number of static rules");
677 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
678     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
679 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
680     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
681 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
682     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
683 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
684     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
685 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
686     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
687 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
688     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
689 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
690     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
692     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
693 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
694     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
695     "I", "# of states to scan for each expire iteration");
696 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
697     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
698     "I", "# of states to expire for each expire iteration");
699 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
700     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
701     "I", "# of states to expire for each expire iteration");
702 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
703     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
704     "I", "# of states to reap for state shortage");
705 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
706     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
707     "I", "# of tracks to scan for each expire iteration");
708 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
709     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
710     "I", "# of tracks to expire for each expire iteration");
711 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
712     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
713     "I", "# of tracks to reap for track shortage");
714
715 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
716     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
717     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
718     "LU", "# of state reaps due to states shortage");
719 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
720     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
721     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
722     "LU", "# of state reap failure");
723 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
724     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
725     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
726     "LU", "# of state overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of state deleted due to fast TCP port recycling");

/*
 * Per-cpu track statistics; each handler receives the offset of the
 * counter within struct ipfw_context (aggregation done by
 * ipfw_sysctl_stat).
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reap due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failure");

/* Comparators for the red-black trees generated below. */
static int		ipfw_state_cmp(struct ipfw_state *,
			    struct ipfw_state *);
static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
			    struct ipfw_trkcnt *);
static int		ipfw_track_cmp(struct ipfw_track *,
			    struct ipfw_track *);

RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

static ip_fw_chk_t	ipfw_chk;
static void		ipfw_track_expire_ipifunc(void *);
static void		ipfw_state_expire_ipifunc(void *);
static void		ipfw_keepalive(void *);
static int		ipfw_state_expire_start(struct ipfw_context *,
			    int, int);

/* The global track counter tree is serialized by an lwkt token. */
#define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT	\
	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
784
785 static void
786 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
787     const struct sockaddr *netmask)
788 {
789         const u_char *cp1 = (const u_char *)src;
790         u_char *cp2 = (u_char *)dst;
791         const u_char *cp3 = (const u_char *)netmask;
792         u_char *cplim = cp2 + *cp3;
793         u_char *cplim2 = cp2 + *cp1;
794
795         *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
796         cp3 += 2;
797         if (cplim > cplim2)
798                 cplim = cplim2;
799         while (cp2 < cplim)
800                 *cp2++ = *cp1++ & *cp3++;
801         if (cp2 < cplim2)
802                 bzero(cp2, cplim2 - cp2);
803 }
804
805 static __inline void
806 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
807     in_addr_t daddr, uint16_t dport, uint8_t proto)
808 {
809
810         key->proto = proto;
811         key->swap = 0;
812
813         if (saddr < daddr) {
814                 key->addr_u.addrs.addr1 = daddr;
815                 key->addr_u.addrs.addr2 = saddr;
816                 key->swap |= IPFW_KEY_SWAP_ADDRS;
817         } else {
818                 key->addr_u.addrs.addr1 = saddr;
819                 key->addr_u.addrs.addr2 = daddr;
820         }
821
822         if (sport < dport) {
823                 key->port_u.ports.port1 = dport;
824                 key->port_u.ports.port2 = sport;
825                 key->swap |= IPFW_KEY_SWAP_PORTS;
826         } else {
827                 key->port_u.ports.port1 = sport;
828                 key->port_u.ports.port2 = dport;
829         }
830
831         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
832                 key->swap |= IPFW_KEY_SWAP_PORTS;
833         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
834                 key->swap |= IPFW_KEY_SWAP_ADDRS;
835 }
836
837 static __inline void
838 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
839     in_addr_t *daddr, uint16_t *dport)
840 {
841
842         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
843                 *saddr = key->addr_u.addrs.addr2;
844                 *daddr = key->addr_u.addrs.addr1;
845         } else {
846                 *saddr = key->addr_u.addrs.addr1;
847                 *daddr = key->addr_u.addrs.addr2;
848         }
849
850         if (key->swap & IPFW_KEY_SWAP_PORTS) {
851                 *sport = key->port_u.ports.port2;
852                 *dport = key->port_u.ports.port1;
853         } else {
854                 *sport = key->port_u.ports.port1;
855                 *dport = key->port_u.ports.port2;
856         }
857 }
858
859 static int
860 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
861 {
862
863         if (s1->st_proto > s2->st_proto)
864                 return (1);
865         if (s1->st_proto < s2->st_proto)
866                 return (-1);
867
868         if (s1->st_addrs > s2->st_addrs)
869                 return (1);
870         if (s1->st_addrs < s2->st_addrs)
871                 return (-1);
872
873         if (s1->st_ports > s2->st_ports)
874                 return (1);
875         if (s1->st_ports < s2->st_ports)
876                 return (-1);
877
878         if (s1->st_swap == s2->st_swap ||
879             (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
880                 return (0);
881
882         if (s1->st_swap > s2->st_swap)
883                 return (1);
884         else
885                 return (-1);
886 }
887
888 static int
889 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
890 {
891
892         if (t1->tc_proto > t2->tc_proto)
893                 return (1);
894         if (t1->tc_proto < t2->tc_proto)
895                 return (-1);
896
897         if (t1->tc_addrs > t2->tc_addrs)
898                 return (1);
899         if (t1->tc_addrs < t2->tc_addrs)
900                 return (-1);
901
902         if (t1->tc_ports > t2->tc_ports)
903                 return (1);
904         if (t1->tc_ports < t2->tc_ports)
905                 return (-1);
906
907         if (t1->tc_ruleid > t2->tc_ruleid)
908                 return (1);
909         if (t1->tc_ruleid < t2->tc_ruleid)
910                 return (-1);
911
912         return (0);
913 }
914
915 static int
916 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
917 {
918
919         if (t1->t_proto > t2->t_proto)
920                 return (1);
921         if (t1->t_proto < t2->t_proto)
922                 return (-1);
923
924         if (t1->t_addrs > t2->t_addrs)
925                 return (1);
926         if (t1->t_addrs < t2->t_addrs)
927                 return (-1);
928
929         if (t1->t_ports > t2->t_ports)
930                 return (1);
931         if (t1->t_ports < t2->t_ports)
932                 return (-1);
933
934         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
935                 return (1);
936         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
937                 return (-1);
938
939         return (0);
940 }
941
/*
 * Set the global state limit and derive the per-cpu loose-count
 * update threshold from it.
 */
static void
ipfw_state_max_set(int state_max)
{

	ipfw_state_max = state_max;
	/* Allow 5% states over-allocation. */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}
950
/*
 * Collect the total number of installed states by summing the
 * per-cpu counters of all netisr cpus.
 */
static __inline int
ipfw_state_cntcoll(void)
{
	int cpu, state_cnt = 0;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
	return (state_cnt);
}
960
/*
 * Refresh the global loose state count from the authoritative per-cpu
 * counters and return the total.
 */
static __inline int
ipfw_state_cntsync(void)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	ipfw_gd.ipfw_state_loosecnt = state_cnt;
	return (state_cnt);
}
970
971 static __inline int
972 ipfw_free_rule(struct ip_fw *rule)
973 {
974         KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
975         KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
976         rule->refcnt--;
977         if (rule->refcnt == 0) {
978                 kfree(rule, M_IPFW);
979                 return 1;
980         }
981         return 0;
982 }
983
/*
 * Netmsg/callback wrapper around ipfw_free_rule(); also drops the
 * module reference taken by ipfw_ref_rule() when built as a KLD.
 */
static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}
992
/*
 * Take one reference on rule; must run on the rule's owning cpu.
 * When built as a KLD, also pins the module via the global refcnt.
 */
static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}
1002
1003 /*
1004  * This macro maps an ip pointer into a layer3 header pointer of type T
1005  */
1006 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1007
1008 static __inline int
1009 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1010 {
1011         int type = L3HDR(struct icmp,ip)->icmp_type;
1012
1013         return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
1014 }
1015
1016 #define TT      ((1 << ICMP_ECHO) | \
1017                  (1 << ICMP_ROUTERSOLICIT) | \
1018                  (1 << ICMP_TSTAMP) | \
1019                  (1 << ICMP_IREQ) | \
1020                  (1 << ICMP_MASKREQ))
1021
1022 static int
1023 is_icmp_query(struct ip *ip)
1024 {
1025         int type = L3HDR(struct icmp, ip)->icmp_type;
1026
1027         return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
1028 }
1029
1030 #undef TT
1031
1032 /*
1033  * The following checks use two arrays of 8 or 16 bits to store the
1034  * bits that we want set or clear, respectively. They are in the
1035  * low and high half of cmd->arg1 or cmd->d[0].
1036  *
1037  * We scan options and store the bits we find set. We succeed if
1038  *
1039  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1040  *
1041  * The code is sometimes optimized not to store additional variables.
1042  */
1043 static int
1044 flags_match(ipfw_insn *cmd, uint8_t bits)
1045 {
1046         u_char want_clear;
1047         bits = ~bits;
1048
1049         if (((cmd->arg1 & 0xff) & bits) != 0)
1050                 return 0; /* some bits we want set were clear */
1051
1052         want_clear = (cmd->arg1 >> 8) & 0xff;
1053         if ((want_clear & bits) != want_clear)
1054                 return 0; /* some bits we want clear were set */
1055         return 1;
1056 }
1057
/*
 * Walk the IP options of the packet, accumulate the IP_FW_IPOPT_*
 * bits for the options present, and match the result against the
 * rule via flags_match().
 */
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	u_char *cp = (u_char *)(ip + 1);
	/* Total length of the options area. */
	int x = (ip->ip_hl << 2) - sizeof(struct ip);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;

		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > x)
				return 0; /* invalid or truncated */
		}

		switch (opt) {
		case IPOPT_LSRR:
			bits |= IP_FW_IPOPT_LSRR;
			break;

		case IPOPT_SSRR:
			bits |= IP_FW_IPOPT_SSRR;
			break;

		case IPOPT_RR:
			bits |= IP_FW_IPOPT_RR;
			break;

		case IPOPT_TS:
			bits |= IP_FW_IPOPT_TS;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}
1102
/*
 * Walk the TCP options of the packet, accumulate the IP_FW_TCPOPT_*
 * bits for the options present, and match the result against the
 * rule via flags_match().
 *
 * NOTE(review): unlike ipopts_match(), a bogus option length larger
 * than the remaining area is not rejected here; the loop simply
 * terminates when x goes non-positive.
 */
static int
tcpopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
	u_char *cp = (u_char *)(tcp + 1);
	/* Total length of the options area. */
	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == TCPOPT_EOL)
			break;

		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}

		switch (opt) {
		case TCPOPT_MAXSEG:
			bits |= IP_FW_TCPOPT_MSS;
			break;

		case TCPOPT_WINDOW:
			bits |= IP_FW_TCPOPT_WINDOW;
			break;

		case TCPOPT_SACK_PERMITTED:
		case TCPOPT_SACK:
			bits |= IP_FW_TCPOPT_SACK;
			break;

		case TCPOPT_TIMESTAMP:
			bits |= IP_FW_TCPOPT_TS;
			break;

		case TCPOPT_CC:
		case TCPOPT_CCNEW:
		case TCPOPT_CCECHO:
			bits |= IP_FW_TCPOPT_CC;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}
1155
/*
 * Match an interface against a rule: by (possibly glob) name when the
 * rule carries one, otherwise by comparing the rule's IP address with
 * the interface's AF_INET addresses.  Returns 1 on match, 0 otherwise.
 */
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
	if (ifp == NULL)	/* no iface with this packet, match fails */
		return 0;

	/* Check by name or by IP address */
	if (cmd->name[0] != '\0') { /* match by name */
		/* Check name */
		if (cmd->p.glob) {
			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
				return(1);
		} else {
			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
				return(1);
		}
	} else {
		struct ifaddr_container *ifac;

		/* Walk this cpu's per-interface address list. */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;
			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
			    (ia->ifa_addr))->sin_addr.s_addr)
				return(1);	/* match */
		}
	}
	return(0);	/* no match, fail ... */
}
1189
/* Args for appending into 'buf' at offset 'len' with ksnprintf(). */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		/* Rate-limit via the per-context no-rule counter. */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		/* Per-rule log limit. */
		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		/* Translate the real action opcode into a string. */
		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1==ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
			{
				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
				int len;

				len = ksnprintf(SNPARGS(action2, 0),
				    "Forward to %s",
				    kinet_ntoa(sa->sa.sin_addr, abuf));
				if (sa->sa.sin_port) {
					ksnprintf(SNPARGS(action2, len), ":%d",
						  sa->sa.sin_port);
				}
			}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	/* Format the protocol/address part of the message. */
	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		if (eh != NULL) { /* layer 2 packets are as on the wire */
			ip_off = ntohs(ip->ip_off);
			ip_len = ntohs(ip->ip_len);
		} else {
			ip_off = ip->ip_off;
			ip_len = ip->ip_len;
		}
		offset = ip_off & IP_OFFMASK;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				/* Non-first fragment: ports unavailable. */
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS
1395
/* Wrap-safe time comparison: true if a <= b. */
#define TIME_LEQ(a, b)	((a) - (b) <= 0)

/*
 * Unlink state s from its track (if any), the per-cpu state list and
 * RB tree, free it, and update the per-cpu counters.
 */
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
	    ("invalid state type %u", s->st_type));
	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));

	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}

	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	kfree(s, M_IPFW);

	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;
}
1427
/*
 * Forcefully delete up to reap_max states, preferring closed TCP
 * states and already-expired states.  Called when states are scarce.
 * Returns the number of states deleted.
 */
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		/* Advance the anchor past s so the scan can resume later. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (IPFW_STATE_TCPCLOSED(s) ||
		    TIME_LEQ(s->st_expire, time_uptime)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			/* Every 256 deletions, check if there is room now. */
			if ((expired & 0xff) == 0 && 
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
1487
/*
 * Delete all states on this cpu; if rule is non-NULL, delete only the
 * states created by that rule.  Expiring anchors are skipped.
 */
static void
ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
{
	struct ipfw_state *s, *sn;

	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
		if (s->st_type == O_ANCHOR)
			continue;
		if (rule != NULL && s->st_rule != rule)
			continue;
		ipfw_state_del(ctx, s);
	}
}
1501
/*
 * Finish the current state expiring cycle and schedule the next one
 * in one second.
 */
static void
ipfw_state_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
	callout_reset(&ctx->ipfw_stateto_ch, hz,
	    ipfw_state_expire_ipifunc, NULL);
}
1512
/*
 * Continue the current state expiring cycle later on this cpu by
 * sending the "more" netmsg to ourselves.
 */
static void
ipfw_state_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("stateexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}
1524
/*
 * Scan up to scan_max states after the anchor, deleting states whose
 * expiry has passed (and closed TCP states while reaping), at most
 * expire_max of them.  When a limit is hit the cycle is continued via
 * ipfw_state_expire_more(); otherwise the anchor is removed and the
 * cycle finished.  Returns the number of states deleted.
 */
static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		/* Advance the anchor past s so the scan can resume later. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (TIME_LEQ(s->st_expire, time_uptime) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			/* While reaping, stop early once room was made. */
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}
1568
1569 static void
1570 ipfw_state_expire_more_dispatch(netmsg_t nm)
1571 {
1572         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1573         struct ipfw_state *anchor;
1574
1575         ASSERT_NETISR_NCPUS(mycpuid);
1576         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1577             ("statexp is not in progress"));
1578
1579         /* Reply ASAP */
1580         netisr_replymsg(&nm->base, 0);
1581
1582         anchor = &ctx->ipfw_stateexp_anch;
1583         if (ctx->ipfw_state_cnt == 0) {
1584                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1585                 ipfw_state_expire_done(ctx);
1586                 return;
1587         }
1588         ipfw_state_expire_loop(ctx, anchor,
1589             ipfw_state_scan_max, ipfw_state_expire_max);
1590 }
1591
/*
 * Start a state expiring cycle on this cpu.  Does nothing if there
 * are no states, or if a non-reap cycle already ran this second.
 * Inserts the scan anchor at the list head and runs the first loop
 * pass.  Returns the number of states deleted.
 */
static int
ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_state *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
	    ("stateexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;

	if (ctx->ipfw_state_cnt == 0) {
		ipfw_state_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
	    ctx->ipfw_state_lastexp == time_uptime) {
		ipfw_state_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_state_lastexp = time_uptime;

	anchor = &ctx->ipfw_stateexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
}
1620
1621 static void
1622 ipfw_state_expire_dispatch(netmsg_t nm)
1623 {
1624         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1625
1626         ASSERT_NETISR_NCPUS(mycpuid);
1627
1628         /* Reply ASAP */
1629         crit_enter();
1630         netisr_replymsg(&nm->base, 0);
1631         crit_exit();
1632
1633         if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1634                 /* Running; done. */
1635                 return;
1636         }
1637         ipfw_state_expire_start(ctx,
1638             ipfw_state_scan_max, ipfw_state_expire_max);
1639 }
1640
1641 static void
1642 ipfw_state_expire_ipifunc(void *dummy __unused)
1643 {
1644         struct netmsg_base *msg;
1645
1646         KKASSERT(mycpuid < netisr_ncpus);
1647         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
1648
1649         crit_enter();
1650         if (msg->lmsg.ms_flags & MSGF_DONE)
1651                 netisr_sendmsg_oncpu(msg);
1652         crit_exit();
1653 }
1654
/*
 * Track the TCP sequence space of a state.
 *
 * Records the highest sequence and ack numbers seen in each direction
 * and rejects (returns FALSE) segments whose seq/ack lies below what
 * has already been seen, so that stale retransmissions do not advance
 * the state machine.  RST segments are always accepted.  Also latches
 * the "FIN was ACKed" bit used by ipfw_state_update() to detect a
 * fully closed connection: a FIN counts as ACKed once the peer's ack
 * equals the FIN sender's recorded seq + 1.
 *
 * Returns TRUE if the segment is in-window and the caller may apply
 * its TCP flags to the state.
 */
static boolean_t
ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
{
	uint32_t seq = ntohl(tcp->th_seq);
	uint32_t ack = ntohl(tcp->th_ack);

	/* RST always passes through; it terminates the connection. */
	if (tcp->th_flags & TH_RST)
		return (TRUE);

	if (dir == MATCH_FORWARD) {
		/* First forward segment initializes the seq tracking. */
		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
			s->st_flags |= IPFW_STATE_F_SEQFWD;
			s->st_seq_fwd = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
			s->st_seq_fwd = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
				s->st_flags |= IPFW_STATE_F_ACKFWD;
				s->st_ack_fwd = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
				s->st_ack_fwd = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/*
			 * The reverse side sent a FIN (high byte of
			 * st_state) which is now fully ACKed.
			 */
			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
				s->st_state |= (TH_ACK << 8);
		}
	} else {
		/* Reverse direction; mirror image of the above. */
		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
			s->st_flags |= IPFW_STATE_F_SEQREV;
			s->st_seq_rev = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
			s->st_seq_rev = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
				s->st_flags |= IPFW_STATE_F_ACKREV;
				s->st_ack_rev= ack;
			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
				s->st_ack_rev = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/* The forward side's FIN is now fully ACKed. */
			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
			    s->st_ack_rev == s->st_seq_fwd + 1)
				s->st_state |= TH_ACK;
		}
	}
	return (TRUE);
}
1717
/*
 * Update a state's TCP flag bits and refresh its expiration time
 * according to the packet just matched.
 *
 * For TCP, forward-direction flags live in the low byte of st_state
 * and reverse-direction flags in the high byte; the lifetime granted
 * depends on where the handshake/teardown currently stands.  For UDP
 * and other protocols only the expiration time is refreshed.
 */
static void
ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
    const struct tcphdr *tcp, struct ipfw_state *s)
{

	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;

		/* Drop out-of-window segments on the floor. */
		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
			return;

		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
		switch (s->st_state & IPFW_STATE_TCPSTATES) {
		case TH_SYN:				/* opening */
			s->st_expire = time_uptime + dyn_syn_lifetime;
			break;

		case BOTH_SYN:			/* move to established */
		case BOTH_SYN | TH_FIN:		/* one side tries to close */
		case BOTH_SYN | (TH_FIN << 8):
			s->st_expire = time_uptime + dyn_ack_lifetime;
			break;

		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
				/* And both FINs were ACKed. */
				s->st_expire = time_uptime + dyn_fin_lifetime;
			} else {
				/* FIN(s) not yet ACKed; linger longer. */
				s->st_expire = time_uptime +
				    dyn_finwait_lifetime;
			}
			break;

		default:
#if 0
			/*
			 * reset or some invalid combination, but can also
			 * occur if we use keep-state the wrong way.
			 */
			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
				kprintf("invalid state: 0x%x\n", s->st_state);
#endif
			/* RST or invalid combination; expire quickly. */
			s->st_expire = time_uptime + dyn_rst_lifetime;
			break;
		}
	} else if (pkt->proto == IPPROTO_UDP) {
		s->st_expire = time_uptime + dyn_udp_lifetime;
	} else {
		/* other protocols */
		s->st_expire = time_uptime + dyn_short_lifetime;
	}
}
1770
/*
 * Lookup a state by the packet's flow id.
 *
 * Deletes the state (and returns NULL) if it has expired, or if a SYN
 * arrives on an already-closed TCP state, i.e. the TCP ports were
 * recycled faster than the old state timed out.  On a hit the state
 * is updated (flags/expiration) and its track, if any, is kept alive.
 *
 * *match_direction is set to MATCH_FORWARD, MATCH_REVERSE, or
 * MATCH_NONE when no state was found.
 */
static struct ipfw_state *
ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
    int *match_direction, const struct tcphdr *tcp)
{
	struct ipfw_state *key, *s;
	int dir = MATCH_NONE;

	/* Build the lookup key in the per-cpu scratch state. */
	key = &ctx->ipfw_state_tmpkey;
	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
	    pkt->dst_ip, pkt->dst_port, pkt->proto);
	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
	if (s == NULL)
		goto done; /* not found. */
	if (TIME_LEQ(s->st_expire, time_uptime)) {
		/* Expired. */
		ipfw_state_del(ctx, s);
		s = NULL;
		goto done;
	}
	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
		/* TCP ports recycling is too fast. */
		ctx->ipfw_sts_tcprecycled++;
		ipfw_state_del(ctx, s);
		s = NULL;
		goto done;
	}

	/*
	 * st_swap records which endpoints were swapped when the key
	 * was canonicalized; equal swap masks mean the packet flows
	 * in the same direction the state was created in.
	 */
	if (s->st_swap == key->st_swap) {
		dir = MATCH_FORWARD;
	} else {
		KASSERT((s->st_swap & key->st_swap) == 0,
		    ("found mismatch state"));
		dir = MATCH_REVERSE;
	}

	/* Update this state. */
	ipfw_state_update(pkt, dir, tcp, s);

	if (s->st_track != NULL) {
		/* This track has been used. */
		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
	}
done:
	if (match_direction)
		*match_direction = dir;
	return (s);
}
1821
1822 static __inline struct ip_fw *
1823 ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1824     int *match_direction, const struct tcphdr *tcp, uint16_t len)
1825 {
1826         struct ipfw_state *s;
1827
1828         s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
1829         if (s == NULL)
1830                 return (NULL);
1831
1832         KASSERT(s->st_rule->cpuid == mycpuid,
1833             ("rule %p (cpu%d) does not belong to the current cpu%d",
1834              s->st_rule, s->st_rule->cpuid, mycpuid));
1835
1836         s->st_pcnt++;
1837         s->st_bcnt += len;
1838
1839         return (s->st_rule);
1840 }
1841
/*
 * Allocate and install a new state for the given flow id and rule.
 *
 * The state is inserted into both the per-cpu RB tree (for lookup)
 * and the per-cpu list (for expiration scans), its expiration is
 * initialized via ipfw_state_update(), and, for O_LIMIT states, it is
 * linked onto the supplied track 't'.
 *
 * Returns NULL (and bumps the nomem counter) if allocation fails.
 */
static struct ipfw_state *
ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
    const struct tcphdr *tcp)
{
	struct ipfw_state *s, *dup;

	KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
	    ("invalid state type %u", type));

	s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
	if (s == NULL) {
		ctx->ipfw_sts_nomem++;
		return (NULL);
	}

	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
	    id->dst_ip, id->dst_port, id->proto);

	s->st_rule = rule;
	s->st_type = type;

	/*
	 * Batch updates of the global loose state count: fold the
	 * per-cpu count into the global one only once it reaches the
	 * update threshold, to limit cross-cpu cache line traffic.
	 */
	ctx->ipfw_state_cnt++;
	ctx->ipfw_state_loosecnt++;
	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
		ctx->ipfw_state_loosecnt = 0;
	}

	/* The caller must have ruled out a duplicate beforehand. */
	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	if (dup != NULL)
		panic("ipfw: state exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);

	/*
	 * Update this state:
	 * Set st_expire and st_state.
	 */
	ipfw_state_update(id, MATCH_FORWARD, tcp, s);

	if (t != NULL) {
		/* Keep the track referenced. */
		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
		s->st_track = t;
	}
	return (s);
}
1889
/*
 * Free a per-cpu track and drop its reference on the shared (global)
 * track counter.  The track must no longer be referenced by any
 * state.
 *
 * Returns TRUE if the global counter itself was released (i.e. this
 * was the last reference), FALSE if only the local track was freed.
 */
static boolean_t
ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
{
	struct ipfw_trkcnt *trk;
	boolean_t trk_freed = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));
	KASSERT(LIST_EMPTY(&t->t_state_list),
	    ("invalid track is still referenced"));

	trk = t->t_trkcnt;
	KASSERT(trk != NULL, ("track has no trkcnt"));

	/* Unhook the track from this cpu's tree and list first. */
	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
	kfree(t, M_IPFW);

	/*
	 * fdrop() style reference counting.
	 * See kern/kern_descrip.c fdrop().
	 *
	 * The 1->0 transition must be done under the trkcnt token,
	 * since it removes the trkcnt from the global tree; other
	 * decrements are lock-free cmpset loops.
	 */
	for (;;) {
		int refs = trk->tc_refs;

		cpu_ccfence();
		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
		if (refs == 1) {
			IPFW_TRKCNT_TOKGET;
			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
				KASSERT(trk->tc_count == 0,
				    ("%d states reference this trkcnt",
				     trk->tc_count));
				RB_REMOVE(ipfw_trkcnt_tree,
				    &ipfw_gd.ipfw_trkcnt_tree, trk);

				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
				    ("invalid trkcnt cnt %d",
				     ipfw_gd.ipfw_trkcnt_cnt));
				ipfw_gd.ipfw_trkcnt_cnt--;
				IPFW_TRKCNT_TOKREL;

				/* Cache one trkcnt per cpu as a spare. */
				if (ctx->ipfw_trkcnt_spare == NULL)
					ctx->ipfw_trkcnt_spare = trk;
				else
					kfree(trk, M_IPFW);
				trk_freed = TRUE;
				break; /* done! */
			}
			IPFW_TRKCNT_TOKREL;
			/* retry */
		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
			break; /* done! */
		}
		/* retry */
	}
	return (trk_freed);
}
1947
1948 static void
1949 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
1950 {
1951         struct ipfw_track *t, *tn;
1952
1953         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
1954                 if (t->t_count == NULL) /* anchor */
1955                         continue;
1956                 if (rule != NULL && t->t_rule != rule)
1957                         continue;
1958                 ipfw_track_free(ctx, t);
1959         }
1960 }
1961
1962 static boolean_t
1963 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
1964     boolean_t reap)
1965 {
1966         struct ipfw_state *s, *sn;
1967         boolean_t ret = FALSE;
1968
1969         KASSERT(t->t_count != NULL, ("track anchor"));
1970
1971         if (LIST_EMPTY(&t->t_state_list))
1972                 return (FALSE);
1973
1974         /*
1975          * Do not expire more than once per second, it is useless.
1976          */
1977         if (t->t_lastexp == time_uptime)
1978                 return (FALSE);
1979         t->t_lastexp = time_uptime;
1980
1981         LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
1982                 if (TIME_LEQ(s->st_expire, time_uptime) ||
1983                     (reap && IPFW_STATE_TCPCLOSED(s))) {
1984                         KASSERT(s->st_track == t,
1985                             ("state track %p does not match %p",
1986                              s->st_track, t));
1987                         ipfw_state_del(ctx, s);
1988                         ret = TRUE;
1989                 }
1990         }
1991         return (ret);
1992 }
1993
1994 static __inline struct ipfw_trkcnt *
1995 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
1996 {
1997         struct ipfw_trkcnt *trk;
1998
1999         if (ctx->ipfw_trkcnt_spare != NULL) {
2000                 trk = ctx->ipfw_trkcnt_spare;
2001                 ctx->ipfw_trkcnt_spare = NULL;
2002         } else {
2003                 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2004                     M_INTWAIT | M_NULLOK);
2005         }
2006         return (trk);
2007 }
2008
/*
 * Finish a track expiration pass: clear the in-progress flag and
 * re-arm the periodic (1 second) track expiration callout.
 */
static void
ipfw_track_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
	callout_reset(&ctx->ipfw_trackto_ch, hz,
	    ipfw_track_expire_ipifunc, NULL);
}
2019
/*
 * Schedule continuation of an in-progress track expiration pass by
 * sending the per-cpu "more" netmsg to the local netisr (handled by
 * ipfw_track_expire_more_dispatch()).
 */
static void
ipfw_track_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("trackexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}
2031
/*
 * Walk the per-cpu track list from 'anchor', expiring the states of
 * each track and freeing tracks that are no longer referenced and
 * have expired (or unconditionally when reaping).
 *
 * The anchor is moved past each visited track so an interrupted scan
 * can resume where it left off.  At most scan_max tracks are examined
 * and at most expire_max freed; when either budget runs out the rest
 * is continued asynchronously via ipfw_track_expire_more().
 *
 * Returns the number of tracks freed by this invocation.
 */
static int
ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_track *t;
	int scanned = 0, expired = 0;
	boolean_t reap = FALSE;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
		reap = TRUE;

	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan budget exhausted; continue later. */
			ipfw_track_expire_more(ctx);
			return (expired);
		}

		/* Advance the anchor past this track. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		ipfw_track_state_expire(ctx, t, reap);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
			/* Expired. */
			if (ipfw_track_free(ctx, t)) {
				if (++expired >= expire_max) {
					/* Expire budget exhausted. */
					ipfw_track_expire_more(ctx);
					return (expired);
				}
			}
		}
	}
	/* End of list reached; the pass is complete. */
	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
	ipfw_track_expire_done(ctx);
	return (expired);
}
2078
/*
 * Start a track expiration pass on this cpu: flag the pass as in
 * progress, insert the per-cpu anchor at the head of the track list
 * and expire tracks from there (see ipfw_track_expire_loop()).
 *
 * Returns the number of tracks freed by the first loop invocation.
 */
static int
ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_track *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
	    ("trackexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;

	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* Nothing to expire. */
		ipfw_track_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 * (Reaping passes are exempt: they run because this cpu is
	 * short of tracks.)
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
	    ctx->ipfw_track_lastexp == time_uptime) {
		ipfw_track_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_track_lastexp = time_uptime;

	/* Scanning starts from the head of the track list. */
	anchor = &ctx->ipfw_trackexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
}
2107
/*
 * [netisr] Continue an unfinished track expiration pass on this cpu.
 *
 * Scheduled by ipfw_track_expire_more() when a previous call to
 * ipfw_track_expire_loop() ran into its scan/expire budget; resumes
 * from the anchor left in the per-cpu track list.
 */
static void
ipfw_track_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_track *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_trackexp_anch;
	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* All tracks are gone; finish the expiration pass. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		ipfw_track_expire_done(ctx);
		return;
	}
	ipfw_track_expire_loop(ctx, anchor,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2130
2131 static void
2132 ipfw_track_expire_dispatch(netmsg_t nm)
2133 {
2134         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2135
2136         ASSERT_NETISR_NCPUS(mycpuid);
2137
2138         /* Reply ASAP */
2139         crit_enter();
2140         netisr_replymsg(&nm->base, 0);
2141         crit_exit();
2142
2143         if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2144                 /* Running; done. */
2145                 return;
2146         }
2147         ipfw_track_expire_start(ctx,
2148             ipfw_track_scan_max, ipfw_track_expire_max);
2149 }
2150
2151 static void
2152 ipfw_track_expire_ipifunc(void *dummy __unused)
2153 {
2154         struct netmsg_base *msg;
2155
2156         KKASSERT(mycpuid < netisr_ncpus);
2157         msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2158
2159         crit_enter();
2160         if (msg->lmsg.ms_flags & MSGF_DONE)
2161                 netisr_sendmsg_oncpu(msg);
2162         crit_exit();
2163 }
2164
/*
 * Aggressively reclaim tracks on this cpu because we are short of
 * them (the global track limit has been reached).
 *
 * If no expiration pass is running, one is started in reap mode with
 * an unlimited scan budget.  If a pass is already in flight, its
 * anchor is reused here to continue scanning synchronously.
 *
 * Returns the number of tracks freed.
 */
static int
ipfw_track_reap(struct ipfw_context *ctx)
{
	struct ipfw_track *t, *anchor;
	int expired;

	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
		/*
		 * Kick start track expiring.  Ignore scan limit,
		 * we are short of tracks.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
		expired = ipfw_track_expire_start(ctx, INT_MAX,
		    ipfw_track_reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
		return (expired);
	}

	/*
	 * Tracks are being expired.
	 */

	if (RB_EMPTY(&ctx->ipfw_track_tree))
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_trackexp_anch;
	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of tracks.
		 */

		/* Advance the anchor past this track. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		/* Reap mode: also drop closed TCP states. */
		ipfw_track_state_expire(ctx, t, TRUE);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (ipfw_track_free(ctx, t)) {
			if (++expired >= ipfw_track_reap_max) {
				ipfw_track_expire_more(ctx);
				break;
			}
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_track_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
2224
/*
 * Find or create the track used by an O_LIMIT rule for the given flow.
 *
 * The track key consists of the rule plus whichever of the flow's
 * addresses/ports are selected by limit_mask.  A per-cpu track is
 * looked up first; on a miss a new one is created and bound to the
 * globally shared, reference-counted ipfw_trkcnt that holds the
 * actual connection count for this (ruleid, key) pair across all
 * cpus.  If the global track limit has been reached, expiration is
 * triggered on the other cpus and reaping is attempted locally.
 *
 * Returns NULL on allocation failure or track overflow; otherwise
 * the track with its expiration refreshed.
 */
static struct ipfw_track *
ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t limit_mask, struct ip_fw *rule)
{
	struct ipfw_track *key, *t, *dup;
	struct ipfw_trkcnt *trk, *ret;
	boolean_t do_expire = FALSE;

	KASSERT(rule->track_ruleid != 0,
	    ("rule %u has no track ruleid", rule->rulenum));

	/*
	 * Build the lookup key in the per-cpu scratch track; only the
	 * fields selected by limit_mask participate.
	 */
	key = &ctx->ipfw_track_tmpkey;
	key->t_proto = id->proto;
	key->t_addrs = 0;
	key->t_ports = 0;
	key->t_rule = rule;
	if (limit_mask & DYN_SRC_ADDR)
		key->t_saddr = id->src_ip;
	if (limit_mask & DYN_DST_ADDR)
		key->t_daddr = id->dst_ip;
	if (limit_mask & DYN_SRC_PORT)
		key->t_sport = id->src_port;
	if (limit_mask & DYN_DST_PORT)
		key->t_dport = id->dst_port;

	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
	if (t != NULL)
		goto done;

	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
	if (t == NULL) {
		ctx->ipfw_tks_nomem++;
		return (NULL);
	}

	/* t_key presumably overlays proto/addrs/ports — TODO confirm */
	t->t_key = key->t_key;
	t->t_rule = rule;
	t->t_lastexp = 0;
	LIST_INIT(&t->t_state_list);

	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
		time_t globexp, uptime;

		/* Global track limit reached; do not allocate a trkcnt. */
		trk = NULL;
		do_expire = TRUE;

		/*
		 * Do not expire globally more than once per second,
		 * it is useless.
		 */
		uptime = time_uptime;
		globexp = ipfw_gd.ipfw_track_globexp;
		if (globexp != uptime &&
		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
		    globexp, uptime)) {
			int cpu;

			/* Expire tracks on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_track_expire_ipifunc, NULL);
			}
		}
	} else {
		trk = ipfw_trkcnt_alloc(ctx);
	}
	if (trk == NULL) {
		struct ipfw_trkcnt *tkey;

		/*
		 * Could not allocate a fresh trkcnt; see whether
		 * another cpu has already installed one for this key.
		 */
		tkey = &ctx->ipfw_trkcnt_tmpkey;
		key = NULL; /* tkey overlaps key */

		tkey->tc_key = t->t_key;
		tkey->tc_ruleid = rule->track_ruleid;

		IPFW_TRKCNT_TOKGET;
		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    tkey);
		if (trk == NULL) {
			IPFW_TRKCNT_TOKREL;
			if (do_expire) {
				/* Try to reap tracks to make room. */
				ctx->ipfw_tks_reap++;
				if (ipfw_track_reap(ctx) > 0) {
					if (ipfw_gd.ipfw_trkcnt_cnt <
					    ipfw_track_max) {
						trk = ipfw_trkcnt_alloc(ctx);
						if (trk != NULL)
							goto install;
						ctx->ipfw_tks_cntnomem++;
					} else {
						ctx->ipfw_tks_overflow++;
					}
				} else {
					ctx->ipfw_tks_reapfailed++;
					ctx->ipfw_tks_overflow++;
				}
			} else {
				ctx->ipfw_tks_cntnomem++;
			}
			kfree(t, M_IPFW);
			return (NULL);
		}
		/* Share the existing trkcnt. */
		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
		    ("invalid trkcnt refs %d", trk->tc_refs));
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	} else {
install:
		/* Initialize and install the freshly allocated trkcnt. */
		trk->tc_key = t->t_key;
		trk->tc_ruleid = rule->track_ruleid;
		trk->tc_refs = 0;
		trk->tc_count = 0;
		trk->tc_expire = 0;
		trk->tc_rulenum = rule->rulenum;

		IPFW_TRKCNT_TOKGET;
		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    trk);
		if (ret != NULL) {
			/*
			 * Another cpu raced us and installed a trkcnt
			 * for this key; keep ours as the per-cpu spare
			 * and use the winner.
			 */
			KASSERT(ret->tc_refs > 0 &&
			    ret->tc_refs < netisr_ncpus,
			    ("invalid trkcnt refs %d", ret->tc_refs));
			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
			    ("trkcnt spare was installed"));
			ctx->ipfw_trkcnt_spare = trk;
			trk = ret;
		} else {
			ipfw_gd.ipfw_trkcnt_cnt++;
		}
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	}
	t->t_count = &trk->tc_count;
	t->t_trkcnt = trk;

	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	if (dup != NULL)
		panic("ipfw: track exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
done:
	t->t_expire = time_uptime + dyn_short_lifetime;
	return (t);
}
2370
/*
 * Install state for rule type cmd->o.opcode
 *
 * O_KEEP_STATE installs a plain bidirectional state; O_LIMIT first
 * finds/creates the rule's track and enforces its connection limit
 * before installing the state.  If the global state limit has been
 * reached, states are reaped locally and expiration is triggered on
 * the other cpus.
 *
 * Returns 1 (failure) if state is not installed because of errors or because
 * states limitations are enforced.
 */
static int
ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
    ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
{
	struct ipfw_state *s;
	struct ipfw_track *t;
	int count, diff;

	/*
	 * The loose (batched) count is checked first as a cheap
	 * filter; ipfw_state_cntsync() presumably gives the precise
	 * global count — TODO confirm.
	 */
	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
		boolean_t overflow = TRUE;

		ctx->ipfw_sts_reap++;
		if (ipfw_state_reap(ctx, diff) == 0)
			ctx->ipfw_sts_reapfailed++;
		if (ipfw_state_cntsync() < ipfw_state_max)
			overflow = FALSE;

		if (overflow) {
			time_t globexp, uptime;
			int cpu;

			/*
			 * Do not expire globally more than once per second,
			 * it is useless.
			 */
			uptime = time_uptime;
			globexp = ipfw_gd.ipfw_state_globexp;
			if (globexp == uptime ||
			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
			    globexp, uptime)) {
				ctx->ipfw_sts_overflow++;
				return (1);
			}

			/* Expire states on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_state_expire_ipifunc, NULL);
			}
			ctx->ipfw_sts_overflow++;
			return (1);
		}
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE: /* bidir rule */
		s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
		    tcp);
		if (s == NULL)
			return (1);
		break;

	case O_LIMIT: /* limit number of sessions */
		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
		if (t == NULL)
			return (1);

		/* At the limit: try to expire this track's dead states. */
		if (*t->t_count >= cmd->conn_limit) {
			if (!ipfw_track_state_expire(ctx, t, TRUE))
				return (1);
		}
		/*
		 * Atomically claim a slot in the shared connection
		 * count; retry on races with other cpus.
		 */
		for (;;) {
			count = *t->t_count;
			if (count >= cmd->conn_limit)
				return (1);
			if (atomic_cmpset_int(t->t_count, count, count + 1))
				break;
		}

		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
		if (s == NULL) {
			/* Undo damage. */
			atomic_subtract_int(t->t_count, 1);
			return (1);
		}
		break;

	default:
		panic("unknown state type %u\n", cmd->o.opcode);
	}
	return (0);
}
2462
2463 static int
2464 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2465     const struct in_addr *in)
2466 {
2467         struct radix_node_head *rnh;
2468         struct sockaddr_in sin;
2469         struct ipfw_tblent *te;
2470
2471         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2472         rnh = ctx->ipfw_tables[tableid];
2473         if (rnh == NULL)
2474                 return (0); /* no match */
2475
2476         memset(&sin, 0, sizeof(sin));
2477         sin.sin_family = AF_INET;
2478         sin.sin_len = sizeof(sin);
2479         sin.sin_addr = *in;
2480
2481         te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2482         if (te == NULL)
2483                 return (0); /* no match */
2484
2485         te->te_use++;
2486         te->te_lastuse = time_second;
2487         return (1); /* match */
2488 }
2489
2490 /*
2491  * Transmit a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet because a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN determines
 * the direction of the keepalive (forward if set, reverse if clear).
2495  *
2496  * Only {src,dst}_{ip,port} of "id" are used.
2497  */
static void
send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
{
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *tcp;
	struct route sro;	/* fake route */

	/* Best effort: if no mbuf is available, silently drop the request. */
	MGETHDR(m, M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
	/* Leave room in front so the output path can prepend a link header. */
	m->m_data += max_linkhdr;

	ip = mtod(m, struct ip *);
	bzero(ip, m->m_len);
	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
	ip->ip_p = IPPROTO_TCP;
	tcp->th_off = 5;	/* 20-byte TCP header, no TCP options */

	/*
	 * Assume we are sending a RST (or a keepalive in the reverse
	 * direction), swap src and destination addresses and ports.
	 */
	ip->ip_src.s_addr = htonl(id->dst_ip);
	ip->ip_dst.s_addr = htonl(id->src_ip);
	tcp->th_sport = htons(id->dst_port);
	tcp->th_dport = htons(id->src_port);
	if (flags & TH_RST) {	/* we are sending a RST */
		if (flags & TH_ACK) {
			/* RST in response to an ACK segment. */
			tcp->th_seq = htonl(ack);
			tcp->th_ack = htonl(0);
			tcp->th_flags = TH_RST;
		} else {
			/* RST+ACK; a SYN consumes one sequence number. */
			if (flags & TH_SYN)
				seq++;
			tcp->th_seq = htonl(0);
			tcp->th_ack = htonl(seq);
			tcp->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * We are sending a keepalive. flags & TH_SYN determines
		 * the direction, forward if set, reverse if clear.
		 * NOTE: seq and ack are always assumed to be correct
		 * as set by the caller. This may be confusing...
		 */
		if (flags & TH_SYN) {
			/*
			 * we have to rewrite the correct addresses!
			 */
			ip->ip_dst.s_addr = htonl(id->dst_ip);
			ip->ip_src.s_addr = htonl(id->src_ip);
			tcp->th_dport = htons(id->dst_port);
			tcp->th_sport = htons(id->src_port);
		}
		tcp->th_seq = htonl(seq);
		tcp->th_ack = htonl(ack);
		tcp->th_flags = TH_ACK;
	}

	/*
	 * set ip_len to the payload size so we can compute
	 * the tcp checksum on the pseudoheader.
	 * NOTE: ip_len is deliberately overwritten with the full packet
	 * length further below, after the checksum has been computed.
	 * XXX check this, could save a couple of words ?
	 */
	ip->ip_len = htons(sizeof(struct tcphdr));
	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);

	/*
	 * now fill fields left out earlier
	 */
	ip->ip_ttl = ip_defttl;
	ip->ip_len = m->m_pkthdr.len;

	/* Resolve a route for the destination; freed after ip_output(). */
	bzero(&sro, sizeof(sro));
	ip_rtaddr(ip->ip_dst, &sro);

	/* Mark the mbuf so ipfw_chk() passes our own packet through. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
	ip_output(m, NULL, &sro, 0, NULL, NULL);
	if (sro.ro_rt)
		RTFREE(sro.ro_rt);
}
2582
2583 /*
2584  * Send a reject message, consuming the mbuf passed as an argument.
2585  */
2586 static void
2587 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2588 {
2589         if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2590                 /* We need the IP header in host order for icmp_error(). */
2591                 if (args->eh != NULL) {
2592                         struct ip *ip = mtod(args->m, struct ip *);
2593
2594                         ip->ip_len = ntohs(ip->ip_len);
2595                         ip->ip_off = ntohs(ip->ip_off);
2596                 }
2597                 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2598         } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2599                 struct tcphdr *const tcp =
2600                     L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2601
2602                 if ((tcp->th_flags & TH_RST) == 0) {
2603                         send_pkt(&args->f_id, ntohl(tcp->th_seq),
2604                                  ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2605                 }
2606                 m_freem(args->m);
2607         } else {
2608                 m_freem(args->m);
2609         }
2610         args->m = NULL;
2611 }
2612
2613 /*
2614  * Given an ip_fw *, lookup_next_rule will return a pointer
2615  * to the next rule, which can be either the jump
2616  * target (for skipto instructions) or the next one in the list (in
2617  * all other cases including a missing jump target).
2618  * The result is also written in the "next_rule" field of the rule.
2619  * Backward jumps are not allowed, so start looking from the next
2620  * rule...
2621  *
2622  * This never returns NULL -- in case we do not have an exact match,
2623  * the next rule is returned. When the ruleset is changed,
2624  * pointers are flushed so we are always correct.
2625  */
2626 static struct ip_fw *
2627 lookup_next_rule(struct ip_fw *me)
2628 {
2629         struct ip_fw *rule = NULL;
2630         ipfw_insn *cmd;
2631
2632         /* look for action, in case it is a skipto */
2633         cmd = ACTION_PTR(me);
2634         if (cmd->opcode == O_LOG)
2635                 cmd += F_LEN(cmd);
2636         if (cmd->opcode == O_SKIPTO) {
2637                 for (rule = me->next; rule; rule = rule->next) {
2638                         if (rule->rulenum >= cmd->arg1)
2639                                 break;
2640                 }
2641         }
2642         if (rule == NULL)                       /* failure or not a skipto */
2643                 rule = me->next;
2644         me->next_rule = rule;
2645         return rule;
2646 }
2647
2648 static int
2649 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
2650                 enum ipfw_opcodes opcode, uid_t uid)
2651 {
2652         struct in_addr src_ip, dst_ip;
2653         struct inpcbinfo *pi;
2654         boolean_t wildcard;
2655         struct inpcb *pcb;
2656
2657         if (fid->proto == IPPROTO_TCP) {
2658                 wildcard = FALSE;
2659                 pi = &tcbinfo[mycpuid];
2660         } else if (fid->proto == IPPROTO_UDP) {
2661                 wildcard = TRUE;
2662                 pi = &udbinfo[mycpuid];
2663         } else {
2664                 return 0;
2665         }
2666
2667         /*
2668          * Values in 'fid' are in host byte order
2669          */
2670         dst_ip.s_addr = htonl(fid->dst_ip);
2671         src_ip.s_addr = htonl(fid->src_ip);
2672         if (oif) {
2673                 pcb = in_pcblookup_hash(pi,
2674                         dst_ip, htons(fid->dst_port),
2675                         src_ip, htons(fid->src_port),
2676                         wildcard, oif);
2677         } else {
2678                 pcb = in_pcblookup_hash(pi,
2679                         src_ip, htons(fid->src_port),
2680                         dst_ip, htons(fid->dst_port),
2681                         wildcard, NULL);
2682         }
2683         if (pcb == NULL || pcb->inp_socket == NULL)
2684                 return 0;
2685
2686         if (opcode == O_UID) {
2687 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
2688                 return !socheckuid(pcb->inp_socket, uid);
2689 #undef socheckuid
2690         } else  {
2691                 return groupmember(uid, pcb->inp_socket->so_cred);
2692         }
2693 }
2694
2695 /*
2696  * The main check routine for the firewall.
2697  *
2698  * All arguments are in args so we can modify them and return them
2699  * back to the caller.
2700  *
2701  * Parameters:
2702  *
2703  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
2704  *              Starts with the IP header.
2705  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
2706  *      args->oif       Outgoing interface, or NULL if packet is incoming.
2707  *              The incoming interface is in the mbuf. (in)
2708  *
2709  *      args->rule      Pointer to the last matching rule (in/out)
2710  *      args->f_id      Addresses grabbed from the packet (out)
2711  *
2712  * Return value:
2713  *
2714  *      If the packet was denied/rejected and has been dropped, *m is equal
2715  *      to NULL upon return.
2716  *
2717  *      IP_FW_DENY      the packet must be dropped.
2718  *      IP_FW_PASS      The packet is to be accepted and routed normally.
2719  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
2720  *      IP_FW_TEE       Tee the packet to port (args->cookie)
2721  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
2722  */
2723 static int
2724 ipfw_chk(struct ip_fw_args *args)
2725 {
2726         /*
2727          * Local variables hold state during the processing of a packet.
2728          *
2729          * IMPORTANT NOTE: to speed up the processing of rules, there
2730          * are some assumption on the values of the variables, which
2731          * are documented here. Should you change them, please check
2732          * the implementation of the various instructions to make sure
2733          * that they still work.
2734          *
2735          * args->eh     The MAC header. It is non-null for a layer2
2736          *      packet, it is NULL for a layer-3 packet.
2737          *
2738          * m | args->m  Pointer to the mbuf, as received from the caller.
2739          *      It may change if ipfw_chk() does an m_pullup, or if it
2740          *      consumes the packet because it calls send_reject().
2741          *      XXX This has to change, so that ipfw_chk() never modifies
2742          *      or consumes the buffer.
2743          * ip   is simply an alias of the value of m, and it is kept
2744          *      in sync with it (the packet is  supposed to start with
2745          *      the ip header).
2746          */
2747         struct mbuf *m = args->m;
2748         struct ip *ip = mtod(m, struct ip *);
2749
2750         /*
2751          * oif | args->oif      If NULL, ipfw_chk has been called on the
2752          *      inbound path (ether_input, ip_input).
2753          *      If non-NULL, ipfw_chk has been called on the outbound path
2754          *      (ether_output, ip_output).
2755          */
2756         struct ifnet *oif = args->oif;
2757
2758         struct ip_fw *f = NULL;         /* matching rule */
2759         int retval = IP_FW_PASS;
2760         struct m_tag *mtag;
2761         struct divert_info *divinfo;
2762
2763         /*
2764          * hlen The length of the IPv4 header.
2765          *      hlen >0 means we have an IPv4 packet.
2766          */
2767         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
2768
2769         /*
2770          * offset       The offset of a fragment. offset != 0 means that
2771          *      we have a fragment at this offset of an IPv4 packet.
2772          *      offset == 0 means that (if this is an IPv4 packet)
2773          *      this is the first or only fragment.
2774          */
2775         u_short offset = 0;
2776
2777         /*
2778          * Local copies of addresses. They are only valid if we have
2779          * an IP packet.
2780          *
2781          * proto        The protocol. Set to 0 for non-ip packets,
2782          *      or to the protocol read from the packet otherwise.
2783          *      proto != 0 means that we have an IPv4 packet.
2784          *
2785          * src_port, dst_port   port numbers, in HOST format. Only
2786          *      valid for TCP and UDP packets.
2787          *
2788          * src_ip, dst_ip       ip addresses, in NETWORK format.
2789          *      Only valid for IPv4 packets.
2790          */
2791         uint8_t proto;
2792         uint16_t src_port = 0, dst_port = 0;    /* NOTE: host format    */
2793         struct in_addr src_ip, dst_ip;          /* NOTE: network format */
2794         uint16_t ip_len = 0;
2795
2796         /*
2797          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
2798          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
2799          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
2800          */
2801         int dyn_dir = MATCH_UNKNOWN;
2802         struct ip_fw *dyn_f = NULL;
2803         int cpuid = mycpuid;
2804         struct ipfw_context *ctx;
2805
2806         ASSERT_NETISR_NCPUS(cpuid);
2807         ctx = ipfw_ctx[cpuid];
2808
2809         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
2810                 return IP_FW_PASS;      /* accept */
2811
2812         if (args->eh == NULL ||         /* layer 3 packet */
2813             (m->m_pkthdr.len >= sizeof(struct ip) &&
2814              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
2815                 hlen = ip->ip_hl << 2;
2816
2817         /*
2818          * Collect parameters into local variables for faster matching.
2819          */
2820         if (hlen == 0) {        /* do not grab addresses for non-ip pkts */
2821                 proto = args->f_id.proto = 0;   /* mark f_id invalid */
2822                 goto after_ip_checks;
2823         }
2824
2825         proto = args->f_id.proto = ip->ip_p;
2826         src_ip = ip->ip_src;
2827         dst_ip = ip->ip_dst;
2828         if (args->eh != NULL) { /* layer 2 packets are as on the wire */
2829                 offset = ntohs(ip->ip_off) & IP_OFFMASK;
2830                 ip_len = ntohs(ip->ip_len);
2831         } else {
2832                 offset = ip->ip_off & IP_OFFMASK;
2833                 ip_len = ip->ip_len;
2834         }
2835
2836 #define PULLUP_TO(len)                          \
2837 do {                                            \
2838         if (m->m_len < (len)) {                 \
2839                 args->m = m = m_pullup(m, (len));\
2840                 if (m == NULL)                  \
2841                         goto pullup_failed;     \
2842                 ip = mtod(m, struct ip *);      \
2843         }                                       \
2844 } while (0)
2845
2846         if (offset == 0) {
2847                 switch (proto) {
2848                 case IPPROTO_TCP:
2849                         {
2850                                 struct tcphdr *tcp;
2851
2852                                 PULLUP_TO(hlen + sizeof(struct tcphdr));
2853                                 tcp = L3HDR(struct tcphdr, ip);
2854                                 dst_port = tcp->th_dport;
2855                                 src_port = tcp->th_sport;
2856                                 args->f_id.flags = tcp->th_flags;
2857                         }
2858                         break;
2859
2860                 case IPPROTO_UDP:
2861                         {
2862                                 struct udphdr *udp;
2863
2864                                 PULLUP_TO(hlen + sizeof(struct udphdr));
2865                                 udp = L3HDR(struct udphdr, ip);
2866                                 dst_port = udp->uh_dport;
2867                                 src_port = udp->uh_sport;
2868                         }
2869                         break;
2870
2871                 case IPPROTO_ICMP:
2872                         PULLUP_TO(hlen + 4);    /* type, code and checksum. */
2873                         args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
2874                         break;
2875
2876                 default:
2877                         break;
2878                 }
2879         }
2880
2881 #undef PULLUP_TO
2882
2883         args->f_id.src_ip = ntohl(src_ip.s_addr);
2884         args->f_id.dst_ip = ntohl(dst_ip.s_addr);
2885         args->f_id.src_port = src_port = ntohs(src_port);
2886         args->f_id.dst_port = dst_port = ntohs(dst_port);
2887
2888 after_ip_checks:
2889         if (args->rule) {
2890                 /*
2891                  * Packet has already been tagged. Look for the next rule
2892                  * to restart processing.
2893                  *
2894                  * If fw_one_pass != 0 then just accept it.
2895                  * XXX should not happen here, but optimized out in
2896                  * the caller.
2897                  */
2898                 if (fw_one_pass)
2899                         return IP_FW_PASS;
2900
2901                 /* This rule is being/has been flushed */
2902                 if (ipfw_flushing)
2903                         return IP_FW_DENY;
2904
2905                 KASSERT(args->rule->cpuid == cpuid,
2906                         ("rule used on cpu%d", cpuid));
2907
2908                 /* This rule was deleted */
2909                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
2910                         return IP_FW_DENY;
2911
2912                 f = args->rule->next_rule;
2913                 if (f == NULL)
2914                         f = lookup_next_rule(args->rule);
2915         } else {
2916                 /*
2917                  * Find the starting rule. It can be either the first
2918                  * one, or the one after divert_rule if asked so.
2919                  */
2920                 int skipto;
2921
2922                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
2923                 if (mtag != NULL) {
2924                         divinfo = m_tag_data(mtag);
2925                         skipto = divinfo->skipto;
2926                 } else {
2927                         skipto = 0;
2928                 }
2929
2930                 f = ctx->ipfw_layer3_chain;
2931                 if (args->eh == NULL && skipto != 0) {
2932                         /* No skipto during rule flushing */
2933                         if (ipfw_flushing)
2934                                 return IP_FW_DENY;
2935
2936                         if (skipto >= IPFW_DEFAULT_RULE)
2937                                 return IP_FW_DENY; /* invalid */
2938
2939                         while (f && f->rulenum <= skipto)
2940                                 f = f->next;
2941                         if (f == NULL)  /* drop packet */
2942                                 return IP_FW_DENY;
2943                 } else if (ipfw_flushing) {
2944                         /* Rules are being flushed; skip to default rule */
2945                         f = ctx->ipfw_default_rule;
2946                 }
2947         }
2948         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
2949                 m_tag_delete(m, mtag);
2950
2951         /*
2952          * Now scan the rules, and parse microinstructions for each rule.
2953          */
2954         for (; f; f = f->next) {
2955                 int l, cmdlen;
2956                 ipfw_insn *cmd;
2957                 int skip_or; /* skip rest of OR block */
2958
2959 again:
2960                 if (ctx->ipfw_set_disable & (1 << f->set))
2961                         continue;
2962
2963                 skip_or = 0;
2964                 for (l = f->cmd_len, cmd = f->cmd; l > 0;
2965                      l -= cmdlen, cmd += cmdlen) {
2966                         int match;
2967
2968                         /*
2969                          * check_body is a jump target used when we find a
2970                          * CHECK_STATE, and need to jump to the body of
2971                          * the target rule.
2972                          */
2973
2974 check_body:
2975                         cmdlen = F_LEN(cmd);
2976                         /*
2977                          * An OR block (insn_1 || .. || insn_n) has the
2978                          * F_OR bit set in all but the last instruction.
2979                          * The first match will set "skip_or", and cause
2980                          * the following instructions to be skipped until
2981                          * past the one with the F_OR bit clear.
2982                          */
2983                         if (skip_or) {          /* skip this instruction */
2984                                 if ((cmd->len & F_OR) == 0)
2985                                         skip_or = 0;    /* next one is good */
2986                                 continue;
2987                         }
2988                         match = 0; /* set to 1 if we succeed */
2989
2990                         switch (cmd->opcode) {
2991                         /*
2992                          * The first set of opcodes compares the packet's
2993                          * fields with some pattern, setting 'match' if a
2994                          * match is found. At the end of the loop there is
2995                          * logic to deal with F_NOT and F_OR flags associated
2996                          * with the opcode.
2997                          */
2998                         case O_NOP:
2999                                 match = 1;
3000                                 break;
3001
3002                         case O_FORWARD_MAC:
3003                                 kprintf("ipfw: opcode %d unimplemented\n",
3004                                         cmd->opcode);
3005                                 break;
3006
3007                         case O_GID:
3008                         case O_UID:
3009                                 /*
3010                                  * We only check offset == 0 && proto != 0,
3011                                  * as this ensures that we have an IPv4
3012                                  * packet with the ports info.
3013                                  */
3014                                 if (offset!=0)
3015                                         break;
3016
3017                                 match = ipfw_match_uid(&args->f_id, oif,
3018                                         cmd->opcode,
3019                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3020                                 break;
3021
3022                         case O_RECV:
3023                                 match = iface_match(m->m_pkthdr.rcvif,
3024                                     (ipfw_insn_if *)cmd);
3025                                 break;
3026
3027                         case O_XMIT:
3028                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3029                                 break;
3030
3031                         case O_VIA:
3032                                 match = iface_match(oif ? oif :
3033                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3034                                 break;
3035
3036                         case O_MACADDR2:
3037                                 if (args->eh != NULL) { /* have MAC header */
3038                                         uint32_t *want = (uint32_t *)
3039                                                 ((ipfw_insn_mac *)cmd)->addr;
3040                                         uint32_t *mask = (uint32_t *)
3041                                                 ((ipfw_insn_mac *)cmd)->mask;
3042                                         uint32_t *hdr = (uint32_t *)args->eh;
3043
3044                                         match =
3045                                         (want[0] == (hdr[0] & mask[0]) &&
3046                                          want[1] == (hdr[1] & mask[1]) &&
3047                                          want[2] == (hdr[2] & mask[2]));
3048                                 }
3049                                 break;
3050
3051                         case O_MAC_TYPE:
3052                                 if (args->eh != NULL) {
3053                                         uint16_t t =
3054                                             ntohs(args->eh->ether_type);
3055                                         uint16_t *p =
3056                                             ((ipfw_insn_u16 *)cmd)->ports;
3057                                         int i;
3058
3059                                         /* Special vlan handling */
3060                                         if (m->m_flags & M_VLANTAG)
3061                                                 t = ETHERTYPE_VLAN;
3062
3063                                         for (i = cmdlen - 1; !match && i > 0;
3064                                              i--, p += 2) {
3065                                                 match =
3066                                                 (t >= p[0] && t <= p[1]);
3067                                         }
3068                                 }
3069                                 break;
3070
3071                         case O_FRAG:
3072                                 match = (hlen > 0 && offset != 0);
3073                                 break;
3074
3075                         case O_IN:      /* "out" is "not in" */
3076                                 match = (oif == NULL);
3077                                 break;
3078
3079                         case O_LAYER2:
3080                                 match = (args->eh != NULL);
3081                                 break;
3082
3083                         case O_PROTO:
3084                                 /*
3085                                  * We do not allow an arg of 0 so the
3086                                  * check of "proto" only suffices.
3087                                  */
3088                                 match = (proto == cmd->arg1);
3089                                 break;
3090
3091                         case O_IP_SRC:
3092                                 match = (hlen > 0 &&
3093                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3094                                     src_ip.s_addr);
3095                                 break;
3096
3097                         case O_IP_SRC_MASK:
3098                                 match = (hlen > 0 &&
3099                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3100                                      (src_ip.s_addr &
3101                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3102                                 break;
3103
3104                         case O_IP_SRC_ME:
3105                                 if (hlen > 0) {
3106                                         struct ifnet *tif;
3107
3108                                         tif = INADDR_TO_IFP(&src_ip);
3109                                         match = (tif != NULL);
3110                                 }
3111                                 break;
3112
3113                         case O_IP_SRC_TABLE:
3114                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3115                                     &src_ip);
3116                                 break;
3117
3118                         case O_IP_DST_SET:
3119                         case O_IP_SRC_SET:
3120                                 if (hlen > 0) {
3121                                         uint32_t *d = (uint32_t *)(cmd + 1);
3122                                         uint32_t addr =
3123                                             cmd->opcode == O_IP_DST_SET ?
3124                                                 args->f_id.dst_ip :
3125                                                 args->f_id.src_ip;
3126
3127                                         if (addr < d[0])
3128                                                 break;
3129                                         addr -= d[0]; /* subtract base */
3130                                         match =
3131                                         (addr < cmd->arg1) &&
3132                                          (d[1 + (addr >> 5)] &
3133                                           (1 << (addr & 0x1f)));
3134                                 }
3135                                 break;
3136
3137                         case O_IP_DST:
3138                                 match = (hlen > 0 &&
3139                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3140                                     dst_ip.s_addr);
3141                                 break;
3142
3143                         case O_IP_DST_MASK:
3144                                 match = (hlen > 0) &&
3145                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3146                                      (dst_ip.s_addr &
3147                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3148                                 break;
3149
3150                         case O_IP_DST_ME:
3151                                 if (hlen > 0) {
3152                                         struct ifnet *tif;
3153
3154                                         tif = INADDR_TO_IFP(&dst_ip);
3155                                         match = (tif != NULL);
3156                                 }
3157                                 break;
3158
3159                         case O_IP_DST_TABLE:
3160                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3161                                     &dst_ip);
3162                                 break;
3163
3164                         case O_IP_SRCPORT:
3165                         case O_IP_DSTPORT:
3166                                 /*
3167                                  * offset == 0 && proto != 0 is enough
3168                                  * to guarantee that we have an IPv4
3169                                  * packet with port info.
3170                                  */
3171                                 if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
3172                                     && offset == 0) {
3173                                         uint16_t x =
3174                                             (cmd->opcode == O_IP_SRCPORT) ?
3175                                                 src_port : dst_port ;
3176                                         uint16_t *p =
3177                                             ((ipfw_insn_u16 *)cmd)->ports;
3178                                         int i;
3179
3180                                         for (i = cmdlen - 1; !match && i > 0;
3181                                              i--, p += 2) {
3182                                                 match =
3183                                                 (x >= p[0] && x <= p[1]);
3184                                         }
3185                                 }
3186                                 break;
3187
3188                         case O_ICMPTYPE:
3189                                 match = (offset == 0 && proto==IPPROTO_ICMP &&
3190                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3191                                 break;
3192
3193                         case O_IPOPT:
3194                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3195                                 break;
3196
3197                         case O_IPVER:
3198                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3199                                 break;
3200
3201                         case O_IPTTL:
3202                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3203                                 break;
3204
3205                         case O_IPID:
3206                                 match = (hlen > 0 &&
3207                                     cmd->arg1 == ntohs(ip->ip_id));
3208                                 break;
3209
3210                         case O_IPLEN:
3211                                 match = (hlen > 0 && cmd->arg1 == ip_len);
3212                                 break;
3213
3214                         case O_IPPRECEDENCE:
3215                                 match = (hlen > 0 &&
3216                                     (cmd->arg1 == (ip->ip_tos & 0xe0)));
3217                                 break;
3218
3219                         case O_IPTOS:
3220                                 match = (hlen > 0 &&
3221                                     flags_match(cmd, ip->ip_tos));
3222                                 break;
3223
3224                         case O_TCPFLAGS:
3225                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3226                                     flags_match(cmd,
3227                                         L3HDR(struct tcphdr,ip)->th_flags));
3228                                 break;
3229
3230                         case O_TCPOPTS:
3231                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3232                                     tcpopts_match(ip, cmd));
3233                                 break;
3234
3235                         case O_TCPSEQ:
3236                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3237                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3238                                         L3HDR(struct tcphdr,ip)->th_seq);
3239                                 break;
3240
3241                         case O_TCPACK:
3242                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3243                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3244                                         L3HDR(struct tcphdr,ip)->th_ack);
3245                                 break;
3246
3247                         case O_TCPWIN:
3248                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3249                                     cmd->arg1 ==
3250                                         L3HDR(struct tcphdr,ip)->th_win);
3251                                 break;
3252
3253                         case O_ESTAB:
3254                                 /* reject packets which have SYN only */
3255                                 /* XXX should i also check for TH_ACK ? */
3256                                 match = (proto == IPPROTO_TCP && offset == 0 &&
3257                                     (L3HDR(struct tcphdr,ip)->th_flags &
3258                                      (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3259                                 break;
3260
3261                         case O_LOG:
3262                                 if (fw_verbose) {
3263                                         ipfw_log(ctx, f, hlen, args->eh, m,
3264                                             oif);
3265                                 }
3266                                 match = 1;
3267                                 break;
3268
3269                         case O_PROB:
3270                                 match = (krandom() <
3271                                         ((ipfw_insn_u32 *)cmd)->d[0]);
3272                                 break;
3273
3274                         /*
3275                          * The second set of opcodes represents 'actions',
3276                          * i.e. the terminal part of a rule once the packet
3277                          * matches all previous patterns.
3278                          * Typically there is only one action for each rule,
3279                          * and the opcode is stored at the end of the rule
3280                          * (but there are exceptions -- see below).
3281                          *
3282                          * In general, here we set retval and terminate the
3283                          * outer loop (would be a 'break 3' in some language,
3284                          * but we need to do a 'goto done').
3285                          *
3286                          * Exceptions:
3287                          * O_COUNT and O_SKIPTO actions:
3288                          *   instead of terminating, we jump to the next rule
3289                          *   ('goto next_rule', equivalent to a 'break 2'),
3290                          *   or to the SKIPTO target ('goto again' after
3291                          *   having set f, cmd and l), respectively.
3292                          *
3293                          * O_LIMIT and O_KEEP_STATE: these opcodes are
3294                          *   not real 'actions', and are stored right
3295                          *   before the 'action' part of the rule.
3296                          *   These opcodes try to install an entry in the
3297                          *   state tables; if successful, we continue with
3298                          *   the next opcode (match=1; break;), otherwise
3299                          *   the packet must be dropped ('goto done' after
3300                          *   setting retval).  If static rules are changed
3301                          *   during the state installation, the packet will
3302                          *   be dropped and rule's stats will not be updated
3303                          *   ('return IP_FW_DENY').
3304                          *
3305                          * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3306                          *   cause a lookup of the state table, and a jump
3307                          *   to the 'action' part of the parent rule
3308                          *   ('goto check_body') if an entry is found, or
3309                          *   (CHECK_STATE only) a jump to the next rule if
3310                          *   the entry is not found ('goto next_rule').
3311                          *   The result of the lookup is cached so that
3312                          *   further instances of these opcodes are
3313                          *   effectively NOPs.  If static rules are changed
3314                          *   during the state looking up, the packet will
3315                          *   be dropped and rule's stats will not be updated
3316                          *   ('return IP_FW_DENY').
3317                          */
3318                         case O_LIMIT:
3319                         case O_KEEP_STATE:
3320                                 if (ipfw_state_install(ctx, f,
3321                                     (ipfw_insn_limit *)cmd, args,
3322                                     (offset == 0 && proto == IPPROTO_TCP) ?
3323                                     L3HDR(struct tcphdr, ip) : NULL)) {
3324                                         retval = IP_FW_DENY;
3325                                         goto done; /* error/limit violation */
3326                                 }
3327                                 match = 1;
3328                                 break;
3329
3330                         case O_PROBE_STATE:
3331                         case O_CHECK_STATE:
3332                                 /*
3333                                  * States are checked at the first keep-state or
3334                                  * check-state occurrence, with the result
3335                                  * being stored in dyn_dir.  The compiler
3336                                  * introduces a PROBE_STATE instruction for
3337                                  * us when we have a KEEP_STATE/LIMIT (because
3338                                  * PROBE_STATE needs to be run first).
3339                                  */
3340                                 if (dyn_dir == MATCH_UNKNOWN) {
3341                                         dyn_f = ipfw_state_lookup_rule(ctx,
3342                                             &args->f_id, &dyn_dir,
3343                                             (offset == 0 &&
3344                                              proto == IPPROTO_TCP) ?
3345                                             L3HDR(struct tcphdr, ip) : NULL,
3346                                             ip_len);
3347                                         if (dyn_f != NULL) {
3348                                                 /*
3349                                                  * Found a rule from a state;
3350                                                  * jump to the 'action' part
3351                                                  * of the rule.
3352                                                  */
3353                                                 f = dyn_f;
3354                                                 cmd = ACTION_PTR(f);
3355                                                 l = f->cmd_len - f->act_ofs;
3356                                                 goto check_body;
3357                                         }
3358                                 }
3359                                 /*
3360                                  * State not found. If CHECK_STATE, skip to
3361                                  * next rule, if PROBE_STATE just ignore and
3362                                  * continue with next opcode.
3363                                  */
3364                                 if (cmd->opcode == O_CHECK_STATE)
3365                                         goto next_rule;
3366                                 match = 1;
3367                                 break;
3368
3369                         case O_ACCEPT:
3370                                 retval = IP_FW_PASS;    /* accept */
3371                                 goto done;
3372
3373                         case O_PIPE:
3374                         case O_QUEUE:
3375                                 args->rule = f; /* report matching rule */
3376                                 args->cookie = cmd->arg1;
3377                                 retval = IP_FW_DUMMYNET;
3378                                 goto done;
3379
3380                         case O_DIVERT:
3381                         case O_TEE:
3382                                 if (args->eh) /* not on layer 2 */
3383                                         break;
3384
3385                                 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
3386                                                  sizeof(*divinfo), M_NOWAIT);
3387                                 if (mtag == NULL) {
3388                                         retval = IP_FW_DENY;
3389                                         goto done;
3390                                 }
3391                                 divinfo = m_tag_data(mtag);
3392
3393                                 divinfo->skipto = f->rulenum;
3394                                 divinfo->port = cmd->arg1;
3395                                 divinfo->tee = (cmd->opcode == O_TEE);
3396                                 m_tag_prepend(m, mtag);
3397
3398                                 args->cookie = cmd->arg1;
3399                                 retval = (cmd->opcode == O_DIVERT) ?
3400                                          IP_FW_DIVERT : IP_FW_TEE;
3401                                 goto done;
3402
3403                         case O_COUNT:
3404                         case O_SKIPTO:
3405                                 f->pcnt++;      /* update stats */
3406                                 f->bcnt += ip_len;
3407                                 f->timestamp = time_second;
3408                                 if (cmd->opcode == O_COUNT)
3409                                         goto next_rule;
3410                                 /* handle skipto */
3411                                 if (f->next_rule == NULL)
3412                                         lookup_next_rule(f);
3413                                 f = f->next_rule;
3414                                 goto again;
3415
3416                         case O_REJECT:
3417                                 /*
3418                                  * Drop the packet and send a reject notice
3419                                  * if the packet is not ICMP (or is an ICMP
3420                                  * query), and it is not multicast/broadcast.
3421                                  */
3422                                 if (hlen > 0 &&
3423                                     (proto != IPPROTO_ICMP ||
3424                                      is_icmp_query(ip)) &&
3425                                     !(m->m_flags & (M_BCAST|M_MCAST)) &&
3426                                     !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
3427                                         /*
3428                                          * Update statistics before the possible
3429                                          * blocking 'send_reject'
3430                                          */
3431                                         f->pcnt++;
3432                                         f->bcnt += ip_len;
3433                                         f->timestamp = time_second;
3434
3435                                         send_reject(args, cmd->arg1,
3436                                             offset,ip_len);
3437                                         m = args->m;
3438
3439                                         /*
3440                                          * Return directly here, rule stats
3441                                          * have been updated above.
3442                                          */
3443                                         return IP_FW_DENY;
3444                                 }
3445                                 /* FALLTHROUGH */
3446                         case O_DENY:
3447                                 retval = IP_FW_DENY;
3448                                 goto done;
3449
3450                         case O_FORWARD_IP:
3451                                 if (args->eh)   /* not valid on layer2 pkts */
3452                                         break;
3453                                 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
3454                                         struct sockaddr_in *sin;
3455
3456                                         mtag = m_tag_get(PACKET_TAG_IPFORWARD,
3457                                                sizeof(*sin), M_NOWAIT);
3458                                         if (mtag == NULL) {
3459                                                 retval = IP_FW_DENY;
3460                                                 goto done;
3461                                         }
3462                                         sin = m_tag_data(mtag);
3463
3464                                         /* Structure copy */
3465                                         *sin = ((ipfw_insn_sa *)cmd)->sa;
3466
3467                                         m_tag_prepend(m, mtag);
3468                                         m->m_pkthdr.fw_flags |=
3469                                                 IPFORWARD_MBUF_TAGGED;
3470                                         m->m_pkthdr.fw_flags &=
3471                                                 ~BRIDGE_MBUF_TAGGED;
3472                                 }
3473                                 retval = IP_FW_PASS;
3474                                 goto done;
3475
3476                         default:
3477                                 panic("-- unknown opcode %d", cmd->opcode);
3478                         } /* end of switch() on opcodes */
3479
3480                         if (cmd->len & F_NOT)
3481                                 match = !match;
3482
3483                         if (match) {
3484                                 if (cmd->len & F_OR)
3485                                         skip_or = 1;
3486                         } else {
3487                                 if (!(cmd->len & F_OR)) /* not an OR block, */
3488                                         break;          /* try next rule    */
3489                         }
3490
3491                 }       /* end of inner for, scan opcodes */
3492
3493 next_rule:;             /* try next rule                */
3494
3495         }               /* end of outer for, scan rules */
3496         kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
3497         return IP_FW_DENY;
3498
3499 done:
3500         /* Update statistics */
3501         f->pcnt++;
3502         f->bcnt += ip_len;
3503         f->timestamp = time_second;
3504         return retval;
3505
3506 pullup_failed:
3507         if (fw_verbose)
3508                 kprintf("pullup failed\n");
3509         return IP_FW_DENY;
3510 }
3511
3512 static void
3513 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
3514 {
3515         struct m_tag *mtag;
3516         struct dn_pkt *pkt;
3517         ipfw_insn *cmd;
3518         const struct ipfw_flow_id *id;
3519         struct dn_flow_id *fid;
3520
3521         M_ASSERTPKTHDR(m);
3522
3523         mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt), M_NOWAIT);
3524         if (mtag == NULL) {
3525                 m_freem(m);
3526                 return;
3527         }
3528         m_tag_prepend(m, mtag);
3529
3530         pkt = m_tag_data(mtag);
3531         bzero(pkt, sizeof(*pkt));
3532
3533         cmd = fwa->rule->cmd + fwa->rule->act_ofs;
3534         if (cmd->opcode == O_LOG)
3535                 cmd += F_LEN(cmd);
3536         KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
3537                 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
3538
3539         pkt->dn_m = m;
3540         pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
3541         pkt->ifp = fwa->oif;
3542         pkt->pipe_nr = pipe_nr;
3543
3544         pkt->cpuid = mycpuid;
3545         pkt->msgport = netisr_curport();
3546
3547         id = &fwa->f_id;
3548         fid = &pkt->id;
3549         fid->fid_dst_ip = id->dst_ip;
3550         fid->fid_src_ip = id->src_ip;
3551         fid->fid_dst_port = id->dst_port;
3552         fid->fid_src_port = id->src_port;
3553         fid->fid_proto = id->proto;
3554         fid->fid_flags = id->flags;
3555
3556         ipfw_ref_rule(fwa->rule);
3557         pkt->dn_priv = fwa->rule;
3558         pkt->dn_unref_priv = ipfw_unref_rule;
3559
3560         if (cmd->opcode == O_PIPE)
3561                 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
3562
3563         m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
3564 }
3565
3566 /*
3567  * When a rule is added/deleted, clear the next_rule pointers in all rules.
3568  * These will be reconstructed on the fly as packets are matched.
3569  */
3570 static void
3571 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
3572 {
3573         struct ip_fw *rule;
3574
3575         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
3576                 rule->next_rule = NULL;
3577 }
3578
3579 static __inline void
3580 ipfw_inc_static_count(struct ip_fw *rule)
3581 {
3582         /* Static rule's counts are updated only on CPU0 */
3583         KKASSERT(mycpuid == 0);
3584
3585         static_count++;
3586         static_ioc_len += IOC_RULESIZE(rule);
3587 }
3588
3589 static __inline void
3590 ipfw_dec_static_count(struct ip_fw *rule)
3591 {
3592         int l = IOC_RULESIZE(rule);
3593
3594         /* Static rule's counts are updated only on CPU0 */
3595         KKASSERT(mycpuid == 0);
3596
3597         KASSERT(static_count > 0, ("invalid static count %u", static_count));
3598         static_count--;
3599
3600         KASSERT(static_ioc_len >= l,
3601                 ("invalid static len %u", static_ioc_len));
3602         static_ioc_len -= l;
3603 }
3604
3605 static void
3606 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
3607 {
3608         if (fwmsg->sibling != NULL) {
3609                 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
3610                 fwmsg->sibling->sibling = rule;
3611         }
3612         fwmsg->sibling = rule;
3613 }
3614
3615 static struct ip_fw *
3616 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3617 {
3618         struct ip_fw *rule;
3619
3620         rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
3621
3622         rule->act_ofs = ioc_rule->act_ofs;
3623         rule->cmd_len = ioc_rule->cmd_len;
3624         rule->rulenum = ioc_rule->rulenum;
3625         rule->set = ioc_rule->set;
3626         rule->usr_flags = ioc_rule->usr_flags;
3627
3628         bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
3629
3630         rule->refcnt = 1;
3631         rule->cpuid = mycpuid;
3632         rule->rule_flags = rule_flags;
3633
3634         return rule;
3635 }
3636
/*
 * Per-CPU handler for installing a new rule: create this CPU's private
 * copy of the rule, splice it into the local rule chain at the position
 * pre-computed on CPU0, and forward the message to the next CPU.
 * Runs in netisr context on every CPU in turn (netisr_domsg_global).
 */
static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

	/*
	 * Insert rule into the pre-determined position
	 */
	if (fwmsg->prev_rule != NULL) {
		struct ip_fw *prev, *next;

		/* Both anchor rules must be this CPU's copies. */
		prev = fwmsg->prev_rule;
		KKASSERT(prev->cpuid == mycpuid);

		next = fwmsg->next_rule;
		KKASSERT(next->cpuid == mycpuid);

		rule->next = next;
		prev->next = rule;

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		fwmsg->prev_rule = prev->sibling;
		fwmsg->next_rule = next->sibling;
	} else {
		/* No predecessor: insert at the head of the chain. */
		KKASSERT(fwmsg->next_rule == NULL);
		rule->next = ctx->ipfw_layer3_chain;
		ctx->ipfw_layer3_chain = rule;
	}

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, rule);

	/* Invalidate cached next_rule pointers; rebuilt lazily. */
	ipfw_flush_rule_ptrs(ctx);

	if (mycpuid == 0) {
		/* Statistics only need to be updated once */
		ipfw_inc_static_count(rule);

		/* Return the rule on CPU0 */
		nmsg->lmsg.u.ms_resultp = rule;
	}

	/*
	 * ms_resultp still holds the CPU0 copy set above when the msg was
	 * on CPU0, so every CPU records the same track rule id.
	 */
	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
3693
/*
 * Add a new rule to the list.  Copy the rule into a malloc'ed area,
 * then possibly create a rule number and add the rule to the list.
 * Update the rule_number in the input struct so the caller knows
 * it as well.
 *
 * Must be called from netisr0; the rule is replicated onto every
 * netisr CPU via ipfw_add_rule_dispatch().
 */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_ipfw fwmsg;
	struct netmsg_base *nmsg;
	struct ip_fw *f, *prev, *rule;

	ASSERT_NETISR0;

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add rule number incremental step.
	 */
	if (ioc_rule->rulenum == 0) {
		int step = autoinc_step;

		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
			 step <= IPFW_AUTOINC_STEP_MAX);

		/*
		 * Locate the highest numbered rule before default
		 */
		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			ioc_rule->rulenum = f->rulenum;
		}
		/* Only bump if the result stays below the default rule. */
		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
			ioc_rule->rulenum += step;
	}
	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
		ioc_rule->rulenum != 0,
		("invalid rule num %d", ioc_rule->rulenum));

	/*
	 * Now find the right place for the new rule in the sorted list.
	 */
	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
	     prev = f, f = f->next) {
		if (f->rulenum > ioc_rule->rulenum) {
			/* Found the location */
			break;
		}
	}
	/* The default rule sorts last, so 'f' can never run off the end. */
	KASSERT(f != NULL, ("no default rule?!"));

	/*
	 * Duplicate the rule onto each CPU.
	 * The rule duplicated on CPU0 will be returned.
	 */
	bzero(&fwmsg, sizeof(fwmsg));
	nmsg = &fwmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_add_rule_dispatch);
	fwmsg.ioc_rule = ioc_rule;
	fwmsg.prev_rule = prev;
	fwmsg.next_rule = prev == NULL ? NULL : f;
	fwmsg.rule_flags = rule_flags;

	/* Runs the dispatch on every netisr CPU, in CPU order. */
	netisr_domsg_global(nmsg);
	/* Dispatch consumed both anchors on the last CPU. */
	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

	rule = nmsg->lmsg.u.ms_resultp;
	KKASSERT(rule != NULL && rule->cpuid == mycpuid);

	DPRINTF("++ installed rule %d, static count now %d\n",
		rule->rulenum, static_count);
}
3769
/*
 * Free storage associated with a static rule (including derived
 * states/tracks).
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
 *
 * 'prev' may be NULL, meaning 'rule' is the head of the chain.
 */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
		 struct ip_fw *prev, struct ip_fw *rule)
{
	struct ip_fw *n;

	/* Unlink 'rule' from this CPU's chain. */
	n = rule->next;
	if (prev == NULL)
		ctx->ipfw_layer3_chain = n;
	else
		prev->next = n;

	/* Mark the rule as invalid */
	rule->rule_flags |= IPFW_RULE_F_INVALID;
	rule->next_rule = NULL;
	rule->sibling = NULL;
#ifdef foo
	/* Don't reset cpuid here; keep various assertion working */
	rule->cpuid = -1;
#endif

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_dec_static_count(rule);

	/* Try to free this rule */
	ipfw_free_rule(rule);

	/* Return the next rule */
	return n;
}
3809
/*
 * Per-CPU handler for ipfw_flush(): tear down this CPU's states,
 * tracks and static rules (keeping the default rule unless
 * 'kill_default' is set), then forward the message to the next CPU.
 */
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
	int kill_default = nmsg->lmsg.u.ms_result;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * Flush states.
	 */
	ipfw_state_flush(ctx, NULL);
	KASSERT(ctx->ipfw_state_cnt == 0,
	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
	ctx->ipfw_state_loosecnt = 0;
	ctx->ipfw_state_lastexp = 0;

	/*
	 * Flush tracks.
	 */
	ipfw_track_flush(ctx, NULL);
	ctx->ipfw_track_lastexp = 0;
	if (ctx->ipfw_trkcnt_spare != NULL) {
		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
		ctx->ipfw_trkcnt_spare = NULL;
	}

	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

	/* Delete rules from the head until (possibly) the default rule. */
	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
		ipfw_delete_rule(ctx, NULL, rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
3846
/*
 * Deletes all rules from a chain (including the default rule
 * if the second argument is set).
 *
 * Must be called from netisr0; the actual per-CPU teardown is done
 * by ipfw_flush_dispatch() on every netisr CPU.
 */
static void
ipfw_flush(int kill_default)
{
	struct netmsg_base nmsg;
#ifdef INVARIANTS
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int state_cnt;
#endif

	ASSERT_NETISR0;

	/*
	 * If 'kill_default' then caller has done the necessary
	 * msgport syncing; unnecessary to do it again.
	 */
	if (!kill_default) {
		/*
		 * Let ipfw_chk() know the rules are going to
		 * be flushed, so it could jump directly to
		 * the default rule.
		 */
		ipfw_flushing = 1;
		/* XXX use priority sync */
		netmsg_service_sync();
	}

	/*
	 * Press the 'flush' button
	 */
	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_flush_dispatch);
	nmsg.lmsg.u.ms_result = kill_default;
	netisr_domsg_global(&nmsg);
	/* All per-CPU state/track data is gone; reset the global view. */
	ipfw_gd.ipfw_state_loosecnt = 0;
	ipfw_gd.ipfw_state_globexp = 0;
	ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
	/* Verify that nothing survived the flush. */
	state_cnt = ipfw_state_cntcoll();
	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

	if (kill_default) {
		KASSERT(static_count == 0,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == 0,
			("%u bytes of static rules remain", static_ioc_len));
	} else {
		/* Only the default rule should be left. */
		KASSERT(static_count == 1,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
			("%u bytes of static rules remain, should be %lu",
			 static_ioc_len,
			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
	}
#endif

	/* Flush is done */
	ipfw_flushing = 0;
}
3914
/*
 * Per-cpu dispatcher for ipfw_alt_delete_rule(): deletes every rule
 * carrying dmsg->rulenum on this cpu, then forwards the message to
 * the next cpu.  dmsg->start_rule/prev_rule are advanced through the
 * rules' sibling links (this cpu's duplicate -> next cpu's duplicate)
 * BEFORE deletion, so the pointers stay valid on the next hop.
 */
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
        struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
        struct ipfw_context *ctx = ipfw_ctx[mycpuid];
        struct ip_fw *rule, *prev;

        ASSERT_NETISR_NCPUS(mycpuid);

        rule = dmsg->start_rule;
        KKASSERT(rule->cpuid == mycpuid);
        dmsg->start_rule = rule->sibling;

        prev = dmsg->prev_rule;
        if (prev != NULL) {
                KKASSERT(prev->cpuid == mycpuid);

                /*
                 * Move to the position on the next CPU
                 * before the msg is forwarded.
                 */
                dmsg->prev_rule = prev->sibling;
        }

        /*
         * flush pointers outside the loop, then delete all matching
         * rules.  'prev' remains the same throughout the cycle.
         */
        ipfw_flush_rule_ptrs(ctx);
        while (rule && rule->rulenum == dmsg->rulenum) {
                if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
                        /* Flush states generated by this rule. */
                        ipfw_state_flush(ctx, rule);
                }
                if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
                        /* Flush tracks generated by this rule. */
                        ipfw_track_flush(ctx, rule);
                }
                /* ipfw_delete_rule() returns the successor of 'rule'. */
                rule = ipfw_delete_rule(ctx, prev, rule);
        }

        netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
3958
3959 static int
3960 ipfw_alt_delete_rule(uint16_t rulenum)
3961 {
3962         struct ip_fw *prev, *rule;
3963         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3964         struct netmsg_del dmsg;
3965
3966         ASSERT_NETISR0;
3967
3968         /*
3969          * Locate first rule to delete
3970          */
3971         for (prev = NULL, rule = ctx->ipfw_layer3_chain;
3972              rule && rule->rulenum < rulenum;
3973              prev = rule, rule = rule->next)
3974                 ; /* EMPTY */
3975         if (rule->rulenum != rulenum)
3976                 return EINVAL;
3977
3978         /*
3979          * Get rid of the rule duplications on all CPUs
3980          */
3981         bzero(&dmsg, sizeof(dmsg));
3982         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
3983             ipfw_alt_delete_rule_dispatch);
3984         dmsg.prev_rule = prev;
3985         dmsg.start_rule = rule;
3986         dmsg.rulenum = rulenum;
3987
3988         netisr_domsg_global(&dmsg.base);
3989         KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
3990         return 0;
3991 }
3992
/*
 * Per-cpu dispatcher for ipfw_alt_delete_ruleset(): deletes every
 * rule belonging to dmsg->from_set on this cpu, then forwards the
 * message to the next cpu.  The caller has verified the set is
 * non-empty, hence the KASSERT below.
 */
static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
        struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
        struct ipfw_context *ctx = ipfw_ctx[mycpuid];
        struct ip_fw *prev, *rule;
#ifdef INVARIANTS
        int del = 0;
#endif

        ASSERT_NETISR_NCPUS(mycpuid);

        ipfw_flush_rule_ptrs(ctx);

        prev = NULL;
        rule = ctx->ipfw_layer3_chain;
        while (rule != NULL) {
                if (rule->set == dmsg->from_set) {
                        if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
                                /* Flush states generated by this rule. */
                                ipfw_state_flush(ctx, rule);
                        }
                        if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
                                /* Flush tracks generated by this rule. */
                                ipfw_track_flush(ctx, rule);
                        }
                        /* 'prev' stays put; only 'rule' advances. */
                        rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
                        del = 1;
#endif
                } else {
                        prev = rule;
                        rule = rule->next;
                }
        }
        /* KASSERT expands to nothing without INVARIANTS, as does 'del'. */
        KASSERT(del, ("no match set?!"));

        netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4032
4033 static int
4034 ipfw_alt_delete_ruleset(uint8_t set)
4035 {
4036         struct netmsg_del dmsg;
4037         int del;
4038         struct ip_fw *rule;
4039         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4040
4041         ASSERT_NETISR0;
4042
4043         /*
4044          * Check whether the 'set' exists.  If it exists,
4045          * then check whether any rules within the set will
4046          * try to create states.
4047          */
4048         del = 0;
4049         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4050                 if (rule->set == set)
4051                         del = 1;
4052         }
4053         if (!del)
4054                 return 0; /* XXX EINVAL? */
4055
4056         /*
4057          * Delete this set
4058          */
4059         bzero(&dmsg, sizeof(dmsg));
4060         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4061             ipfw_alt_delete_ruleset_dispatch);
4062         dmsg.from_set = set;
4063         netisr_domsg_global(&dmsg.base);
4064
4065         return 0;
4066 }
4067
4068 static void
4069 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
4070 {
4071         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4072         struct ip_fw *rule;
4073
4074         ASSERT_NETISR_NCPUS(mycpuid);
4075
4076         rule = dmsg->start_rule;
4077         KKASSERT(rule->cpuid == mycpuid);
4078
4079         /*
4080          * Move to the position on the next CPU
4081          * before the msg is forwarded.
4082          */
4083         dmsg->start_rule = rule->sibling;
4084
4085         while (rule && rule->rulenum <= dmsg->rulenum) {
4086                 if (rule->rulenum == dmsg->rulenum)
4087                         rule->set = dmsg->to_set;
4088                 rule = rule->next;
4089         }
4090         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4091 }
4092
4093 static int
4094 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
4095 {
4096         struct netmsg_del dmsg;
4097         struct netmsg_base *nmsg;
4098         struct ip_fw *rule;
4099         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4100
4101         ASSERT_NETISR0;
4102
4103         /*
4104          * Locate first rule to move
4105          */
4106         for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
4107              rule = rule->next) {
4108                 if (rule->rulenum == rulenum && rule->set != set)
4109                         break;
4110         }
4111         if (rule == NULL || rule->rulenum > rulenum)
4112                 return 0; /* XXX error? */
4113
4114         bzero(&dmsg, sizeof(dmsg));
4115         nmsg = &dmsg.base;
4116         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4117             ipfw_alt_move_rule_dispatch);
4118         dmsg.start_rule = rule;
4119         dmsg.rulenum = rulenum;
4120         dmsg.to_set = set;
4121
4122         netisr_domsg_global(nmsg);
4123         KKASSERT(dmsg.start_rule == NULL);
4124         return 0;
4125 }
4126
4127 static void
4128 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
4129 {
4130         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4131         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4132         struct ip_fw *rule;
4133
4134         ASSERT_NETISR_NCPUS(mycpuid);
4135
4136         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4137                 if (rule->set == dmsg->from_set)
4138                         rule->set = dmsg->to_set;
4139         }
4140         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4141 }
4142
4143 static int
4144 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
4145 {
4146         struct netmsg_del dmsg;
4147         struct netmsg_base *nmsg;
4148
4149         ASSERT_NETISR0;
4150
4151         bzero(&dmsg, sizeof(dmsg));
4152         nmsg = &dmsg.base;
4153         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4154             ipfw_alt_move_ruleset_dispatch);
4155         dmsg.from_set = from_set;
4156         dmsg.to_set = to_set;
4157
4158         netisr_domsg_global(nmsg);
4159         return 0;
4160 }
4161
4162 static void
4163 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
4164 {
4165         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4166         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4167         struct ip_fw *rule;
4168
4169         ASSERT_NETISR_NCPUS(mycpuid);
4170
4171         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4172                 if (rule->set == dmsg->from_set)
4173                         rule->set = dmsg->to_set;
4174                 else if (rule->set == dmsg->to_set)
4175                         rule->set = dmsg->from_set;
4176         }
4177         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4178 }
4179
4180 static int
4181 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
4182 {
4183         struct netmsg_del dmsg;
4184         struct netmsg_base *nmsg;
4185
4186         ASSERT_NETISR0;
4187
4188         bzero(&dmsg, sizeof(dmsg));
4189         nmsg = &dmsg.base;
4190         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4191             ipfw_alt_swap_ruleset_dispatch);
4192         dmsg.from_set = set1;
4193         dmsg.to_set = set2;
4194
4195         netisr_domsg_global(nmsg);
4196         return 0;
4197 }
4198
4199 /*
4200  * Remove all rules with given number, and also do set manipulation.
4201  *
4202  * The argument is an uint32_t. The low 16 bit are the rule or set number,
4203  * the next 8 bits are the new set, the top 8 bits are the command:
4204  *
4205  *      0       delete rules with given number
4206  *      1       delete rules with given set number
4207  *      2       move rules with given number to new set
4208  *      3       move rules with given set number to new set
4209  *      4       swap sets with given numbers
4210  */
4211 static int
4212 ipfw_ctl_alter(uint32_t arg)
4213 {
4214         uint16_t rulenum;
4215         uint8_t cmd, new_set;
4216         int error = 0;
4217
4218         ASSERT_NETISR0;
4219
4220         rulenum = arg & 0xffff;
4221         cmd = (arg >> 24) & 0xff;
4222         new_set = (arg >> 16) & 0xff;
4223
4224         if (cmd > 4)
4225                 return EINVAL;
4226         if (new_set >= IPFW_DEFAULT_SET)
4227                 return EINVAL;
4228         if (cmd == 0 || cmd == 2) {
4229                 if (rulenum == IPFW_DEFAULT_RULE)
4230                         return EINVAL;
4231         } else {
4232                 if (rulenum >= IPFW_DEFAULT_SET)
4233                         return EINVAL;
4234         }
4235
4236         switch (cmd) {
4237         case 0: /* delete rules with given number */
4238                 error = ipfw_alt_delete_rule(rulenum);
4239                 break;
4240
4241         case 1: /* delete all rules with given set number */
4242                 error = ipfw_alt_delete_ruleset(rulenum);
4243                 break;
4244
4245         case 2: /* move rules with given number to new set */
4246                 error = ipfw_alt_move_rule(rulenum, new_set);
4247                 break;
4248
4249         case 3: /* move rules with given set number to new set */
4250                 error = ipfw_alt_move_ruleset(rulenum, new_set);
4251                 break;
4252
4253         case 4: /* swap two sets */
4254                 error = ipfw_alt_swap_ruleset(rulenum, new_set);
4255                 break;
4256         }
4257         return error;
4258 }
4259
4260 /*
4261  * Clear counters for a specific rule.
4262  */
4263 static void
4264 clear_counters(struct ip_fw *rule, int log_only)
4265 {
4266         ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
4267
4268         if (log_only == 0) {
4269                 rule->bcnt = rule->pcnt = 0;
4270                 rule->timestamp = 0;
4271         }
4272         if (l->o.opcode == O_LOG)
4273                 l->log_left = l->max_log;
4274 }
4275
/*
 * Per-cpu dispatcher for ipfw_ctl_zero_entry(): clears counters on
 * this cpu's rule duplicates, then forwards the message to the next
 * cpu.  rulenum == 0 means "all rules".
 */
static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
        struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
        struct ipfw_context *ctx = ipfw_ctx[mycpuid];
        struct ip_fw *rule;

        ASSERT_NETISR_NCPUS(mycpuid);

        if (zmsg->rulenum == 0) {
                KKASSERT(zmsg->start_rule == NULL);

                ctx->ipfw_norule_counter = 0;
                for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
                        clear_counters(rule, zmsg->log_only);
        } else {
                struct ip_fw *start = zmsg->start_rule;

                /* 'start' is this cpu's duplicate of the target rule. */
                KKASSERT(start->cpuid == mycpuid);
                KKASSERT(start->rulenum == zmsg->rulenum);

                /*
                 * We can have multiple rules with the same number, so we
                 * need to clear them all.
                 */
                for (rule = start; rule && rule->rulenum == zmsg->rulenum;
                     rule = rule->next)
                        clear_counters(rule, zmsg->log_only);

                /*
                 * Move to the position on the next CPU
                 * before the msg is forwarded.
                 */
                zmsg->start_rule = start->sibling;
        }
        netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4313
/*
 * Reset some or all counters on firewall rules.
 * @arg frwl is null to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 *
 * Runs on netisr0.  Returns 0 on success, EINVAL if 'rulenum' is
 * non-zero and no rule with that number exists.
 */
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
        struct netmsg_zent zmsg;
        struct netmsg_base *nmsg;
        const char *msg;
        struct ipfw_context *ctx = ipfw_ctx[mycpuid];

        ASSERT_NETISR0;

        bzero(&zmsg, sizeof(zmsg));
        nmsg = &zmsg.base;
        netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
            ipfw_zero_entry_dispatch);
        zmsg.log_only = log_only;

        if (rulenum == 0) {
                /*
                 * These messages contain no conversion specifier, so
                 * the trailing 'rulenum' argument to log() below is
                 * simply ignored.
                 */
                msg = log_only ? "ipfw: All logging counts reset.\n"
                               : "ipfw: Accounting cleared.\n";
        } else {
                struct ip_fw *rule;

                /*
                 * Locate the first rule with 'rulenum'
                 */
                for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
                        if (rule->rulenum == rulenum)
                                break;
                }
                if (rule == NULL) /* we did not find any matching rules */
                        return (EINVAL);
                zmsg.start_rule = rule;
                zmsg.rulenum = rulenum;

                msg = log_only ? "ipfw: Entry %d logging count reset.\n"
                               : "ipfw: Entry %d cleared.\n";
        }
        netisr_domsg_global(nmsg);
        /* The cursor must have walked off the last cpu's siblings. */
        KKASSERT(zmsg.start_rule == NULL);

        if (fw_verbose)
                log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
        return (0);
}
4364
/*
 * Check validity of the structure before insert.
 * Fortunately rules are simple, so this mostly need to check rule sizes.
 *
 * 'size' is the byte count received from userland; it must match the
 * size implied by the rule's own cmd_len.  On success *rule_flags is
 * set with IPFW_RULE_F_GENSTATE/GENTRACK when the rule will create
 * dynamic states/tracks.  Returns 0 or EINVAL.
 */
static int
ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
{
        int l, cmdlen = 0;
        int have_action = 0;
        ipfw_insn *cmd;

        *rule_flags = 0;

        /* Check for valid size */
        if (size < sizeof(*rule)) {
                kprintf("ipfw: rule too short\n");
                return EINVAL;
        }
        l = IOC_RULESIZE(rule);
        if (l != size) {
                kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
                return EINVAL;
        }

        /* Check rule number */
        if (rule->rulenum == IPFW_DEFAULT_RULE) {
                kprintf("ipfw: invalid rule number\n");
                return EINVAL;
        }

        /*
         * Now go for the individual checks. Very simple ones, basically only
         * instruction sizes.
         */
        for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
             l -= cmdlen, cmd += cmdlen) {
                cmdlen = F_LEN(cmd);
                if (cmdlen > l) {
                        kprintf("ipfw: opcode %d size truncated\n",
                                cmd->opcode);
                        return EINVAL;
                }

                DPRINTF("ipfw: opcode %d\n", cmd->opcode);

                if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT) {
                        /* This rule will generate states. */
                        *rule_flags |= IPFW_RULE_F_GENSTATE;
                        if (cmd->opcode == O_LIMIT)
                                *rule_flags |= IPFW_RULE_F_GENTRACK;
                }

                switch (cmd->opcode) {
                /* Plain one-word instructions. */
                case O_NOP:
                case O_PROBE_STATE:
                case O_KEEP_STATE:
                case O_PROTO:
                case O_IP_SRC_ME:
                case O_IP_DST_ME:
                case O_LAYER2:
                case O_IN:
                case O_FRAG:
                case O_IPOPT:
                case O_IPLEN:
                case O_IPID:
                case O_IPTOS:
                case O_IPPRECEDENCE:
                case O_IPTTL:
                case O_IPVER:
                case O_TCPWIN:
                case O_TCPFLAGS:
                case O_TCPOPTS:
                case O_ESTAB:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn))
                                goto bad_size;
                        break;

                case O_IP_SRC_TABLE:
                case O_IP_DST_TABLE:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn))
                                goto bad_size;
                        /* arg1 is the table id; bound it. */
                        if (cmd->arg1 >= ipfw_table_max) {
                                kprintf("ipfw: invalid table id %u, max %d\n",
                                    cmd->arg1, ipfw_table_max);
                                return EINVAL;
                        }
                        break;

                /* Instructions carrying one 32-bit operand. */
                case O_UID:
                case O_GID:
                case O_IP_SRC:
                case O_IP_DST:
                case O_TCPSEQ:
                case O_TCPACK:
                case O_PROB:
                case O_ICMPTYPE:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
                                goto bad_size;
                        break;

                case O_LIMIT:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
                                goto bad_size;
                        break;

                case O_LOG:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
                                goto bad_size;

                        /* Start with a full logging budget. */
                        ((ipfw_insn_log *)cmd)->log_left =
                            ((ipfw_insn_log *)cmd)->max_log;

                        break;

                case O_IP_SRC_MASK:
                case O_IP_DST_MASK:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
                                goto bad_size;
                        /* An all-zero mask would match every address. */
                        if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
                                kprintf("ipfw: opcode %d, useless rule\n",
                                        cmd->opcode);
                                return EINVAL;
                        }
                        break;

                case O_IP_SRC_SET:
                case O_IP_DST_SET:
                        /* arg1 is the bitmap width in addresses. */
                        if (cmd->arg1 == 0 || cmd->arg1 > 256) {
                                kprintf("ipfw: invalid set size %d\n",
                                        cmd->arg1);
                                return EINVAL;
                        }
                        /* One u32 header plus ceil(arg1/32) bitmap words. */
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
                            (cmd->arg1+31)/32 )
                                goto bad_size;
                        break;

                case O_MACADDR2:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
                                goto bad_size;
                        break;

                case O_MAC_TYPE:
                case O_IP_SRCPORT:
                case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
                        if (cmdlen < 2 || cmdlen > 31)
                                goto bad_size;
                        break;

                case O_RECV:
                case O_XMIT:
                case O_VIA:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
                                goto bad_size;
                        break;

                case O_PIPE:
                case O_QUEUE:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
                                goto bad_size;
                        goto check_action;

                case O_FORWARD_IP:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
                                goto bad_size;
                        } else {
                                in_addr_t fwd_addr;

                                fwd_addr = ((ipfw_insn_sa *)cmd)->
                                           sa.sin_addr.s_addr;
                                /* Forwarding to multicast makes no sense. */
                                if (IN_MULTICAST(ntohl(fwd_addr))) {
                                        kprintf("ipfw: try forwarding to "
                                                "multicast address\n");
                                        return EINVAL;
                                }
                        }
                        goto check_action;

                case O_FORWARD_MAC: /* XXX not implemented yet */
                case O_CHECK_STATE:
                case O_COUNT:
                case O_ACCEPT:
                case O_DENY:
                case O_REJECT:
                case O_SKIPTO:
                case O_DIVERT:
                case O_TEE:
                        if (cmdlen != F_INSN_SIZE(ipfw_insn))
                                goto bad_size;
check_action:
                        /* Exactly one action, and it must be last. */
                        if (have_action) {
                                kprintf("ipfw: opcode %d, multiple actions"
                                        " not allowed\n",
                                        cmd->opcode);
                                return EINVAL;
                        }
                        have_action = 1;
                        if (l != cmdlen) {
                                kprintf("ipfw: opcode %d, action must be"
                                        " last opcode\n",
                                        cmd->opcode);
                                return EINVAL;
                        }
                        break;
                default:
                        kprintf("ipfw: opcode %d, unknown opcode\n",
                                cmd->opcode);
                        return EINVAL;
                }
        }
        if (have_action == 0) {
                kprintf("ipfw: missing action\n");
                return EINVAL;
        }
        return 0;

bad_size:
        kprintf("ipfw: opcode %d size %d wrong\n",
                cmd->opcode, cmdlen);
        return EINVAL;
}
4586
/*
 * sockopt handler for adding a rule: validate the userland rule and
 * insert it on all cpus via ipfw_add_rule().
 *
 * Returns 0 or an errno from validation.
 */
static int
ipfw_ctl_add_rule(struct sockopt *sopt)
{
        struct ipfw_ioc_rule *ioc_rule;
        size_t size;
        uint32_t rule_flags;
        int error;

        ASSERT_NETISR0;
        
        size = sopt->sopt_valsize;
        if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
            size < sizeof(*ioc_rule)) {
                return EINVAL;
        }
        if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
                /*
                 * Grow the buffer to the maximum rule size so the same
                 * storage can carry the reply.  sopt->sopt_val is
                 * replaced in place; presumably the sockopt owner frees
                 * it later -- NOTE(review): confirm against the caller.
                 */
                sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
                                          IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
        }
        ioc_rule = sopt->sopt_val;

        /* Validate using the ORIGINAL size, not the grown buffer. */
        error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
        if (error)
                return error;

        ipfw_add_rule(ioc_rule, rule_flags);

        if (sopt->sopt_dir == SOPT_GET)
                sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
        return 0;
}
4618
/*
 * Copy one static rule (cpu0's master copy) into the userland ioc
 * representation, summing packet/byte counters over all per-cpu
 * duplicates via the sibling links.
 *
 * Returns the address just past the written ioc rule, i.e. where the
 * next rule should be copied.
 */
static void *
ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
    struct ipfw_ioc_rule *ioc_rule)
{
        const struct ip_fw *sibling;
#ifdef INVARIANTS
        int i;
#endif

        ASSERT_NETISR0;
        KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));

        ioc_rule->act_ofs = rule->act_ofs;
        ioc_rule->cmd_len = rule->cmd_len;
        ioc_rule->rulenum = rule->rulenum;
        ioc_rule->set = rule->set;
        ioc_rule->usr_flags = rule->usr_flags;

        ioc_rule->set_disable = ctx->ipfw_set_disable;
        ioc_rule->static_count = static_count;
        ioc_rule->static_len = static_ioc_len;

        /*
         * Visit (read-only) all of the rule's duplications to get
         * the necessary statistics
         */
#ifdef INVARIANTS
        i = 0;
#endif
        ioc_rule->pcnt = 0;
        ioc_rule->bcnt = 0;
        ioc_rule->timestamp = 0;
        for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
                ioc_rule->pcnt += sibling->pcnt;
                ioc_rule->bcnt += sibling->bcnt;
                /* Report the most recent timestamp among all cpus. */
                if (sibling->timestamp > ioc_rule->timestamp)
                        ioc_rule->timestamp = sibling->timestamp;
#ifdef INVARIANTS
                ++i;
#endif
        }
        /* KASSERT (and 'i') compile away without INVARIANTS. */
        KASSERT(i == netisr_ncpus,
            ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));

        bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);

        return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
}
4667
4668 static boolean_t
4669 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
4670 {
4671         struct ipfw_ioc_flowid *ioc_id;
4672
4673         if (trk->tc_expire == 0) {
4674                 /* Not a scanned one. */
4675                 return (FALSE);
4676         }
4677
4678         ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
4679             0 : trk->tc_expire - time_uptime;
4680         ioc_state->pcnt = 0;
4681         ioc_state->bcnt = 0;
4682
4683         ioc_state->dyn_type = O_LIMIT_PARENT;
4684         ioc_state->count = trk->tc_count;
4685
4686         ioc_state->rulenum = trk->tc_rulenum;
4687
4688         ioc_id = &ioc_state->id;
4689         ioc_id->type = ETHERTYPE_IP;
4690         ioc_id->u.ip.proto = trk->tc_proto;
4691         ioc_id->u.ip.src_ip = trk->tc_saddr;
4692         ioc_id->u.ip.dst_ip = trk->tc_daddr;
4693         ioc_id->u.ip.src_port = trk->tc_sport;
4694         ioc_id->u.ip.dst_port = trk->tc_dport;
4695
4696         return (TRUE);
4697 }
4698
4699 static boolean_t
4700 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
4701 {
4702         struct ipfw_ioc_flowid *ioc_id;
4703
4704         if (s->st_type == O_ANCHOR)
4705                 return (FALSE);
4706
4707         ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
4708             0 : s->st_expire - time_uptime;
4709         ioc_state->pcnt = s->st_pcnt;
4710         ioc_state->bcnt = s->st_bcnt;
4711
4712         ioc_state->dyn_type = s->st_type;
4713         ioc_state->count = 0;
4714
4715         ioc_state->rulenum = s->st_rule->rulenum;
4716
4717         ioc_id = &ioc_state->id;
4718         ioc_id->type = ETHERTYPE_IP;
4719         ioc_id->u.ip.proto = s->st_proto;
4720         ipfw_key_4tuple(&s->st_key,
4721             &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
4722             &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
4723
4724         return (TRUE);
4725 }
4726
/*
 * Per-cpu dispatcher that copies dynamic states (and, on the last
 * cpu, tracks) into the userland buffer carried by 'nm'.  Forwards
 * the message cpu by cpu; replies early once the buffer is full.
 */
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
        struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
        struct ipfw_context *ctx = ipfw_ctx[mycpuid];
        const struct ipfw_state *s;
        const struct ipfw_track *t;

        ASSERT_NETISR_NCPUS(mycpuid);
        KASSERT(nm->state_cnt < nm->state_cntmax,
            ("invalid state count %d, max %d",
             nm->state_cnt, nm->state_cntmax));

        /* Phase 1: copy this cpu's dynamic states. */
        TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
                if (ipfw_state_copy(s, nm->ioc_state)) {
                        nm->ioc_state++;
                        nm->state_cnt++;
                        if (nm->state_cnt == nm->state_cntmax)
                                goto done;
                }
        }

        /*
         * Prepare tracks in the global track tree for userland.
         */
        TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
                struct ipfw_trkcnt *trk;

                if (t->t_count == NULL) /* anchor */
                        continue;
                trk = t->t_trkcnt;

                /*
                 * Only one netisr can run this function at
                 * any time, and only this function accesses
                 * trkcnt's tc_expire, so this is safe w/o
                 * ipfw_gd.ipfw_trkcnt_token.
                 */
                if (trk->tc_expire > t->t_expire)
                        continue;
                trk->tc_expire = t->t_expire;
        }

        /*
         * Copy tracks in the global track tree to userland in
         * the last netisr.
         */
        if (mycpuid == netisr_ncpus - 1) {
                struct ipfw_trkcnt *trk;

                KASSERT(nm->state_cnt < nm->state_cntmax,
                    ("invalid state count %d, max %d",
                     nm->state_cnt, nm->state_cntmax));

                /* The RB tree is shared; take the token for the walk. */
                IPFW_TRKCNT_TOKGET;
                RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
                        if (ipfw_track_copy(trk, nm->ioc_state)) {
                                nm->ioc_state++;
                                nm->state_cnt++;
                                if (nm->state_cnt == nm->state_cntmax) {
                                        IPFW_TRKCNT_TOKREL;
                                        goto done;
                                }
                        }
                }
                IPFW_TRKCNT_TOKREL;
        }
done:
        if (nm->state_cnt == nm->state_cntmax) {
                /* No more space; done. */
                netisr_replymsg(&nm->base, 0);
        } else {
                netisr_forwardmsg(&nm->base, mycpuid + 1);
        }
}
4802
/*
 * IP_FW_GET handler: copy the static rules, followed by the states
 * (tracks are reported as states for userland compatibility), into
 * the sockopt buffer.  Always returns 0; a too-small user buffer is
 * reported by handing back a zeroed buffer (historical behavior).
 */
static int
ipfw_ctl_get_rules(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;
	void *bp;
	size_t size;
	int state_cnt;

	ASSERT_NETISR0;

	/*
	 * pass up a copy of the current rules. Static rules
	 * come first (the last of which has number IPFW_DEFAULT_RULE),
	 * followed by a possibly empty list of states.
	 */

	size = static_ioc_len;	/* size of static rules */

	/*
	 * Size of the states.
	 * XXX take tracks as state for userland compat.
	 */
	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
	size += state_cnt * sizeof(struct ipfw_ioc_state);

	if (sopt->sopt_valsize < size) {
		/* short length, no need to return incomplete rules */
		/* XXX: if superuser, no need to zero buffer */
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return 0;
	}
	bp = sopt->sopt_val;

	/* Copy the static rules first. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		bp = ipfw_copy_rule(ctx, rule, bp);

	if (state_cnt) {
		struct netmsg_cpstate nm;
#ifdef INVARIANTS
		size_t old_size = size;
#endif

		/* Collect states/tracks from all netisr cpus. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
		nm.ioc_state = bp;
		nm.state_cntmax = state_cnt;
		nm.state_cnt = 0;
		netisr_domsg_global(&nm.base);

		/*
		 * The # of states may be shrinked after the snapshot
		 * of the state count was taken.  To give user a correct
		 * state count, nm->state_cnt is used to recalculate
		 * the actual size.
		 */
		size = static_ioc_len +
		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
		KKASSERT(size <= old_size);
	}

	sopt->sopt_valsize = size;
	return 0;
}
4868
4869 static void
4870 ipfw_set_disable_dispatch(netmsg_t nmsg)
4871 {
4872         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4873
4874         ASSERT_NETISR_NCPUS(mycpuid);
4875
4876         ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
4877         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4878 }
4879
4880 static void
4881 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
4882 {
4883         struct netmsg_base nmsg;
4884         uint32_t set_disable;
4885
4886         ASSERT_NETISR0;
4887
4888         /* IPFW_DEFAULT_SET is always enabled */
4889         enable |= (1 << IPFW_DEFAULT_SET);
4890         set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
4891
4892         bzero(&nmsg, sizeof(nmsg));
4893         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4894             ipfw_set_disable_dispatch);
4895         nmsg.lmsg.u.ms_result32 = set_disable;
4896
4897         netisr_domsg_global(&nmsg);
4898 }
4899
4900 static void
4901 ipfw_table_create_dispatch(netmsg_t nm)
4902 {
4903         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4904         int tblid = nm->lmsg.u.ms_result;
4905
4906         ASSERT_NETISR_NCPUS(mycpuid);
4907
4908         if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
4909             rn_cpumaskhead(mycpuid), 32))
4910                 panic("ipfw: create table%d failed", tblid);
4911
4912         netisr_forwardmsg(&nm->base, mycpuid + 1);
4913 }
4914
4915 static int
4916 ipfw_table_create(struct sockopt *sopt)
4917 {
4918         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4919         struct ipfw_ioc_table *tbl;
4920         struct netmsg_base nm;
4921
4922         ASSERT_NETISR0;
4923
4924         if (sopt->sopt_valsize != sizeof(*tbl))
4925                 return (EINVAL);
4926
4927         tbl = sopt->sopt_val;
4928         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
4929                 return (EINVAL);
4930
4931         if (ctx->ipfw_tables[tbl->tableid] != NULL)
4932                 return (EEXIST);
4933
4934         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4935             ipfw_table_create_dispatch);
4936         nm.lmsg.u.ms_result = tbl->tableid;
4937         netisr_domsg_global(&nm);
4938
4939         return (0);
4940 }
4941
4942 static void
4943 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
4944 {
4945         struct radix_node *ret;
4946
4947         ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
4948         if (ret != rn)
4949                 panic("deleted other table entry");
4950         kfree(ret, M_IPFW);
4951 }
4952
/*
 * rnh_walktree callback: unconditionally delete the visited entry.
 */
static int
ipfw_table_killent(struct radix_node *rn, void *xrnh)
{
	ipfw_table_killrn(xrnh, rn);
	return (0);
}
4960
4961 static void
4962 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
4963     int destroy)
4964 {
4965         struct radix_node_head *rnh;
4966
4967         ASSERT_NETISR_NCPUS(mycpuid);
4968
4969         rnh = ctx->ipfw_tables[tableid];
4970         rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
4971         if (destroy) {
4972                 Free(rnh);
4973                 ctx->ipfw_tables[tableid] = NULL;
4974         }
4975 }
4976
4977 static void
4978 ipfw_table_flush_dispatch(netmsg_t nmsg)
4979 {
4980         struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
4981         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4982
4983         ASSERT_NETISR_NCPUS(mycpuid);
4984
4985         ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
4986         netisr_forwardmsg(&nm->base, mycpuid + 1);
4987 }
4988
4989 static void
4990 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
4991 {
4992         int i;
4993
4994         ASSERT_NETISR_NCPUS(mycpuid);
4995
4996         for (i = 0; i < ipfw_table_max; ++i) {
4997                 if (ctx->ipfw_tables[i] != NULL)
4998                         ipfw_table_flush_oncpu(ctx, i, destroy);
4999         }
5000 }
5001
5002 static void
5003 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5004 {
5005         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5006
5007         ASSERT_NETISR_NCPUS(mycpuid);
5008
5009         ipfw_table_flushall_oncpu(ctx, 0);
5010         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5011 }
5012
5013 static int
5014 ipfw_table_flush(struct sockopt *sopt)
5015 {
5016         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5017         struct ipfw_ioc_table *tbl;
5018         struct netmsg_tblflush nm;
5019
5020         ASSERT_NETISR0;
5021
5022         if (sopt->sopt_valsize != sizeof(*tbl))
5023                 return (EINVAL);
5024
5025         tbl = sopt->sopt_val;
5026         if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5027                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5028                     MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5029                 netisr_domsg_global(&nm.base);
5030                 return (0);
5031         }
5032
5033         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5034                 return (EINVAL);
5035
5036         if (ctx->ipfw_tables[tbl->tableid] == NULL)
5037                 return (ENOENT);
5038
5039         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5040             ipfw_table_flush_dispatch);
5041         nm.tableid = tbl->tableid;
5042         nm.destroy = 0;
5043         if (sopt->sopt_name == IP_FW_TBL_DESTROY)
5044                 nm.destroy = 1;
5045         netisr_domsg_global(&nm.base);
5046
5047         return (0);
5048 }
5049
5050 static int
5051 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
5052 {
5053         int *cnt = xcnt;
5054
5055         (*cnt)++;
5056         return (0);
5057 }
5058
/*
 * rnh_walktree callback: convert one table entry, together with its
 * per-cpu siblings, into the next free ipfw_ioc_tblent slot of the
 * copy context.  Use counters are summed and the most recent
 * last-use time across all siblings is reported.
 */
static int
ipfw_table_cpent(struct radix_node *rn, void *xcp)
{
	struct ipfw_table_cp *cp = xcp;
	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
	struct ipfw_ioc_tblent *ioc_te;
#ifdef INVARIANTS
	int cnt;
#endif

	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
	    cp->te_idx, cp->te_cnt));
	ioc_te = &cp->te[cp->te_idx];

	/* Copy the netmask, if any (rn_mask's first byte is its length). */
	if (te->te_nodes->rn_mask != NULL) {
		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
		    *te->te_nodes->rn_mask);
	} else {
		ioc_te->netmask.sin_len = 0;
	}
	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));

	/* Aggregate statistics over the per-cpu sibling chain. */
	ioc_te->use = te->te_use;
	ioc_te->last_used = te->te_lastuse;
#ifdef INVARIANTS
	cnt = 1;
#endif

	while ((te = te->te_sibling) != NULL) {
#ifdef INVARIANTS
		++cnt;
#endif
		ioc_te->use += te->te_use;
		if (te->te_lastuse > ioc_te->last_used)
			ioc_te->last_used = te->te_lastuse;
	}
	/* Every netisr cpu must contribute exactly one sibling. */
	KASSERT(cnt == netisr_ncpus,
	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));

	cp->te_idx++;

	return (0);
}
5102
/*
 * IP_FW_TBL_GET handler.  With a negative tableid, return the list
 * of created table ids; otherwise dump all entries of the requested
 * table with per-cpu statistics aggregated by ipfw_table_cpent().
 * Counting and copying are done on this (netisr0) cpu's trees.
 */
static int
ipfw_table_get(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_ioc_table *tbl;
	struct ipfw_ioc_tblcont *cont;
	struct ipfw_table_cp cp;
	int cnt = 0, sz;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize < sizeof(*tbl))
		return (EINVAL);

	tbl = sopt->sopt_val;
	if (tbl->tableid < 0) {
		struct ipfw_ioc_tbllist *list;
		int i;

		/*
		 * List available table ids.
		 */
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL)
				++cnt;
		}

		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
		if (sopt->sopt_valsize < sz) {
			/* User buffer too small; zero it and bail. */
			bzero(sopt->sopt_val, sopt->sopt_valsize);
			return (E2BIG);
		}
		list = sopt->sopt_val;
		list->tablecnt = cnt;

		cnt = 0;
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL) {
				KASSERT(cnt < list->tablecnt,
				    ("invalid idx %d, cnt %d",
				     cnt, list->tablecnt));
				list->tables[cnt++] = i;
			}
		}
		sopt->sopt_valsize = sz;
		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* Count the entries first to size the output. */
	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);

	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
	if (sopt->sopt_valsize < sz) {
		/* User buffer too small; zero it and bail. */
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return (E2BIG);
	}
	cont = sopt->sopt_val;
	cont->entcnt = cnt;

	/* Copy the entries out. */
	cp.te = cont->ent;
	cp.te_idx = 0;
	cp.te_cnt = cnt;
	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);

	sopt->sopt_valsize = sz;
	return (0);
}
5175
/*
 * Per-cpu netisr handler to add an address/netmask to a table.
 * cpu0 runs first: a duplicate entry there is reported back to the
 * caller as EEXIST, while a failure on any later cpu means the
 * per-cpu radix trees went out of sync and is fatal.  The entries
 * created on the individual cpus are chained through te_sibling
 * (in cpu order) for statistics aggregation.
 */
static void
ipfw_table_add_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_tblent *te;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];

	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
	te->te_nodes->rn_key = (char *)&te->te_key;
	memcpy(&te->te_key, nm->key, sizeof(te->te_key));

	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
	    te->te_nodes) == NULL) {
		if (mycpuid == 0) {
			/* Entry already exists; report and stop. */
			kfree(te, M_IPFW);
			netisr_replymsg(&nm->base, EEXIST);
			return;
		}
		panic("rnh_addaddr failed");
	}

	/* Link siblings. */
	if (nm->sibling != NULL)
		nm->sibling->te_sibling = te;
	nm->sibling = te;

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
5209
5210 static void
5211 ipfw_table_del_dispatch(netmsg_t nmsg)
5212 {
5213         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5214         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5215         struct radix_node_head *rnh;
5216         struct radix_node *rn;
5217
5218         ASSERT_NETISR_NCPUS(mycpuid);
5219
5220         rnh = ctx->ipfw_tables[nm->tableid];
5221         rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
5222         if (rn == NULL) {
5223                 if (mycpuid == 0) {
5224                         netisr_replymsg(&nm->base, ESRCH);
5225                         return;
5226                 }
5227                 panic("rnh_deladdr failed");
5228         }
5229         kfree(rn, M_IPFW);
5230
5231         netisr_forwardmsg(&nm->base, mycpuid + 1);
5232 }
5233
5234 static int
5235 ipfw_table_alt(struct sockopt *sopt)
5236 {
5237         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5238         struct ipfw_ioc_tblcont *tbl;
5239         struct ipfw_ioc_tblent *te;
5240         struct sockaddr_in key0;
5241         struct sockaddr *netmask = NULL, *key;
5242         struct netmsg_tblent nm;
5243
5244         ASSERT_NETISR0;
5245
5246         if (sopt->sopt_valsize != sizeof(*tbl))
5247                 return (EINVAL);
5248         tbl = sopt->sopt_val;
5249
5250         if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
5251                 return (EINVAL);
5252         if (tbl->entcnt != 1)
5253                 return (EINVAL);
5254
5255         if (ctx->ipfw_tables[tbl->tableid] == NULL)
5256                 return (ENOENT);
5257         te = &tbl->ent[0];
5258
5259         if (te->key.sin_family != AF_INET ||
5260             te->key.sin_port != 0 ||
5261             te->key.sin_len != sizeof(struct sockaddr_in))
5262                 return (EINVAL);
5263         key = (struct sockaddr *)&te->key;
5264
5265         if (te->netmask.sin_len != 0) {
5266                 if (te->netmask.sin_port != 0 ||
5267                     te->netmask.sin_len > sizeof(struct sockaddr_in))
5268                         return (EINVAL);
5269                 netmask = (struct sockaddr *)&te->netmask;
5270                 sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
5271                 key = (struct sockaddr *)&key0;
5272         }
5273
5274         if (sopt->sopt_name == IP_FW_TBL_ADD) {
5275                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5276                     MSGF_PRIORITY, ipfw_table_add_dispatch);
5277         } else {
5278                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5279                     MSGF_PRIORITY, ipfw_table_del_dispatch);
5280         }
5281         nm.key = key;
5282         nm.netmask = netmask;
5283         nm.tableid = tbl->tableid;
5284         nm.sibling = NULL;
5285         return (netisr_domsg_global(&nm.base));
5286 }
5287
5288 static int
5289 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
5290 {
5291         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5292
5293         te->te_use = 0;
5294         te->te_lastuse = 0;
5295         return (0);
5296 }
5297
5298 static void
5299 ipfw_table_zero_dispatch(netmsg_t nmsg)
5300 {
5301         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5302         struct radix_node_head *rnh;
5303
5304         ASSERT_NETISR_NCPUS(mycpuid);
5305
5306         rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
5307         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5308
5309         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5310 }
5311
5312 static void
5313 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
5314 {
5315         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5316         int i;
5317
5318         ASSERT_NETISR_NCPUS(mycpuid);
5319
5320         for (i = 0; i < ipfw_table_max; ++i) {
5321                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
5322
5323                 if (rnh != NULL)
5324                         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5325         }
5326         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5327 }
5328
5329 static int
5330 ipfw_table_zero(struct sockopt *sopt)
5331 {
5332         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5333         struct netmsg_base nm;
5334         struct ipfw_ioc_table *tbl;
5335
5336         ASSERT_NETISR0;
5337
5338         if (sopt->sopt_valsize != sizeof(*tbl))
5339                 return (EINVAL);
5340         tbl = sopt->sopt_val;
5341
5342         if (tbl->tableid < 0) {
5343                 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5344                     ipfw_table_zeroall_dispatch);
5345                 netisr_domsg_global(&nm);
5346                 return (0);
5347         } else if (tbl->tableid >= ipfw_table_max) {
5348                 return (EINVAL);
5349         } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
5350                 return (ENOENT);
5351         }
5352
5353         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5354             ipfw_table_zero_dispatch);
5355         nm.lmsg.u.ms_result = tbl->tableid;
5356         netisr_domsg_global(&nm);
5357
5358         return (0);
5359 }
5360
5361 static int
5362 ipfw_table_killexp(struct radix_node *rn, void *xnm)
5363 {
5364         struct netmsg_tblexp *nm = xnm;
5365         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5366
5367         if (te->te_expired) {
5368                 ipfw_table_killrn(nm->rnh, rn);
5369                 nm->expcnt++;
5370         }
5371         return (0);
5372 }
5373
/*
 * Per-cpu netisr handler that deletes the table entries previously
 * marked expired by ipfw_table_markexp().  Every cpu must delete
 * exactly nm->cnt entries, so the running total nm->expcnt has to
 * equal nm->cnt * (mycpuid + 1) once this cpu's walk is done.
 */
static void
ipfw_table_expire_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];
	nm->rnh = rnh;	/* consumed by ipfw_table_killexp() */
	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);

	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm->cnt * (mycpuid + 1), nm->expcnt));

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
5393
/*
 * Per-cpu netisr handler that deletes the marked-expired entries of
 * all tables on this cpu.  As with the single-table variant, every
 * cpu deletes exactly nm->cnt entries in total, so nm->expcnt must
 * reach nm->cnt * (mycpuid + 1) after this cpu's walks.
 */
static void
ipfw_table_expireall_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int i;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (i = 0; i < ipfw_table_max; ++i) {
		struct radix_node_head *rnh = ctx->ipfw_tables[i];

		if (rnh == NULL)
			continue;
		nm->rnh = rnh;	/* consumed by ipfw_table_killexp() */
		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
	}

	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm->cnt * (mycpuid + 1), nm->expcnt));

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
5418
5419 static int
5420 ipfw_table_markexp(struct radix_node *rn, void *xnm)
5421 {
5422         struct netmsg_tblexp *nm = xnm;
5423         struct ipfw_tblent *te;
5424         time_t lastuse;
5425
5426         te = (struct ipfw_tblent *)rn;
5427         lastuse = te->te_lastuse;
5428
5429         while ((te = te->te_sibling) != NULL) {
5430                 if (te->te_lastuse > lastuse)
5431                         lastuse = te->te_lastuse;
5432         }
5433         if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
5434                 /* Not expired */
5435                 return (0);
5436         }
5437
5438         te = (struct ipfw_tblent *)rn;
5439         te->te_expired = 1;
5440         while ((te = te->te_sibling) != NULL)
5441                 te->te_expired = 1;
5442         nm->cnt++;
5443
5444         return (0);
5445 }
5446
/*
 * IP_FW_TBL_EXPIRE handler.  Phase 1 (on netisr0's trees): mark,
 * through the per-cpu sibling chains, every entry whose most recent
 * use is at least tbl->expire seconds old.  Phase 2: run a message
 * through all netisr cpus to delete the marked entries.  A negative
 * tableid expires all tables.  The number of expired addresses is
 * reported back in tbl->expcnt.
 */
static int
ipfw_table_expire(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_tblexp nm;
	struct ipfw_ioc_tblexp *tbl;
	struct radix_node_head *rnh;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;
	tbl->expcnt = 0;

	nm.expcnt = 0;		/* total deletions across all cpus */
	nm.cnt = 0;		/* # of sibling chains marked expired */
	nm.expire = tbl->expire;

	if (tbl->tableid < 0) {
		int i;

		/* Phase 1: mark expired entries in every table. */
		for (i = 0; i < ipfw_table_max; ++i) {
			rnh = ctx->ipfw_tables[i];
			if (rnh == NULL)
				continue;
			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
		}
		if (nm.cnt == 0) {
			/* No addresses can be expired. */
			return (0);
		}
		tbl->expcnt = nm.cnt;

		/* Phase 2: delete the marked entries on every cpu. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
		nm.tableid = -1;
		netisr_domsg_global(&nm.base);
		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
		    ("not all expired addresses (%d) were deleted (%d)",
		     nm.cnt * netisr_ncpus, nm.expcnt));

		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* Phase 1: mark expired entries in the requested table. */
	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
	if (nm.cnt == 0) {
		/* No addresses can be expired. */
		return (0);
	}
	tbl->expcnt = nm.cnt;

	/* Phase 2: delete the marked entries on every cpu. */
	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_table_expire_dispatch);
	nm.tableid = tbl->tableid;
	netisr_domsg_global(&nm.base);
	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm.cnt * netisr_ncpus, nm.expcnt));
	return (0);
}
5513
5514 /*
5515  * {set|get}sockopt parser.
5516  */
5517 static int
5518 ipfw_ctl(struct sockopt *sopt)
5519 {
5520         int error, rulenum;
5521         uint32_t *masks;
5522         size_t size;
5523
5524         ASSERT_NETISR0;
5525
5526         error = 0;
5527
5528         switch (sopt->sopt_name) {
5529         case IP_FW_GET:
5530                 error = ipfw_ctl_get_rules(sopt);
5531                 break;
5532
5533         case IP_FW_FLUSH:
5534                 ipfw_flush(0 /* keep default rule */);
5535                 break;
5536
5537         case IP_FW_ADD:
5538                 error = ipfw_ctl_add_rule(sopt);
5539                 break;
5540
5541         case IP_FW_DEL:
5542                 /*
5543                  * IP_FW_DEL is used for deleting single rules or sets,
5544                  * and (ab)used to atomically manipulate sets.
5545                  * Argument size is used to distinguish between the two:
5546                  *    sizeof(uint32_t)
5547                  *      delete single rule or set of rules,
5548                  *      or reassign rules (or sets) to a different set.
5549                  *    2 * sizeof(uint32_t)
5550                  *      atomic disable/enable sets.
5551                  *      first uint32_t contains sets to be disabled,
5552                  *      second uint32_t contains sets to be enabled.
5553                  */
5554                 masks = sopt->sopt_val;
5555                 size = sopt->sopt_valsize;
5556                 if (size == sizeof(*masks)) {
5557                         /*
5558                          * Delete or reassign static rule
5559                          */
5560                         error = ipfw_ctl_alter(masks[0]);
5561                 } else if (size == (2 * sizeof(*masks))) {
5562                         /*
5563                          * Set enable/disable
5564                          */
5565                         ipfw_ctl_set_disable(masks[0], masks[1]);
5566                 } else {
5567                         error = EINVAL;
5568                 }
5569                 break;
5570
5571         case IP_FW_ZERO:
5572         case IP_FW_RESETLOG: /* argument is an int, the rule number */
5573                 rulenum = 0;
5574
5575                 if (sopt->sopt_val != 0) {
5576                     error = soopt_to_kbuf(sopt, &rulenum,
5577                             sizeof(int), sizeof(int));
5578                     if (error)
5579                         break;
5580                 }
5581                 error = ipfw_ctl_zero_entry(rulenum,
5582                         sopt->sopt_name == IP_FW_RESETLOG);
5583                 break;
5584
5585         case IP_FW_TBL_CREATE:
5586                 error = ipfw_table_create(sopt);
5587                 break;
5588
5589         case IP_FW_TBL_ADD:
5590         case IP_FW_TBL_DEL:
5591                 error = ipfw_table_alt(sopt);
5592                 break;
5593
5594         case IP_FW_TBL_FLUSH:
5595         case IP_FW_TBL_DESTROY:
5596                 error = ipfw_table_flush(sopt);
5597                 break;
5598
5599         case IP_FW_TBL_GET:
5600                 error = ipfw_table_get(sopt);
5601                 break;
5602
5603         case IP_FW_TBL_ZERO:
5604                 error = ipfw_table_zero(sopt);
5605                 break;
5606
5607         case IP_FW_TBL_EXPIRE:
5608                 error = ipfw_table_expire(sopt);
5609                 break;
5610
5611         default:
5612                 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
5613                 error = EINVAL;
5614         }
5615         return error;
5616 }
5617
5618 static void
5619 ipfw_keepalive_done(struct ipfw_context *ctx)
5620 {
5621
5622         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5623             ("keepalive is not in progress"));
5624         ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
5625         callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
5626             ipfw_keepalive, NULL);
5627 }
5628
5629 static void
5630 ipfw_keepalive_more(struct ipfw_context *ctx)
5631 {
5632         struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
5633
5634         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5635             ("keepalive is not in progress"));
5636         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
5637             ("keepalive more did not finish"));
5638         netisr_sendmsg_oncpu(nm);
5639 }
5640
/*
 * Walk the state list starting at the anchor: expire dead states and
 * transmit keepalive probes for established TCP states that are close
 * to expiring.  The anchor is re-inserted after each visited state so
 * the walk can be suspended (via ipfw_keepalive_more()) and resumed
 * later without monopolizing the netisr.
 */
static void
ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0, kept = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		uint32_t ack_rev, ack_fwd;
		struct ipfw_flow_id id;

		if (scanned++ >= ipfw_state_scan_max) {
			/* Scanned too many states; take a break. */
			ipfw_keepalive_more(ctx);
			return;
		}

		/* Move the anchor past the state being visited. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (TIME_LEQ(s->st_expire, time_uptime)) {
			/* State expired. */
			ipfw_state_del(ctx, s);
			if (++expired >= ipfw_state_expire_max) {
				/* Expired too many states; take a break. */
				ipfw_keepalive_more(ctx);
				return;
			}
			continue;
		}

		/*
		 * Keep alive processing
		 */

		if (s->st_proto != IPPROTO_TCP)
			continue;
		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
			continue;
		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
		    s->st_expire))
			continue;	/* too early */

		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
		    &id.dst_ip, &id.dst_port);
		ack_rev = s->st_ack_rev;
		ack_fwd = s->st_ack_fwd;

		/* Send a probe segment towards each endpoint. */
		send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
		send_pkt(&id, ack_fwd - 1, ack_rev, 0);

		if (++kept >= ipfw_keepalive_max) {
			/* Sent too many keepalives; take a break. */
			ipfw_keepalive_more(ctx);
			return;
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_done(ctx);
}
5703
5704 static void
5705 ipfw_keepalive_more_dispatch(netmsg_t nm)
5706 {
5707         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5708         struct ipfw_state *anchor;
5709
5710         ASSERT_NETISR_NCPUS(mycpuid);
5711         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5712             ("keepalive is not in progress"));
5713
5714         /* Reply ASAP */
5715         netisr_replymsg(&nm->base, 0);
5716
5717         anchor = &ctx->ipfw_keepalive_anch;
5718         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
5719                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
5720                 ipfw_keepalive_done(ctx);
5721                 return;
5722         }
5723         ipfw_keepalive_loop(ctx, anchor);
5724 }
5725
5726 /*
5727  * This procedure is only used to handle keepalives. It is invoked
5728  * every dyn_keepalive_period
5729  */
5730 static void
5731 ipfw_keepalive_dispatch(netmsg_t nm)
5732 {
5733         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5734         struct ipfw_state *anchor;
5735
5736         ASSERT_NETISR_NCPUS(mycpuid);
5737         KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
5738             ("keepalive is in progress"));
5739         ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
5740
5741         /* Reply ASAP */
5742         crit_enter();
5743         netisr_replymsg(&nm->base, 0);
5744         crit_exit();
5745
5746         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
5747                 ipfw_keepalive_done(ctx);
5748                 return;
5749         }
5750
5751         anchor = &ctx->ipfw_keepalive_anch;
5752         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
5753         ipfw_keepalive_loop(ctx, anchor);
5754 }
5755
5756 /*
5757  * This procedure is only used to handle keepalives. It is invoked
5758  * every dyn_keepalive_period
5759  */
5760 static void
5761 ipfw_keepalive(void *dummy __unused)
5762 {
5763         struct netmsg_base *msg;
5764
5765         KKASSERT(mycpuid < netisr_ncpus);
5766         msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
5767
5768         crit_enter();
5769         if (msg->lmsg.ms_flags & MSGF_DONE)
5770                 netisr_sendmsg_oncpu(msg);
5771         crit_exit();
5772 }
5773
/*
 * pfil(9) input hook: run an inbound IP packet through the firewall.
 *
 * Returns 0 when the packet may continue (or was consumed without
 * error) with *m0 set to the possibly-modified mbuf; returns EACCES
 * with *m0 == NULL when the packet was dropped.
 */
static int
ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	struct m_tag *mtag;
	int tee = 0, error = 0, ret;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		/* Resume rule processing from the rule dummynet saved. */
		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args.rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else {
		args.rule = NULL;
	}

	args.eh = NULL;
	args.oif = NULL;	/* inbound: no output interface */
	args.m = m;
	ret = ipfw_chk(&args);
	m = args.m;		/* ipfw_chk() may replace or free the mbuf */

	if (m == NULL) {
		error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		/* Send packet to the appropriate pipe */
		/*
		 * NOTE(review): m is still stored to *m0 below; confirm
		 * ipfw_dummynet_io() does not consume the mbuf here.
		 */
		ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		/*
		 * Must clear bridge tag when changing
		 */
		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
		if (ip_divert_p != NULL) {
			/* incoming == 1; cf. ipfw_check_out which passes 0 */
			m = ip_divert_p(m, tee, 1);
		} else {
			/* Divert support not loaded; drop the packet. */
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}
5847
/*
 * pfil(9) output hook: run an outbound IP packet through the firewall.
 *
 * Returns 0 when the packet may continue (or was consumed without
 * error) with *m0 set to the possibly-modified mbuf; returns EACCES
 * with *m0 == NULL when the packet was dropped.
 */
static int
ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	struct m_tag *mtag;
	int tee = 0, error = 0, ret;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		/* Resume rule processing from the rule dummynet saved. */
		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args.rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else {
		args.rule = NULL;
	}

	args.eh = NULL;
	args.m = m;
	args.oif = ifp;		/* outbound: the output interface */
	ret = ipfw_chk(&args);
	m = args.m;		/* ipfw_chk() may replace or free the mbuf */

	if (m == NULL) {
		error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		/*
		 * NOTE(review): m is still stored to *m0 below; confirm
		 * ipfw_dummynet_io() does not consume the mbuf here.
		 */
		ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		if (ip_divert_p != NULL) {
			/* incoming == 0; cf. ipfw_check_in which passes 1 */
			m = ip_divert_p(m, tee, 0);
		} else {
			/* Divert support not loaded; drop the packet. */
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}
5916
5917 static void
5918 ipfw_hook(void)
5919 {
5920         struct pfil_head *pfh;
5921
5922         ASSERT_NETISR0;
5923
5924         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
5925         if (pfh == NULL)
5926                 return;
5927
5928         pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
5929         pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
5930 }
5931
5932 static void
5933 ipfw_dehook(void)
5934 {
5935         struct pfil_head *pfh;
5936
5937         ASSERT_NETISR0;
5938
5939         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
5940         if (pfh == NULL)
5941                 return;
5942
5943         pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
5944         pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
5945 }
5946
5947 static int
5948 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
5949 {
5950         int dyn_cnt;
5951
5952         dyn_cnt = ipfw_state_cntcoll();
5953         dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
5954
5955         return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
5956 }
5957
5958 static int
5959 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
5960 {
5961         int state_cnt;
5962
5963         state_cnt = ipfw_state_cntcoll();
5964         return (sysctl_handle_int(oidp, &state_cnt, 0, req));
5965 }
5966
5967 static int
5968 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
5969 {
5970         int state_max, error;
5971
5972         state_max = ipfw_state_max;
5973         error = sysctl_handle_int(oidp, &state_max, 0, req);
5974         if (error || req->newptr == NULL)
5975                 return (error);
5976
5977         if (state_max < 1)
5978                 return (EINVAL);
5979
5980         ipfw_state_max_set(state_max);
5981         return (0);
5982 }
5983
5984 static int
5985 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
5986 {
5987         int dyn_max, error;
5988
5989         dyn_max = ipfw_state_max + ipfw_track_max;
5990
5991         error = sysctl_handle_int(oidp, &dyn_max, 0, req);
5992         if (error || req->newptr == NULL)
5993                 return (error);
5994
5995         if (dyn_max < 2)
5996                 return (EINVAL);
5997
5998         ipfw_state_max_set(dyn_max / 2);
5999         ipfw_track_max = dyn_max / 2;
6000         return (0);
6001 }
6002
6003 static void
6004 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
6005 {
6006         int enable = nmsg->lmsg.u.ms_result;
6007
6008         ASSERT_NETISR0;
6009
6010         if (fw_enable == enable)
6011                 goto reply;
6012
6013         fw_enable = enable;
6014         if (fw_enable)
6015                 ipfw_hook();
6016         else
6017                 ipfw_dehook();
6018 reply:
6019         netisr_replymsg(&nmsg->base, 0);
6020 }
6021
6022 static int
6023 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
6024 {
6025         struct netmsg_base nmsg;
6026         int enable, error;
6027
6028         enable = fw_enable;
6029         error = sysctl_handle_int(oidp, &enable, 0, req);
6030         if (error || req->newptr == NULL)
6031                 return error;
6032
6033         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6034             ipfw_sysctl_enable_dispatch);
6035         nmsg.lmsg.u.ms_result = enable;
6036
6037         return netisr_domsg(&nmsg, 0);
6038 }
6039
/*
 * sysctl handler: clamp the rule auto-increment step to
 * [IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX].
 */
static int
ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
{
	return sysctl_int_range(oidp, arg1, arg2, req,
	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
}
6046
/* sysctl handler: scan/expire count knobs must stay positive. */
static int
ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
{

	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
}
6053
6054 static int
6055 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
6056 {
6057         u_long stat = 0;
6058         int cpu, error;
6059
6060         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6061                 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
6062
6063         error = sysctl_handle_long(oidp, &stat, 0, req);
6064         if (error || req->newptr == NULL)
6065                 return (error);
6066
6067         /* Zero out this stat. */
6068         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6069                 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
6070         return (0);
6071 }
6072
/*
 * Per-cpu context creation, run on each netisr cpu in turn via
 * netisr_forwardmsg().  Allocates this cpu's ipfw_context (including
 * ipfw_table_max table roots), initializes its state/track trees,
 * expire/keepalive callouts and netmsgs, and installs the per-cpu
 * copy of the default rule.
 */
static void
ipfw_ctx_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx;
	struct ip_fw *def_rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Variable-size allocation: the table array is the last member. */
	ctx = kmalloc(__offsetof(struct ipfw_context,
	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);

	RB_INIT(&ctx->ipfw_state_tree);
	TAILQ_INIT(&ctx->ipfw_state_list);

	RB_INIT(&ctx->ipfw_track_tree);
	TAILQ_INIT(&ctx->ipfw_track_list);

	/* State expiration callout, netmsgs and scan anchor. */
	callout_init_mp(&ctx->ipfw_stateto_ch);
	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);

	/* Track expiration callout and netmsgs. */
	callout_init_mp(&ctx->ipfw_trackto_ch);
	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);

	/* TCP keepalive callout, netmsgs and scan anchor. */
	callout_init_mp(&ctx->ipfw_keepalive_ch);
	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);

	ipfw_ctx[mycpuid] = ctx;

	/* Each cpu gets its own copy of the default rule. */
	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);

	def_rule->act_ofs = 0;
	def_rule->rulenum = IPFW_DEFAULT_RULE;
	def_rule->cmd_len = 1;
	def_rule->set = IPFW_DEFAULT_SET;

	def_rule->cmd[0].len = 1;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	def_rule->cmd[0].opcode = O_ACCEPT;
#else
	if (filters_default_to_accept)
		def_rule->cmd[0].opcode = O_ACCEPT;
	else
		def_rule->cmd[0].opcode = O_DENY;
#endif

	def_rule->refcnt = 1;
	def_rule->cpuid = mycpuid;

	/* Install the default rule */
	ctx->ipfw_default_rule = def_rule;
	ctx->ipfw_layer3_chain = def_rule;

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, def_rule);

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_inc_static_count(def_rule);

	/* Continue on the next cpu. */
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
6146
/*
 * One-shot firewall initialization, run in netisr0: set up global
 * state, create the per-cpu contexts, publish the firewall entry
 * points and start the per-cpu expire/keepalive callouts.
 */
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	/* Table ids are 16 bits; clamp bogus configured values. */
	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
		ipfw_table_max = UINT16_MAX;

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	/* Apply the configured state limit and per-cpu headroom. */
	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	/* Create the per-cpu contexts on every netisr cpu. */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	/* Publish the firewall entry points. */
	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	kprintf("ipfw2 initialized, default to %s, logging ",
		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
		O_ACCEPT ? "accept" : "deny");

#ifdef IPFIREWALL_VERBOSE
	fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
	if (fw_verbose == 0) {
		kprintf("disabled\n");
	} else if (verbose_limit == 0) {
		kprintf("unlimited\n");
	} else {
		kprintf("limited to %d packets/entry by default\n",
			verbose_limit);
	}

	ip_fw_loaded = 1;
	/* Start the periodic state/track expire and keepalive scans. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
		    ipfw_state_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
		    ipfw_track_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
		    ipfw_keepalive, NULL, cpu);
	}

	if (fw_enable)
		ipfw_hook();
reply:
	netisr_replymsg(&nmsg->base, error);
}
6214
6215 static int
6216 ipfw_init(void)
6217 {
6218         struct netmsg_base smsg;
6219
6220         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6221             ipfw_init_dispatch);
6222         return netisr_domsg(&smsg, 0);
6223 }
6224
6225 #ifdef KLD_MODULE
6226
/*
 * Per-cpu teardown, run on each netisr cpu in turn via
 * netisr_forwardmsg(): stop this cpu's callouts, drop any queued
 * expire/keepalive netmsgs and flush the per-cpu tables.
 */
static void
ipfw_ctx_fini_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	callout_stop_sync(&ctx->ipfw_stateto_ch);
	callout_stop_sync(&ctx->ipfw_trackto_ch);
	callout_stop_sync(&ctx->ipfw_keepalive_ch);

	/* Drop netmsgs possibly left queued by the stopped callouts. */
	crit_enter();
	netisr_dropmsg(&ctx->ipfw_stateexp_more);
	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
	netisr_dropmsg(&ctx->ipfw_trackexp_more);
	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
	netisr_dropmsg(&ctx->ipfw_keepalive_more);
	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
	crit_exit();

	ipfw_table_flushall_oncpu(ctx, 1);

	/* Continue on the next cpu. */
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
6251
/*
 * Firewall unload, run in netisr0.  Fails with EBUSY while other
 * components still hold a reference; otherwise unhooks pfil, runs the
 * per-cpu teardown, unpublishes the entry points and frees all rules
 * and contexts.
 */
static void
ipfw_fini_dispatch(netmsg_t nmsg)
{
	struct netmsg_base nm;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (ipfw_gd.ipfw_refcnt != 0) {
		error = EBUSY;
		goto reply;
	}

	ip_fw_loaded = 0;
	ipfw_dehook();

	/* Synchronize any inflight state/track expire IPIs. */
	lwkt_synchronize_ipiqs("ipfwfini");

	/* Run the per-cpu teardown on every netisr cpu. */
	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_fini_dispatch);
	netisr_domsg_global(&nm);

	/* Unpublish the firewall entry points. */
	ip_fw_chk_ptr = NULL;
	ip_fw_ctl_ptr = NULL;
	ip_fw_dn_io_ptr = NULL;
	ipfw_flush(1 /* kill default rule */);

	/* Free pre-cpu context */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		kfree(ipfw_ctx[cpu], M_IPFW);

	kprintf("IP firewall unloaded\n");
reply:
	netisr_replymsg(&nmsg->base, error);
}
6288
6289 static int
6290 ipfw_fini(void)
6291 {
6292         struct netmsg_base smsg;
6293
6294         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6295             ipfw_fini_dispatch);
6296         return netisr_domsg(&smsg, 0);
6297 }
6298
6299 #endif  /* KLD_MODULE */
6300
6301 static int
6302 ipfw_modevent(module_t mod, int type, void *unused)
6303 {
6304         int err = 0;
6305
6306         switch (type) {
6307         case MOD_LOAD:
6308                 err = ipfw_init();
6309                 break;
6310
6311         case MOD_UNLOAD:
6312 #ifndef KLD_MODULE
6313                 kprintf("ipfw statically compiled, cannot unload\n");
6314                 err = EBUSY;
6315 #else
6316                 err = ipfw_fini();
6317 #endif
6318                 break;
6319         default:
6320                 break;
6321         }
6322         return err;
6323 }
6324
/* Module glue: register the ipfw module and its event handler. */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);