Merge branch 'vendor/LIBRESSL'
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79
80 #include <net/ipfw/ip_fw2.h>
81
82 #ifdef IPFIREWALL_DEBUG        /* debug build: DPRINTF() prints through kprintf() when fw_debug > 0 */
83 #define DPRINTF(fmt, ...) \
84 do { \
85         if (fw_debug > 0) \
86                 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else   /* !IPFIREWALL_DEBUG: DPRINTF() expands to a no-op expression */
89 #define DPRINTF(fmt, ...)       ((void)0)
90 #endif  /* IPFIREWALL_DEBUG; the debug variant needs at least one argument after fmt */
91
92 /*
93  * Description about per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * Following graph shows how operation on per-CPU rule list is
99  * performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the copies of the current rule
138  *    on the other CPUs can safely be read (read-only) through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of relocating the rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
153
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states we do not create any more.  This is done to
170  * avoid consuming too much memory, but also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks we do not create any more.  This is done to
203  * avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers, track counter RB tree is
206  * shared between CPUs, track RB tree is per-cpu.  States generated by
207  * 'limit' option are linked to the track in addition to the per-cpu
208  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
243
244 #define IPFW_AUTOINC_STEP_MIN   1       /* presumably the smallest accepted autoinc_step -- enforcement is not in view */
245 #define IPFW_AUTOINC_STEP_MAX   1000    /* presumably the largest accepted autoinc_step -- enforcement is not in view */
246 #define IPFW_AUTOINC_STEP_DEF   100     /* initial value of autoinc_step */
247
248 #define IPFW_TABLE_MAX_DEF      64      /* initial value of ipfw_table_max */
249
250 #define IPFW_DEFAULT_RULE       65535   /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET        31      /* set number for the default rule */
252
253 #define MATCH_REVERSE           0       /* MATCH_*: match direction/result codes (e.g. kept in ipfw_xlat.xlat_dir) */
254 #define MATCH_FORWARD           1
255 #define MATCH_NONE              2
256 #define MATCH_UNKNOWN           3
257
258 #define TIME_LEQ(a, b)          ((a) - (b) <= 0)        /* true when a is not after b; tests the sign of the difference */
259
260 #define IPFW_STATE_TCPFLAGS     (TH_SYN | TH_FIN | TH_RST)      /* presumably the TCP flags tracked in st_state -- TODO confirm */
261 #define IPFW_STATE_TCPSTATES    (IPFW_STATE_TCPFLAGS |  \
262                                  (IPFW_STATE_TCPFLAGS << 8))
263 /* BOTH_*: the same TCP flag in bits 0-7 and again shifted left by 8 -- presumably one byte per direction; TODO confirm. */
264 #define BOTH_SYN                (TH_SYN | (TH_SYN << 8))
265 #define BOTH_FIN                (TH_FIN | (TH_FIN << 8))
266 #define BOTH_RST                (TH_RST | (TH_RST << 8))
267 /* TH_ACK here means FIN was ACKed. */
268 #define BOTH_FINACK             (TH_ACK | (TH_ACK << 8))
269 /* Nonzero for a TCP state with an RST bit set in either byte of st_state, or the FIN-was-ACKed bit set in both. */
270 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP &&        \
271                                  (((s)->st_state & BOTH_RST) ||         \
272                                   ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
273
274 #define O_ANCHOR                O_NOP   /* st_type value tested by IPFW_STATE_SCANSKIP() to recognise anchor states */
275 /* IPFW_XLAT_INVALID() casts the state to struct ipfw_xlat, relying on xlat_st being that struct's first field. */
276 #define IPFW_ISXLAT(type)       ((type) == O_REDIRECT)
277 #define IPFW_XLAT_INVALID(s)    (IPFW_ISXLAT((s)->st_type) &&   \
278                                  ((struct ipfw_xlat *)(s))->xlat_invalid)
279 /* Aliases for FW_MBUF_PRIVATE1/2; NOTE(review): presumably mbuf flag bits used by the translation path -- confirm. */
280 #define IPFW_MBUF_XLATINS       FW_MBUF_PRIVATE1
281 #define IPFW_MBUF_XLATFWD       FW_MBUF_PRIVATE2
282 /* IPFW_XLATE_*: distinct single-bit values, so they can be OR-ed together; their consumers are not in view. */
283 #define IPFW_XLATE_INSERT       0x0001
284 #define IPFW_XLATE_FORWARD      0x0002
285 #define IPFW_XLATE_OUTPUT       0x0004
286
287 struct netmsg_ipfw {   /* netmsg carrying a rule and its chain position; presumably the rule-insertion message described above -- TODO confirm */
288         struct netmsg_base      base;
289         const struct ipfw_ioc_rule *ioc_rule;   /* not modifiable through this pointer (const) */
290         struct ip_fw            *next_rule;     /* cf. next_rule/prev_rule in the rule insertion notes above */
291         struct ip_fw            *prev_rule;
292         struct ip_fw            *sibling;
293         uint32_t                rule_flags;
294         struct ip_fw            **cross_rules;
295 };
296
297 struct netmsg_del {    /* netmsg naming rules by number and a from/to set pair; presumably for rule deletion (per the name) -- TODO confirm */
298         struct netmsg_base      base;
299         struct ip_fw            *start_rule;
300         struct ip_fw            *prev_rule;
301         uint16_t                rulenum;
302         uint8_t                 from_set;
303         uint8_t                 to_set;
304 };
305
306 struct netmsg_zent {   /* NOTE(review): presumably the netmsg for zeroing rule counters ("zero entry") -- confirm against its users */
307         struct netmsg_base      base;
308         struct ip_fw            *start_rule;
309         uint16_t                rulenum;
310         uint16_t                log_only;       /* presumably restricts the operation to log counters -- TODO confirm */
311 };
312
313 struct netmsg_cpstate {        /* NOTE(review): presumably the netmsg for copying states into ioc_state -- confirm against its users */
314         struct netmsg_base      base;
315         struct ipfw_ioc_state   *ioc_state;
316         int                     state_cntmax;   /* presumably the capacity of ioc_state -- TODO confirm */
317         int                     state_cnt;      /* presumably the number of states copied so far -- TODO confirm */
318 };
319
320 struct netmsg_tblent { /* netmsg describing one table entry (key/netmask) of table 'tableid'; the operation performed is not in view */
321         struct netmsg_base      base;
322         struct sockaddr         *key;
323         struct sockaddr         *netmask;
324         struct ipfw_tblent      *sibling;
325         int                     tableid;
326 };
327
328 struct netmsg_tblflush {       /* netmsg naming a table to flush (per the name); handler is not in view */
329         struct netmsg_base      base;
330         int                     tableid;
331         int                     destroy;        /* presumably nonzero to also destroy the table -- TODO confirm */
332 };
333
334 struct netmsg_tblexp { /* NOTE(review): presumably the netmsg for expiring table entries -- confirm against its users */
335         struct netmsg_base      base;
336         time_t                  expire;
337         int                     tableid;
338         int                     cnt;
339         int                     expcnt;
340         struct radix_node_head  *rnh;
341 };
342
343 struct ipfw_table_cp { /* cursor over an array of ipfw_ioc_tblent -- presumably used while copying table entries out; TODO confirm */
344         struct ipfw_ioc_tblent  *te;
345         int                     te_idx;         /* presumably the next index into te[] -- TODO confirm */
346         int                     te_cnt;         /* presumably the number of entries te[] can hold -- TODO confirm */
347 };
348
349 struct ip_fw_local {   /* values taken from the packet: fragment offset, protocol, ports, addresses, length, TCP header pointer */
350         /*
351          * offset       The offset of a fragment. offset != 0 means that
352          *      we have a fragment at this offset of an IPv4 packet.
353          *      offset == 0 means that (if this is an IPv4 packet)
354          *      this is the first or only fragment.
355          */
356         u_short                 offset;
357
358         /*
359          * Local copies of addresses. They are only valid if we have
360          * an IP packet.
361          *
362          * proto        The protocol. Set to 0 for non-ip packets,
363          *      or to the protocol read from the packet otherwise.
364          *      proto != 0 means that we have an IPv4 packet.
365          *
366          * src_port, dst_port   port numbers, in HOST format. Only
367          *      valid for TCP and UDP packets.
368          *
369          * src_ip, dst_ip       ip addresses, in NETWORK format.
370          *      Only valid for IPv4 packets.
371          */
372         uint8_t                 proto;
373         uint16_t                src_port;       /* NOTE: host format    */
374         uint16_t                dst_port;       /* NOTE: host format    */
375         struct in_addr          src_ip;         /* NOTE: network format */
376         struct in_addr          dst_ip;         /* NOTE: network format */
377         uint16_t                ip_len;         /* NOTE(review): byte order is not stated here -- confirm */
378         struct tcphdr           *tcp;           /* presumably the packet's TCP header, when there is one -- TODO confirm */
379 };
380
381 struct ipfw_addrs {    /* address pair; overlaid by the 64-bit addr_u.value in struct ipfw_key */
382         uint32_t                addr1;  /* host byte order */
383         uint32_t                addr2;  /* host byte order */
384 };
385
386 struct ipfw_ports {    /* port pair; overlaid by the 32-bit port_u.value in struct ipfw_key */
387         uint16_t                port1;  /* host byte order */
388         uint16_t                port2;  /* host byte order */
389 };
390
391 struct ipfw_key {      /* key shared by states, tracks and track counters: address pair + port pair + protocol */
392         union {
393                 struct ipfw_addrs addrs;
394                 uint64_t        value;  /* both addresses viewed as one 64-bit integer */
395         } addr_u;
396         union {
397                 struct ipfw_ports ports;
398                 uint32_t        value;  /* both ports viewed as one 32-bit integer */
399         } port_u;
400         uint8_t                 proto;
401         uint8_t                 swap;   /* IPFW_KEY_SWAP_ */
402         uint16_t                rsvd2;
403 };
404 /* Bits for ipfw_key.swap. */
405 #define IPFW_KEY_SWAP_ADDRS     0x1
406 #define IPFW_KEY_SWAP_PORTS     0x2
407 #define IPFW_KEY_SWAP_ALL       (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
408
409 struct ipfw_trkcnt {   /* node of the track counter RB tree, which is shared between CPUs (see the tracks description above) */
410         RB_ENTRY(ipfw_trkcnt)   tc_rblink;
411         struct ipfw_key         tc_key;
412         uintptr_t               tc_ruleid;
413         int                     tc_refs;
414         int                     tc_count;       /* presumably the counter that ipfw_track.t_count points at -- TODO confirm */
415         time_t                  tc_expire;      /* userland get-only */
416         uint16_t                tc_rulenum;     /* userland get-only */
417 } __cachealign;
418
419 #define tc_addrs                tc_key.addr_u.value     /* tc_*: shorthands for the members of ipfw_trkcnt.tc_key */
420 #define tc_ports                tc_key.port_u.value
421 #define tc_proto                tc_key.proto
422 #define tc_saddr                tc_key.addr_u.addrs.addr1
423 #define tc_daddr                tc_key.addr_u.addrs.addr2
424 #define tc_sport                tc_key.port_u.ports.port1
425 #define tc_dport                tc_key.port_u.ports.port2
426 /* Head type of the RB tree of struct ipfw_trkcnt. */
427 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
428
429 struct ipfw_state;
430
431 struct ipfw_track {    /* node of the per-cpu track RB tree (see the tracks description above) */
432         RB_ENTRY(ipfw_track)    t_rblink;
433         struct ipfw_key         t_key;
434         struct ip_fw            *t_rule;
435         time_t                  t_lastexp;
436         LIST_HEAD(, ipfw_state) t_state_list;   /* list of struct ipfw_state; 'limit' states are linked to their track */
437         time_t                  t_expire;
438         volatile int            *t_count;       /* points at the counter of the shared track counter (per the diagram above) */
439         struct ipfw_trkcnt      *t_trkcnt;
440         TAILQ_ENTRY(ipfw_track) t_link;
441 };
442
443 #define t_addrs                 t_key.addr_u.value      /* t_*: shorthands for the members of ipfw_track.t_key */
444 #define t_ports                 t_key.port_u.value
445 #define t_proto                 t_key.proto
446 #define t_saddr                 t_key.addr_u.addrs.addr1
447 #define t_daddr                 t_key.addr_u.addrs.addr2
448 #define t_sport                 t_key.port_u.ports.port1
449 #define t_dport                 t_key.port_u.ports.port2
450 /* Head types of the RB tree and the tail queue of struct ipfw_track. */
451 RB_HEAD(ipfw_track_tree, ipfw_track);
452 TAILQ_HEAD(ipfw_track_list, ipfw_track);
453
454 struct ipfw_state {    /* node of the per-cpu state RB tree (see the states description above) */
455         RB_ENTRY(ipfw_state)    st_rblink;
456         struct ipfw_key         st_key;
457
458         time_t                  st_expire;      /* expire time */
459         struct ip_fw            *st_rule;       /* parent rule of the current CPU */
460
461         uint64_t                st_pcnt;        /* packets */
462         uint64_t                st_bcnt;        /* bytes */
463
464         /*
465          * st_state:
466          * State of this rule, typically a combination of TCP flags.
467          *
468          * st_ack_fwd/st_ack_rev:
469          * Most recent ACKs in forward and reverse direction.  They
470          * are used to generate keepalives.
471          */
472         uint32_t                st_state;
473         uint32_t                st_ack_fwd;     /* host byte order */
474         uint32_t                st_seq_fwd;     /* host byte order */
475         uint32_t                st_ack_rev;     /* host byte order */
476         uint32_t                st_seq_rev;     /* host byte order */
477
478         uint16_t                st_flags;       /* IPFW_STATE_F_ */
479         uint16_t                st_type;        /* KEEP_STATE/LIMIT/RDR */
480         struct ipfw_track       *st_track;      /* track this state is linked to, for states created by 'limit' */
481
482         LIST_ENTRY(ipfw_state)  st_trklink;     /* presumably the linkage on ipfw_track.t_state_list -- TODO confirm */
483         TAILQ_ENTRY(ipfw_state) st_link;
484 };
485
486 #define st_addrs                st_key.addr_u.value     /* st_*: shorthands for the members of ipfw_state.st_key */
487 #define st_ports                st_key.port_u.value
488 #define st_proto                st_key.proto
489 #define st_swap                 st_key.swap
490 /* Bits for ipfw_state.st_flags. */
491 #define IPFW_STATE_F_ACKFWD     0x0001
492 #define IPFW_STATE_F_SEQFWD     0x0002
493 #define IPFW_STATE_F_ACKREV     0x0004
494 #define IPFW_STATE_F_SEQREV     0x0008
495 #define IPFW_STATE_F_XLATSRC    0x0010
496 #define IPFW_STATE_F_XLATSLAVE  0x0020
497 #define IPFW_STATE_F_LINKED     0x0040
498 /* Matches anchor states and states flagged XLATSLAVE; presumably these are skipped by list scans (per the name). */
499 #define IPFW_STATE_SCANSKIP(s)  ((s)->st_type == O_ANCHOR ||    \
500                                  ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
501
502 /* Expired (st_expire is not after time_uptime) or a translation state marked invalid. */
503 #define IPFW_STATE_ISDEAD(s)    (TIME_LEQ((s)->st_expire, time_uptime) || \
504                                  IPFW_XLAT_INVALID((s)))
505 /* Head types of the tail queue and the RB tree of struct ipfw_state. */
506 TAILQ_HEAD(ipfw_state_list, ipfw_state);
507 RB_HEAD(ipfw_state_tree, ipfw_state);
508
509 struct ipfw_xlat {     /* state extended with translation data; IPFW_ISXLAT() recognises it by st_type == O_REDIRECT */
510         struct ipfw_state       xlat_st;        /* MUST be the first field: IPFW_XLAT_INVALID() casts a state pointer to this struct */
511         uint32_t                xlat_addr;      /* network byte order */
512         uint16_t                xlat_port;      /* network byte order */
513         uint16_t                xlat_dir;       /* MATCH_ */
514         struct ifnet            *xlat_ifp;      /* matching ifnet */
515         struct ipfw_xlat        *xlat_pair;     /* paired state */
516         int                     xlat_pcpu;      /* paired cpu */
517         volatile int            xlat_invalid;   /* invalid, but not dtor yet */
518         volatile uint64_t       xlat_crefs;     /* cross references */
519         struct netmsg_base      xlat_freenm;    /* for remote free */
520 };
521
522 #define xlat_type               xlat_st.st_type /* xlat_*: shorthands for members of the embedded xlat_st */
523 #define xlat_flags              xlat_st.st_flags
524 #define xlat_rule               xlat_st.st_rule
525 #define xlat_bcnt               xlat_st.st_bcnt
526 #define xlat_pcnt               xlat_st.st_pcnt
527
528 struct ipfw_tblent {   /* table entry; ipfw_context.ipfw_tables[] holds radix_node_head pointers, hence the embedded radix nodes */
529         struct radix_node       te_nodes[2];
530         struct sockaddr_in      te_key;
531         u_long                  te_use;
532         time_t                  te_lastuse;
533         struct ipfw_tblent      *te_sibling;    /* presumably the same entry on another CPU, like ip_fw.sibling -- TODO confirm */
534         volatile int            te_expired;
535 };
536
537 struct ipfw_context {  /* firewall context; ipfw_ctx[] keeps one pointer to such a context per CPU */
538         struct ip_fw            *ipfw_layer3_chain;     /* rules for layer3 */
539         struct ip_fw            *ipfw_default_rule;     /* default rule */
540         uint64_t                ipfw_norule_counter;    /* ipfw_log(NULL) stat*/
541
542         /*
543          * ipfw_set_disable contains one bit per set value (0..31).
544          * If the bit is set, all rules with the corresponding set
545          * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
546          * default rule and CANNOT be disabled.
547          */
548         uint32_t                ipfw_set_disable;
549
550         uint8_t                 ipfw_flags;     /* IPFW_FLAG_ */
551
552         struct ip_fw            *ipfw_cont_rule;
553         struct ipfw_xlat        *ipfw_cont_xlat;
554
555         struct ipfw_state_tree  ipfw_state_tree;
556         struct ipfw_state_list  ipfw_state_list;
557         int                     ipfw_state_loosecnt;
558         int                     ipfw_state_cnt;
559
560         union {
561                 struct ipfw_state state;
562                 struct ipfw_track track;
563                 struct ipfw_trkcnt trkcnt;
564         } ipfw_tmpkey;  /* reached through the ipfw_{state,track,trkcnt}_tmpkey macros; presumably scratch lookup keys */
565
566         struct ipfw_track_tree  ipfw_track_tree;
567         struct ipfw_track_list  ipfw_track_list;
568         struct ipfw_trkcnt      *ipfw_trkcnt_spare;
569
570         struct callout          ipfw_stateto_ch;
571         time_t                  ipfw_state_lastexp;
572         struct netmsg_base      ipfw_stateexp_nm;
573         struct netmsg_base      ipfw_stateexp_more;
574         struct ipfw_state       ipfw_stateexp_anch;
575
576         struct callout          ipfw_trackto_ch;
577         time_t                  ipfw_track_lastexp;
578         struct netmsg_base      ipfw_trackexp_nm;
579         struct netmsg_base      ipfw_trackexp_more;
580         struct ipfw_track       ipfw_trackexp_anch;
581
582         struct callout          ipfw_keepalive_ch;
583         struct netmsg_base      ipfw_keepalive_nm;
584         struct netmsg_base      ipfw_keepalive_more;
585         struct ipfw_state       ipfw_keepalive_anch;
586
587         struct callout          ipfw_xlatreap_ch;
588         struct netmsg_base      ipfw_xlatreap_nm;
589         struct ipfw_state_list  ipfw_xlatreap;
590
591         /*
592          * Statistics
593          */
594         u_long                  ipfw_sts_reap;
595         u_long                  ipfw_sts_reapfailed;
596         u_long                  ipfw_sts_overflow;
597         u_long                  ipfw_sts_nomem;
598         u_long                  ipfw_sts_tcprecycled;
599
600         u_long                  ipfw_tks_nomem;
601         u_long                  ipfw_tks_reap;
602         u_long                  ipfw_tks_reapfailed;
603         u_long                  ipfw_tks_overflow;
604         u_long                  ipfw_tks_cntnomem;
605
606         u_long                  ipfw_frags;
607         u_long                  ipfw_defraged;
608         u_long                  ipfw_defrag_remote;
609
610         u_long                  ipfw_xlated;
611         u_long                  ipfw_xlate_split;
612         u_long                  ipfw_xlate_conflicts;
613         u_long                  ipfw_xlate_cresolved;
614
615         /* Last field */
616         struct radix_node_head  *ipfw_tables[]; /* flexible array member; presumably ipfw_table_max slots -- TODO confirm */
617 };
618
619 #define IPFW_FLAG_KEEPALIVE     0x01    /* IPFW_FLAG_*: bits for ipfw_context.ipfw_flags */
620 #define IPFW_FLAG_STATEEXP      0x02
621 #define IPFW_FLAG_TRACKEXP      0x04
622 #define IPFW_FLAG_STATEREAP     0x08
623 #define IPFW_FLAG_TRACKREAP     0x10
624 /* Shorthands for the members of the ipfw_context.ipfw_tmpkey union. */
625 #define ipfw_state_tmpkey       ipfw_tmpkey.state
626 #define ipfw_track_tmpkey       ipfw_tmpkey.track
627 #define ipfw_trkcnt_tmpkey      ipfw_tmpkey.trkcnt
628
629 struct ipfw_global {   /* data not tied to a single CPU; the only instance in this file is ipfw_gd */
630         int                     ipfw_state_loosecnt;    /* cache aligned */
631         time_t                  ipfw_state_globexp __cachealign;
632
633         struct lwkt_token       ipfw_trkcnt_token __cachealign; /* presumably serializes access to ipfw_trkcnt_tree -- TODO confirm */
634         struct ipfw_trkcnt_tree ipfw_trkcnt_tree;       /* track counter RB tree, shared between CPUs */
635         int                     ipfw_trkcnt_cnt;        /* exported as sysctl net.inet.ip.fw.track_cnt */
636         time_t                  ipfw_track_globexp;
637
638         /* Accessed in netisr0. */
639         struct ip_fw            *ipfw_crossref_free __cachealign;
640         struct callout          ipfw_crossref_ch;
641         struct netmsg_base      ipfw_crossref_nm;
642
643 #ifdef KLD_MODULE
644         /*
645          * Module can not be unloaded, if there are references to
646          * certain rules of ipfw(4), e.g. dummynet(4)
647          */
648         int                     ipfw_refcnt __cachealign;
649 #endif
650 } __cachealign;
651
652 static struct ipfw_context      *ipfw_ctx[MAXCPU];     /* one context pointer per CPU slot */
653
654 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
655
656 /*
657  * Following two global variables are accessed and updated only
658  * in netisr0.
659  */
660 static uint32_t static_count;   /* # of static rules */
661 static uint32_t static_ioc_len; /* bytes of static rules */
662
663 /*
664  * If 1, then ipfw static rules are being flushed,
665  * ipfw_chk() will skip to the default rule.
666  */
667 static int ipfw_flushing;
668
669 static int fw_verbose;          /* sysctl net.inet.ip.fw.verbose */
670 static int verbose_limit;       /* sysctl net.inet.ip.fw.verbose_limit */
671
672 static int fw_debug;            /* sysctl net.inet.ip.fw.debug; gates DPRINTF() */
673 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;        /* sysctl net.inet.ip.fw.autoinc_step */
674
675 static int      ipfw_table_max = IPFW_TABLE_MAX_DEF;    /* tunable/sysctl net.inet.ip.fw.table_max */
676
677 static int      ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);       /* handler of net.inet.ip.fw.enable */
678 static int      ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS); /* handler of net.inet.ip.fw.autoinc_step */
679 /* table_max is a tunable; its sysctl below is declared read-only (CTLFLAG_RD). */
680 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
681 /* sysctl tree: net.inet.ip.fw and net.inet.ip.fw.stats */
682 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
683 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
684     "Firewall statistics");
685
686 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
687     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
688 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
689     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
690     "Rule number autincrement step");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
692     &fw_one_pass, 0,
693     "Only do a single pass through ipfw when using dummynet(4)");
694 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
695     &fw_debug, 0, "Enable printing of debug ip_fw statements");
696 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
697     &fw_verbose, 0, "Log matches to ipfw rules");
698 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
699     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
700 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
701     &ipfw_table_max, 0, "Max # of tables");
702
703 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);       /* handler of net.inet.ip.fw.dyn_count */
704 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);       /* handler of net.inet.ip.fw.dyn_max */
705 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);     /* handler of net.inet.ip.fw.state_cnt */
706 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);     /* handler of net.inet.ip.fw.state_max */
707 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);      /* shared handler of the *_scan_max, *_expire_max, *_reap_* and keepalive_max sysctls */
708 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
709
710 /*
711  * Timeouts for various events in handling states.
712  *
713  * NOTE:
714  * 1 == 0~1 second.
715  * 2 == 1~2 second(s).
716  *
717  * We use 2 seconds for FIN lifetime, so that the states will not be
718  * ripped prematurely.
719  */
720 static uint32_t dyn_ack_lifetime = 300;
721 static uint32_t dyn_syn_lifetime = 20;
722 static uint32_t dyn_finwait_lifetime = 20;
723 static uint32_t dyn_fin_lifetime = 2;
724 static uint32_t dyn_rst_lifetime = 2;
725 static uint32_t dyn_udp_lifetime = 10;
726 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
727
728 /*
729  * Keepalives are sent if dyn_keepalive is set. They are sent every
730  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
731  * seconds of lifetime of a rule.
732  */
733 static uint32_t dyn_keepalive_interval = 20;    /* seconds */
734 static uint32_t dyn_keepalive_period = 5;       /* seconds */
735 static uint32_t dyn_keepalive = 1;      /* do send keepalives */
736
737 static struct ipfw_global       ipfw_gd;       /* the single struct ipfw_global instance */
738 static int      ipfw_state_loosecnt_updthr;
739 static int      ipfw_state_max = 4096;  /* max # of states */
740 static int      ipfw_track_max = 4096;  /* max # of tracks */
741
742 static int      ipfw_state_headroom;    /* setup at module load time */
743 static int      ipfw_state_reap_min = 8;        /* # of states to reap for state shortage */
744 static int      ipfw_state_expire_max = 32;     /* # of states to expire for each expire iteration */
745 static int      ipfw_state_scan_max = 256;      /* # of states to scan for each expire iteration */
746 static int      ipfw_keepalive_max = 8;         /* sysctl net.inet.ip.fw.keepalive_max */
747 static int      ipfw_track_reap_max = 4;        /* # of tracks to reap for track shortage */
748 static int      ipfw_track_expire_max = 16;     /* # of tracks to expire for each expire iteration */
749 static int      ipfw_track_scan_max = 128;      /* # of tracks to scan for each expire iteration */
750
751 static eventhandler_tag ipfw_ifaddr_event;
752
753 /* Compat: dyn_count/dyn_max are bound to no variable (NULL); their values come from the handlers. */
754 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
755     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
756     "Number of states and tracks");
757 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
758     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
759     "Max number of states and tracks");
760
761 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,      /* handler-backed, no variable bound (NULL) */
762     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
763     "Number of states");
764 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,      /* handler-backed, no variable bound (NULL) */
765     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
766     "Max number of states");
767 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
768     &ipfw_state_headroom, 0, "headroom for state reap");
769 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
770     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
771 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
772     &ipfw_track_max, 0, "Max number of tracks");
773 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
774     &static_count, 0, "Number of static rules");       /* NOTE(review): static_count and the dyn_* variables below are uint32_t but exported via SYSCTL_INT -- confirm this is intended */
775 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
776     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
777 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
778     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
779 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
780     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
781 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
782     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
783 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
784     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
785 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
786     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
787 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
788     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
789 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
790     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
791 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
792     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
793     "I", "# of states to scan for each expire iteration");
794 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
795     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
796     "I", "# of states to expire for each expire iteration");
797 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
798     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
799     "I", "# of states to expire for each expire iteration");
/*
 * Reap and track limits.  Each entry is read-write, is backed by the
 * variable passed as arg1, and is handled by ipfw_sysctl_scancnt.
 */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");
812
/*
 * State statistics.  Each entry passes no pointer (arg1 is NULL) and
 * instead hands the byte offset of its counter inside struct ipfw_context
 * to the shared ipfw_sysctl_stat handler as arg2.
 * NOTE(review): presumably the handler aggregates the counter over all
 * per-cpu contexts, and a write resets it -- confirm in ipfw_sysctl_stat.
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to states shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of state deleted due to fast TCP port recycling");

/*
 * Track statistics.  As with the other stats entries, arg2 carries the
 * offset of the counter within struct ipfw_context and ipfw_sysctl_stat
 * is the common handler.
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reap due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failure");
/*
 * Defragmentation and address/port translation statistics.  Each entry
 * identifies its counter by offset within struct ipfw_context (arg2) and
 * is served by ipfw_sysctl_stat.
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
    "LU", "# of IP fragements defraged");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts resolved on remote cpu");

882
/*
 * Comparators used to order the three red-black trees generated below.
 * They must be declared before RB_GENERATE expands code that calls them.
 */
static int              ipfw_state_cmp(struct ipfw_state *,
                            struct ipfw_state *);
static int              ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
                            struct ipfw_trkcnt *);
static int              ipfw_track_cmp(struct ipfw_track *,
                            struct ipfw_track *);

/* Tree of struct ipfw_state, linked through st_rblink. */
RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

/* Tree of struct ipfw_trkcnt, linked through tc_rblink. */
RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

/* Tree of struct ipfw_track, linked through t_rblink. */
RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
898
/* Forward declarations for functions that are referenced before they are defined. */
static int              ipfw_chk(struct ip_fw_args *);
static void             ipfw_track_expire_ipifunc(void *);
static void             ipfw_state_expire_ipifunc(void *);
static void             ipfw_keepalive(void *);
static int              ipfw_state_expire_start(struct ipfw_context *,
                            int, int);
static void             ipfw_crossref_timeo(void *);
static void             ipfw_state_remove(struct ipfw_context *,
                            struct ipfw_state *);
static void             ipfw_xlat_reap_timeo(void *);
static void             ipfw_defrag_redispatch(struct mbuf *, int,
                            struct ip_fw *);
911
/*
 * Acquire, release and initialize the LWKT token stored in
 * ipfw_gd.ipfw_trkcnt_token.
 * NOTE(review): IPFW_TRKCNT_TOKINIT carries its own trailing semicolon,
 * unlike the other two macros; "IPFW_TRKCNT_TOKINIT;" therefore expands
 * to an extra empty statement.
 */
#define IPFW_TRKCNT_TOKGET      lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL      lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT     \
        lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
916
/*
 * Copy src into dst while AND-ing the address bytes with netmask.
 *
 * The first byte of each sockaddr is taken as its length.  The first two
 * bytes of src (length and family) are copied verbatim; bytes from offset
 * 2 up to the shorter of the src and netmask lengths are masked; whatever
 * remains up to the src length is zero-filled.
 */
static void
sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
    const struct sockaddr *netmask)
{
        const u_char *in = (const u_char *)src;
        const u_char *mask = (const u_char *)netmask;
        u_char *out = (u_char *)dst;
        const int src_len = in[0];
        const int mask_len = mask[0];
        const int masked_end = (mask_len > src_len) ? src_len : mask_len;
        int i;

        /* sa_len and sa_family are never masked. */
        out[0] = in[0];
        out[1] = in[1];

        for (i = 2; i < masked_end; i++)
                out[i] = in[i] & mask[i];

        /* Clear the tail of dst that the netmask does not cover. */
        if (i < src_len)
                bzero(out + i, src_len - i);
}
936
/*
 * Incrementally adjust a 16-bit checksum after one 16-bit word of the
 * covered data changed from old to new.
 *
 * When udp is non-zero, a checksum of zero is returned untouched and a
 * computed result of zero is returned as 0xFFFF instead.
 */
static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
        uint32_t sum;

        if (udp != 0 && cksum == 0)
                return (0x0000);

        sum = cksum + old - new;
        /* Fold the upper half into the lower 16 bits, then truncate. */
        sum = ((sum >> 16) + (sum & 0xffff)) & 0xffff;

        if (sum == 0 && udp != 0)
                return (0xFFFF);
        return (sum);
}
951
952 static __inline void
953 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
954     in_addr_t daddr, uint16_t dport, uint8_t proto)
955 {
956
957         key->proto = proto;
958         key->swap = 0;
959
960         if (saddr < daddr) {
961                 key->addr_u.addrs.addr1 = daddr;
962                 key->addr_u.addrs.addr2 = saddr;
963                 key->swap |= IPFW_KEY_SWAP_ADDRS;
964         } else {
965                 key->addr_u.addrs.addr1 = saddr;
966                 key->addr_u.addrs.addr2 = daddr;
967         }
968
969         if (sport < dport) {
970                 key->port_u.ports.port1 = dport;
971                 key->port_u.ports.port2 = sport;
972                 key->swap |= IPFW_KEY_SWAP_PORTS;
973         } else {
974                 key->port_u.ports.port1 = sport;
975                 key->port_u.ports.port2 = dport;
976         }
977
978         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
979                 key->swap |= IPFW_KEY_SWAP_PORTS;
980         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
981                 key->swap |= IPFW_KEY_SWAP_ADDRS;
982 }
983
984 static __inline void
985 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
986     in_addr_t *daddr, uint16_t *dport)
987 {
988
989         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
990                 *saddr = key->addr_u.addrs.addr2;
991                 *daddr = key->addr_u.addrs.addr1;
992         } else {
993                 *saddr = key->addr_u.addrs.addr1;
994                 *daddr = key->addr_u.addrs.addr2;
995         }
996
997         if (key->swap & IPFW_KEY_SWAP_PORTS) {
998                 *sport = key->port_u.ports.port2;
999                 *dport = key->port_u.ports.port1;
1000         } else {
1001                 *sport = key->port_u.ports.port1;
1002                 *dport = key->port_u.ports.port2;
1003         }
1004 }
1005
1006 static int
1007 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1008 {
1009
1010         if (s1->st_proto > s2->st_proto)
1011                 return (1);
1012         if (s1->st_proto < s2->st_proto)
1013                 return (-1);
1014
1015         if (s1->st_addrs > s2->st_addrs)
1016                 return (1);
1017         if (s1->st_addrs < s2->st_addrs)
1018                 return (-1);
1019
1020         if (s1->st_ports > s2->st_ports)
1021                 return (1);
1022         if (s1->st_ports < s2->st_ports)
1023                 return (-1);
1024
1025         if (s1->st_swap == s2->st_swap ||
1026             (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1027                 return (0);
1028
1029         if (s1->st_swap > s2->st_swap)
1030                 return (1);
1031         else
1032                 return (-1);
1033 }
1034
1035 static int
1036 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1037 {
1038
1039         if (t1->tc_proto > t2->tc_proto)
1040                 return (1);
1041         if (t1->tc_proto < t2->tc_proto)
1042                 return (-1);
1043
1044         if (t1->tc_addrs > t2->tc_addrs)
1045                 return (1);
1046         if (t1->tc_addrs < t2->tc_addrs)
1047                 return (-1);
1048
1049         if (t1->tc_ports > t2->tc_ports)
1050                 return (1);
1051         if (t1->tc_ports < t2->tc_ports)
1052                 return (-1);
1053
1054         if (t1->tc_ruleid > t2->tc_ruleid)
1055                 return (1);
1056         if (t1->tc_ruleid < t2->tc_ruleid)
1057                 return (-1);
1058
1059         return (0);
1060 }
1061
1062 static int
1063 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1064 {
1065
1066         if (t1->t_proto > t2->t_proto)
1067                 return (1);
1068         if (t1->t_proto < t2->t_proto)
1069                 return (-1);
1070
1071         if (t1->t_addrs > t2->t_addrs)
1072                 return (1);
1073         if (t1->t_addrs < t2->t_addrs)
1074                 return (-1);
1075
1076         if (t1->t_ports > t2->t_ports)
1077                 return (1);
1078         if (t1->t_ports < t2->t_ports)
1079                 return (-1);
1080
1081         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1082                 return (1);
1083         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1084                 return (-1);
1085
1086         return (0);
1087 }
1088
1089 static __inline struct ipfw_state *
1090 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1091 {
1092         struct ipfw_state *dup;
1093
1094         KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1095             ("state %p was linked", s));
1096         dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1097         if (dup == NULL) {
1098                 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1099                 s->st_flags |= IPFW_STATE_F_LINKED;
1100         }
1101         return (dup);
1102 }
1103
1104 static __inline void
1105 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1106 {
1107
1108         KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1109             ("state %p was not linked", s));
1110         RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1111         TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1112         s->st_flags &= ~IPFW_STATE_F_LINKED;
1113 }
1114
1115 static void
1116 ipfw_state_max_set(int state_max)
1117 {
1118
1119         ipfw_state_max = state_max;
1120         /* Allow 5% states over-allocation. */
1121         ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1122 }
1123
1124 static __inline int
1125 ipfw_state_cntcoll(void)
1126 {
1127         int cpu, state_cnt = 0;
1128
1129         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1130                 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1131         return (state_cnt);
1132 }
1133
1134 static __inline int
1135 ipfw_state_cntsync(void)
1136 {
1137         int state_cnt;
1138
1139         state_cnt = ipfw_state_cntcoll();
1140         ipfw_gd.ipfw_state_loosecnt = state_cnt;
1141         return (state_cnt);
1142 }
1143
1144 static __inline int
1145 ipfw_free_rule(struct ip_fw *rule)
1146 {
1147         KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1148         KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1149         rule->refcnt--;
1150         if (rule->refcnt == 0) {
1151                 if (rule->cross_rules != NULL)
1152                         kfree(rule->cross_rules, M_IPFW);
1153                 kfree(rule, M_IPFW);
1154                 return 1;
1155         }
1156         return 0;
1157 }
1158
/*
 * Release a rule reference handed over as an opaque pointer.  When built
 * as a module, the global ipfw reference count is dropped as well.
 */
static void
ipfw_unref_rule(void *priv)
{
        struct ip_fw *rule = priv;

        ipfw_free_rule(rule);
#ifdef KLD_MODULE
        KASSERT(ipfw_gd.ipfw_refcnt > 0,
            ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
        atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}
1169
1170 static __inline void
1171 ipfw_ref_rule(struct ip_fw *rule)
1172 {
1173         KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1174 #ifdef KLD_MODULE
1175         atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1176 #endif
1177         rule->refcnt++;
1178 }
1179
/*
 * This macro maps an ip pointer into a pointer of type T to the header
 * that follows the IP header: it advances ip by ip_hl 32-bit words.
 * In this file it is used to reach the ICMP and TCP headers.
 */
#define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1184
1185 static __inline int
1186 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1187 {
1188         int type = L3HDR(struct icmp,ip)->icmp_type;
1189         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1190         int idx = type / 32;
1191
1192         if (idx >= idx_max)
1193                 return (0);
1194         return (cmd->d[idx] & (1 << (type % 32)));
1195 }
1196
1197 static __inline int
1198 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1199 {
1200         int code = L3HDR(struct icmp,ip)->icmp_code;
1201         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1202         int idx = code / 32;
1203
1204         if (idx >= idx_max)
1205                 return (0);
1206         return (cmd->d[idx] & (1 << (code % 32)));
1207 }
1208
1209 #define TT      ((1 << ICMP_ECHO) | \
1210                  (1 << ICMP_ROUTERSOLICIT) | \
1211                  (1 << ICMP_TSTAMP) | \
1212                  (1 << ICMP_IREQ) | \
1213                  (1 << ICMP_MASKREQ))
1214
1215 static int
1216 is_icmp_query(struct ip *ip)
1217 {
1218         int type = L3HDR(struct icmp, ip)->icmp_type;
1219
1220         return (type < 32 && (TT & (1 << type)));
1221 }
1222
1223 #undef TT
1224
1225 /*
1226  * The following checks use two arrays of 8 or 16 bits to store the
1227  * bits that we want set or clear, respectively. They are in the
1228  * low and high half of cmd->arg1 or cmd->d[0].
1229  *
1230  * We scan options and store the bits we find set. We succeed if
1231  *
1232  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1233  *
1234  * The code is sometimes optimized not to store additional variables.
1235  */
1236 static int
1237 flags_match(ipfw_insn *cmd, uint8_t bits)
1238 {
1239         u_char want_clear;
1240         bits = ~bits;
1241
1242         if (((cmd->arg1 & 0xff) & bits) != 0)
1243                 return 0; /* some bits we want set were clear */
1244
1245         want_clear = (cmd->arg1 >> 8) & 0xff;
1246         if ((want_clear & bits) != want_clear)
1247                 return 0; /* some bits we want clear were set */
1248         return 1;
1249 }
1250
1251 static int
1252 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1253 {
1254         int optlen, bits = 0;
1255         u_char *cp = (u_char *)(ip + 1);
1256         int x = (ip->ip_hl << 2) - sizeof(struct ip);
1257
1258         for (; x > 0; x -= optlen, cp += optlen) {
1259                 int opt = cp[IPOPT_OPTVAL];
1260
1261                 if (opt == IPOPT_EOL)
1262                         break;
1263
1264                 if (opt == IPOPT_NOP) {
1265                         optlen = 1;
1266                 } else {
1267                         optlen = cp[IPOPT_OLEN];
1268                         if (optlen <= 0 || optlen > x)
1269                                 return 0; /* invalid or truncated */
1270                 }
1271
1272                 switch (opt) {
1273                 case IPOPT_LSRR:
1274                         bits |= IP_FW_IPOPT_LSRR;
1275                         break;
1276
1277                 case IPOPT_SSRR:
1278                         bits |= IP_FW_IPOPT_SSRR;
1279                         break;
1280
1281                 case IPOPT_RR:
1282                         bits |= IP_FW_IPOPT_RR;
1283                         break;
1284
1285                 case IPOPT_TS:
1286                         bits |= IP_FW_IPOPT_TS;
1287                         break;
1288
1289                 default:
1290                         break;
1291                 }
1292         }
1293         return (flags_match(cmd, bits));
1294 }
1295
1296 static int
1297 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1298 {
1299         int optlen, bits = 0;
1300         struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1301         u_char *cp = (u_char *)(tcp + 1);
1302         int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1303
1304         for (; x > 0; x -= optlen, cp += optlen) {
1305                 int opt = cp[0];
1306
1307                 if (opt == TCPOPT_EOL)
1308                         break;
1309
1310                 if (opt == TCPOPT_NOP) {
1311                         optlen = 1;
1312                 } else {
1313                         optlen = cp[1];
1314                         if (optlen <= 0)
1315                                 break;
1316                 }
1317
1318                 switch (opt) {
1319                 case TCPOPT_MAXSEG:
1320                         bits |= IP_FW_TCPOPT_MSS;
1321                         break;
1322
1323                 case TCPOPT_WINDOW:
1324                         bits |= IP_FW_TCPOPT_WINDOW;
1325                         break;
1326
1327                 case TCPOPT_SACK_PERMITTED:
1328                 case TCPOPT_SACK:
1329                         bits |= IP_FW_TCPOPT_SACK;
1330                         break;
1331
1332                 case TCPOPT_TIMESTAMP:
1333                         bits |= IP_FW_TCPOPT_TS;
1334                         break;
1335
1336                 case TCPOPT_CC:
1337                 case TCPOPT_CCNEW:
1338                 case TCPOPT_CCECHO:
1339                         bits |= IP_FW_TCPOPT_CC;
1340                         break;
1341
1342                 default:
1343                         break;
1344                 }
1345         }
1346         return (flags_match(cmd, bits));
1347 }
1348
1349 static int
1350 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1351 {
1352         if (ifp == NULL)        /* no iface with this packet, match fails */
1353                 return 0;
1354
1355         /* Check by name or by IP address */
1356         if (cmd->name[0] != '\0') { /* match by name */
1357                 /* Check name */
1358                 if (cmd->p.glob) {
1359                         if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1360                                 return(1);
1361                 } else {
1362                         if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1363                                 return(1);
1364                 }
1365         } else {
1366                 struct ifaddr_container *ifac;
1367
1368                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1369                         struct ifaddr *ia = ifac->ifa;
1370
1371                         if (ia->ifa_addr == NULL)
1372                                 continue;
1373                         if (ia->ifa_addr->sa_family != AF_INET)
1374                                 continue;
1375                         if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1376                             (ia->ifa_addr))->sin_addr.s_addr)
1377                                 return(1);      /* match */
1378                 }
1379         }
1380         return(0);      /* no match, fail ... */
1381 }
1382
/*
 * Expands to the two leading arguments of ksnprintf(): a pointer len
 * bytes into buf and the space left behind it (0 once len has reached
 * or passed sizeof(buf)).  buf must be a real array, since sizeof(buf)
 * is used; the arguments are not parenthesized in the expansion.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 *
 * Formats one log line describing the action taken and the packet, and
 * emits it through log().
 *
 *   ctx   context whose ipfw_norule_counter rate-limits packets that
 *         have no rule
 *   f     matching rule, or NULL for a bogus packet (logged as "Refuse",
 *         rule number -1)
 *   hlen  0 for a non-IP packet, which is logged as "MAC"
 *   eh    non-NULL for layer 2 packets, whose ip_off/ip_len are still in
 *         network byte order
 *   m     the packet; its data is read as a struct ip when hlen != 0
 *   oif   output interface, or NULL for an inbound packet, in which case
 *         m->m_pkthdr.rcvif is reported if set
 *
 * Nothing is logged once the applicable limit (verbose_limit, or the
 * rule's own max_log) has been used up; the call that uses it up also
 * emits a "limit reached" notice.
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
        char *action;
        int limit_reached = 0;
        char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

        fragment[0] = '\0';
        proto[0] = '\0';

        if (f == NULL) {        /* bogus pkt */
                /* A verbose_limit of 0 means no limit. */
                if (verbose_limit != 0 &&
                    ctx->ipfw_norule_counter >= verbose_limit)
                        return;
                ctx->ipfw_norule_counter++;
                if (ctx->ipfw_norule_counter == verbose_limit)
                        limit_reached = verbose_limit;
                action = "Refuse";
        } else {        /* O_LOG is the first action, find the real one */
                ipfw_insn *cmd = ACTION_PTR(f);
                ipfw_insn_log *l = (ipfw_insn_log *)cmd;

                /* A max_log of 0 means no limit for this rule. */
                if (l->max_log != 0 && l->log_left == 0)
                        return;
                l->log_left--;
                if (l->log_left == 0)
                        limit_reached = l->max_log;
                cmd += F_LEN(cmd);      /* point to first action */
                /* Step over a probability instruction, if present. */
                if (cmd->opcode == O_PROB)
                        cmd += F_LEN(cmd);

                /* Actions that carry an argument are formatted into action2. */
                action = action2;
                switch (cmd->opcode) {
                case O_DENY:
                        action = "Deny";
                        break;

                case O_REJECT:
                        if (cmd->arg1==ICMP_REJECT_RST) {
                                action = "Reset";
                        } else if (cmd->arg1==ICMP_UNREACH_HOST) {
                                action = "Reject";
                        } else {
                                ksnprintf(SNPARGS(action2, 0), "Unreach %d",
                                          cmd->arg1);
                        }
                        break;

                case O_ACCEPT:
                        action = "Accept";
                        break;

                case O_COUNT:
                        action = "Count";
                        break;

                case O_DIVERT:
                        ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
                        break;

                case O_TEE:
                        ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
                        break;

                case O_SKIPTO:
                        ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
                        break;

                case O_PIPE:
                        ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
                        break;

                case O_QUEUE:
                        ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
                        break;

                case O_FORWARD_IP:
                        {
                                ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
                                int len;

                                len = ksnprintf(SNPARGS(action2, 0),
                                    "Forward to %s",
                                    kinet_ntoa(sa->sa.sin_addr, abuf));
                                /* The port is appended only when non-zero. */
                                if (sa->sa.sin_port) {
                                        ksnprintf(SNPARGS(action2, len), ":%d",
                                                  sa->sa.sin_port);
                                }
                        }
                        break;

                default:
                        action = "UNKNOWN";
                        break;
                }
        }

        if (hlen == 0) {        /* non-ip */
                ksnprintf(SNPARGS(proto, 0), "MAC");
        } else {
                struct ip *ip = mtod(m, struct ip *);
                /* these three are all aliases to the same thing */
                struct icmp *const icmp = L3HDR(struct icmp, ip);
                struct tcphdr *const tcp = (struct tcphdr *)icmp;
                struct udphdr *const udp = (struct udphdr *)icmp;

                int ip_off, offset, ip_len;
                int len;

                if (eh != NULL) { /* layer 2 packets are as on the wire */
                        ip_off = ntohs(ip->ip_off);
                        ip_len = ntohs(ip->ip_len);
                } else {
                        ip_off = ip->ip_off;
                        ip_len = ip->ip_len;
                }
                offset = ip_off & IP_OFFMASK;
                /*
                 * Ports and ICMP type/code are printed only when the
                 * fragment offset is 0; otherwise only the addresses are.
                 */
                switch (ip->ip_p) {
                case IPPROTO_TCP:
                        len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
                                        kinet_ntoa(ip->ip_src, abuf));
                        if (offset == 0) {
                                ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
                                          ntohs(tcp->th_sport),
                                          kinet_ntoa(ip->ip_dst, abuf),
                                          ntohs(tcp->th_dport));
                        } else {
                                ksnprintf(SNPARGS(proto, len), " %s",
                                          kinet_ntoa(ip->ip_dst, abuf));
                        }
                        break;

                case IPPROTO_UDP:
                        len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
                                        kinet_ntoa(ip->ip_src, abuf));
                        if (offset == 0) {
                                ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
                                          ntohs(udp->uh_sport),
                                          kinet_ntoa(ip->ip_dst, abuf),
                                          ntohs(udp->uh_dport));
                        } else {
                                ksnprintf(SNPARGS(proto, len), " %s",
                                          kinet_ntoa(ip->ip_dst, abuf));
                        }
                        break;

                case IPPROTO_ICMP:
                        if (offset == 0) {
                                len = ksnprintf(SNPARGS(proto, 0),
                                                "ICMP:%u.%u ",
                                                icmp->icmp_type,
                                                icmp->icmp_code);
                        } else {
                                len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
                        }
                        len += ksnprintf(SNPARGS(proto, len), "%s",
                                         kinet_ntoa(ip->ip_src, abuf));
                        ksnprintf(SNPARGS(proto, len), " %s",
                                  kinet_ntoa(ip->ip_dst, abuf));
                        break;

                default:
                        len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
                                        kinet_ntoa(ip->ip_src, abuf));
                        ksnprintf(SNPARGS(proto, len), " %s",
                                  kinet_ntoa(ip->ip_dst, abuf));
                        break;
                }

                /* Describe the fragment: id, payload length, byte offset, more-fragments mark. */
                if (ip_off & (IP_MF | IP_OFFMASK)) {
                        ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
                                  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
                                  offset << 3, (ip_off & IP_MF) ? "+" : "");
                }
        }

        if (oif || m->m_pkthdr.rcvif) {
                log(LOG_SECURITY | LOG_INFO,
                    "ipfw: %d %s %s %s via %s%s\n",
                    f ? f->rulenum : -1,
                    action, proto, oif ? "out" : "in",
                    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
                    fragment);
        } else {
                log(LOG_SECURITY | LOG_INFO,
                    "ipfw: %d %s %s [no if info]%s\n",
                    f ? f->rulenum : -1,
                    action, proto, fragment);
        }

        /* Set above only by the call that used up the last allowed entry. */
        if (limit_reached) {
                log(LOG_SECURITY | LOG_NOTICE,
                    "ipfw: limit %d reached on entry %d\n",
                    limit_reached, f ? f->rulenum : -1);
        }
}

#undef SNPARGS
1588
1589 static void
1590 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1591 {
1592         struct ip_fw *rule = slave_x->xlat_rule;
1593
1594         KKASSERT(rule->cpuid == mycpuid);
1595
1596         /* No more cross references; free this pair now. */
1597         kfree(x, M_IPFW);
1598         kfree(slave_x, M_IPFW);
1599
1600         /* See the comment in ipfw_ip_xlate_dispatch(). */
1601         rule->cross_refs--;
1602 }
1603
1604 static void
1605 ipfw_xlat_reap_dispatch(netmsg_t nm)
1606 {
1607         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1608         struct ipfw_state *s, *ns;
1609
1610         ASSERT_NETISR_NCPUS(mycpuid);
1611
1612         crit_enter();
1613         /* Reply ASAP. */
1614         netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1615         crit_exit();
1616
1617         /* TODO: limit scanning depth */
1618         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1619                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1620                 struct ipfw_xlat *slave_x = x->xlat_pair;
1621                 uint64_t crefs;
1622
1623                 crefs = slave_x->xlat_crefs + x->xlat_crefs;
1624                 if (crefs == 0) {
1625                         TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1626                         ipfw_xlat_reap(x, slave_x);
1627                 }
1628         }
1629         if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1630                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1631                     &ctx->ipfw_xlatreap_nm);
1632         }
1633 }
1634
1635 static void
1636 ipfw_xlat_reap_timeo(void *xnm)
1637 {
1638         struct netmsg_base *nm = xnm;
1639
1640         KKASSERT(mycpuid < netisr_ncpus);
1641
1642         crit_enter();
1643         if (nm->lmsg.ms_flags & MSGF_DONE)
1644                 netisr_sendmsg_oncpu(nm);
1645         crit_exit();
1646 }
1647
1648 static void
1649 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1650 {
1651         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1652         struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1653         struct ipfw_xlat *slave_x = x->xlat_pair;
1654         uint64_t crefs;
1655
1656         ASSERT_NETISR_NCPUS(mycpuid);
1657
1658         KKASSERT(slave_x != NULL);
1659         KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1660
1661         KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1662             ("master xlat is still linked"));
1663         if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1664                 ipfw_state_unlink(ctx, &slave_x->xlat_st);
1665
1666         /* See the comment in ipfw_ip_xlate_dispatch(). */
1667         slave_x->xlat_crefs--;
1668
1669         crefs = slave_x->xlat_crefs + x->xlat_crefs;
1670         if (crefs == 0) {
1671                 ipfw_xlat_reap(x, slave_x);
1672                 return;
1673         }
1674
1675         if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1676                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1677                     &ctx->ipfw_xlatreap_nm);
1678         }
1679
1680         /*
1681          * This pair is still referenced; defer its destruction.
1682          * YYY reuse st_link.
1683          */
1684         TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1685 }
1686
1687 static __inline void
1688 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1689 {
1690
1691         x->xlat_invalid = 1;
1692         x->xlat_pair->xlat_invalid = 1;
1693 }
1694
1695 static void
1696 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1697 {
1698         struct ipfw_xlat *x, *slave_x;
1699         struct netmsg_base *nm;
1700
1701         KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1702             IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1703         KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1704             ("delete slave xlat"));
1705
1706         KASSERT(ctx->ipfw_state_cnt > 0,
1707             ("invalid state count %d", ctx->ipfw_state_cnt));
1708         ctx->ipfw_state_cnt--;
1709         if (ctx->ipfw_state_loosecnt > 0)
1710                 ctx->ipfw_state_loosecnt--;
1711
1712         /*
1713          * Unhook this state.
1714          */
1715         if (s->st_track != NULL) {
1716                 struct ipfw_track *t = s->st_track;
1717
1718                 KASSERT(!LIST_EMPTY(&t->t_state_list),
1719                     ("track state list is empty"));
1720                 LIST_REMOVE(s, st_trklink);
1721
1722                 KASSERT(*t->t_count > 0,
1723                     ("invalid track count %d", *t->t_count));
1724                 atomic_subtract_int(t->t_count, 1);
1725         }
1726         ipfw_state_unlink(ctx, s);
1727
1728         /*
1729          * Free this state.  Xlat requires special processing,
1730          * since xlat are paired state and they could be on
1731          * different cpus.
1732          */
1733
1734         if (!IPFW_ISXLAT(s->st_type)) {
1735                 /* Not xlat; free now. */
1736                 kfree(s, M_IPFW);
1737                 /* Done! */
1738                 return;
1739         }
1740         x = (struct ipfw_xlat *)s;
1741
1742         if (x->xlat_pair == NULL) {
1743                 /* Not setup yet; free now. */
1744                 kfree(x, M_IPFW);
1745                 /* Done! */
1746                 return;
1747         }
1748         slave_x = x->xlat_pair;
1749         KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1750
1751         if (x->xlat_pcpu == mycpuid) {
1752                 /*
1753                  * Paired states are on the same cpu; delete this
1754                  * pair now.
1755                  */
1756                 KKASSERT(x->xlat_crefs == 0);
1757                 KKASSERT(slave_x->xlat_crefs == 0);
1758                 if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1759                         ipfw_state_unlink(ctx, &slave_x->xlat_st);
1760                 kfree(x, M_IPFW);
1761                 kfree(slave_x, M_IPFW);
1762                 return;
1763         }
1764
1765         /*
1766          * Free the paired states on the cpu owning the slave xlat.
1767          */
1768
1769         /* 
1770          * Mark the state pair invalid; completely deleting them
1771          * may take some time.
1772          */
1773         ipfw_xlat_invalidate(x);
1774
1775         nm = &x->xlat_freenm;
1776         netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1777             ipfw_xlat_free_dispatch);
1778         nm->lmsg.u.ms_resultp = x;
1779
1780         /* See the comment in ipfw_xlate_redispatch(). */
1781         x->xlat_rule->cross_refs++;
1782         x->xlat_crefs++;
1783
1784         netisr_sendmsg(nm, x->xlat_pcpu);
1785 }
1786
1787 static void
1788 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1789 {
1790
1791         if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1792                 KKASSERT(IPFW_ISXLAT(s->st_type));
1793                 ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1794                 ipfw_state_unlink(ctx, s);
1795                 return;
1796         }
1797         ipfw_state_del(ctx, s);
1798 }
1799
1800 static int
1801 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1802 {
1803         struct ipfw_state *s, *anchor;
1804         int expired;
1805
1806         if (reap_max < ipfw_state_reap_min)
1807                 reap_max = ipfw_state_reap_min;
1808
1809         if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1810                 /*
1811                  * Kick start state expiring.  Ignore scan limit,
1812                  * we are short of states.
1813                  */
1814                 ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1815                 expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1816                 ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1817                 return (expired);
1818         }
1819
1820         /*
1821          * States are being expired.
1822          */
1823
1824         if (ctx->ipfw_state_cnt == 0)
1825                 return (0);
1826
1827         expired = 0;
1828         anchor = &ctx->ipfw_stateexp_anch;
1829         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1830                 /*
1831                  * Ignore scan limit; we are short of states.
1832                  */
1833
1834                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1835                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1836
1837                 if (IPFW_STATE_SCANSKIP(s))
1838                         continue;
1839
1840                 if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1841                         ipfw_state_del(ctx, s);
1842                         if (++expired >= reap_max)
1843                                 break;
1844                         if ((expired & 0xff) == 0 && 
1845                             ipfw_state_cntcoll() + ipfw_state_headroom <=
1846                             ipfw_state_max)
1847                                 break;
1848                 }
1849         }
1850         /*
1851          * NOTE:
1852          * Leave the anchor on the list, even if the end of the list has
1853          * been reached.  ipfw_state_expire_more_dispatch() will handle
1854          * the removal.
1855          */
1856         return (expired);
1857 }
1858
1859 static void
1860 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1861 {
1862         struct ipfw_state *s, *sn;
1863
1864         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1865                 if (IPFW_STATE_SCANSKIP(s))
1866                         continue;
1867                 if (rule != NULL && s->st_rule != rule)
1868                         continue;
1869                 ipfw_state_del(ctx, s);
1870         }
1871 }
1872
1873 static void
1874 ipfw_state_expire_done(struct ipfw_context *ctx)
1875 {
1876
1877         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1878             ("stateexp is not in progress"));
1879         ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1880         callout_reset(&ctx->ipfw_stateto_ch, hz,
1881             ipfw_state_expire_ipifunc, NULL);
1882 }
1883
1884 static void
1885 ipfw_state_expire_more(struct ipfw_context *ctx)
1886 {
1887         struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1888
1889         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1890             ("stateexp is not in progress"));
1891         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1892             ("stateexp more did not finish"));
1893         netisr_sendmsg_oncpu(nm);
1894 }
1895
1896 static int
1897 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1898     int scan_max, int expire_max)
1899 {
1900         struct ipfw_state *s;
1901         int scanned = 0, expired = 0;
1902
1903         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1904             ("stateexp is not in progress"));
1905
1906         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1907                 if (scanned++ >= scan_max) {
1908                         ipfw_state_expire_more(ctx);
1909                         return (expired);
1910                 }
1911
1912                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1913                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1914
1915                 if (IPFW_STATE_SCANSKIP(s))
1916                         continue;
1917
1918                 if (IPFW_STATE_ISDEAD(s) ||
1919                     ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1920                      IPFW_STATE_TCPCLOSED(s))) {
1921                         ipfw_state_del(ctx, s);
1922                         if (++expired >= expire_max) {
1923                                 ipfw_state_expire_more(ctx);
1924                                 return (expired);
1925                         }
1926                         if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1927                             (expired & 0xff) == 0 &&
1928                             ipfw_state_cntcoll() + ipfw_state_headroom <=
1929                             ipfw_state_max) {
1930                                 ipfw_state_expire_more(ctx);
1931                                 return (expired);
1932                         }
1933                 }
1934         }
1935         TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1936         ipfw_state_expire_done(ctx);
1937         return (expired);
1938 }
1939
1940 static void
1941 ipfw_state_expire_more_dispatch(netmsg_t nm)
1942 {
1943         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1944         struct ipfw_state *anchor;
1945
1946         ASSERT_NETISR_NCPUS(mycpuid);
1947         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1948             ("statexp is not in progress"));
1949
1950         /* Reply ASAP */
1951         netisr_replymsg(&nm->base, 0);
1952
1953         anchor = &ctx->ipfw_stateexp_anch;
1954         if (ctx->ipfw_state_cnt == 0) {
1955                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1956                 ipfw_state_expire_done(ctx);
1957                 return;
1958         }
1959         ipfw_state_expire_loop(ctx, anchor,
1960             ipfw_state_scan_max, ipfw_state_expire_max);
1961 }
1962
1963 static int
1964 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1965 {
1966         struct ipfw_state *anchor;
1967
1968         KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1969             ("stateexp is in progress"));
1970         ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1971
1972         if (ctx->ipfw_state_cnt == 0) {
1973                 ipfw_state_expire_done(ctx);
1974                 return (0);
1975         }
1976
1977         /*
1978          * Do not expire more than once per second, it is useless.
1979          */
1980         if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1981             ctx->ipfw_state_lastexp == time_uptime) {
1982                 ipfw_state_expire_done(ctx);
1983                 return (0);
1984         }
1985         ctx->ipfw_state_lastexp = time_uptime;
1986
1987         anchor = &ctx->ipfw_stateexp_anch;
1988         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1989         return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1990 }
1991
1992 static void
1993 ipfw_state_expire_dispatch(netmsg_t nm)
1994 {
1995         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1996
1997         ASSERT_NETISR_NCPUS(mycpuid);
1998
1999         /* Reply ASAP */
2000         crit_enter();
2001         netisr_replymsg(&nm->base, 0);
2002         crit_exit();
2003
2004         if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2005                 /* Running; done. */
2006                 return;
2007         }
2008         ipfw_state_expire_start(ctx,
2009             ipfw_state_scan_max, ipfw_state_expire_max);
2010 }
2011
2012 static void
2013 ipfw_state_expire_ipifunc(void *dummy __unused)
2014 {
2015         struct netmsg_base *msg;
2016
2017         KKASSERT(mycpuid < netisr_ncpus);
2018         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2019
2020         crit_enter();
2021         if (msg->lmsg.ms_flags & MSGF_DONE)
2022                 netisr_sendmsg_oncpu(msg);
2023         crit_exit();
2024 }
2025
2026 static boolean_t
2027 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2028 {
2029         uint32_t seq = ntohl(tcp->th_seq);
2030         uint32_t ack = ntohl(tcp->th_ack);
2031
2032         if (tcp->th_flags & TH_RST)
2033                 return (TRUE);
2034
2035         if (dir == MATCH_FORWARD) {
2036                 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2037                         s->st_flags |= IPFW_STATE_F_SEQFWD;
2038                         s->st_seq_fwd = seq;
2039                 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2040                         s->st_seq_fwd = seq;
2041                 } else {
2042                         /* Out-of-sequence; done. */
2043                         return (FALSE);
2044                 }
2045                 if (tcp->th_flags & TH_ACK) {
2046                         if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2047                                 s->st_flags |= IPFW_STATE_F_ACKFWD;
2048                                 s->st_ack_fwd = ack;
2049                         } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2050                                 s->st_ack_fwd = ack;
2051                         } else {
2052                                 /* Out-of-sequence; done. */
2053                                 return (FALSE);
2054                         }
2055
2056                         if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2057                             (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2058                                 s->st_state |= (TH_ACK << 8);
2059                 }
2060         } else {
2061                 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2062                         s->st_flags |= IPFW_STATE_F_SEQREV;
2063                         s->st_seq_rev = seq;
2064                 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2065                         s->st_seq_rev = seq;
2066                 } else {
2067                         /* Out-of-sequence; done. */
2068                         return (FALSE);
2069                 }
2070                 if (tcp->th_flags & TH_ACK) {
2071                         if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2072                                 s->st_flags |= IPFW_STATE_F_ACKREV;
2073                                 s->st_ack_rev= ack;
2074                         } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2075                                 s->st_ack_rev = ack;
2076                         } else {
2077                                 /* Out-of-sequence; done. */
2078                                 return (FALSE);
2079                         }
2080
2081                         if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2082                             s->st_ack_rev == s->st_seq_fwd + 1)
2083                                 s->st_state |= TH_ACK;
2084                 }
2085         }
2086         return (TRUE);
2087 }
2088
2089 static void
2090 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2091     const struct tcphdr *tcp, struct ipfw_state *s)
2092 {
2093
2094         if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2095                 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2096
2097                 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2098                         return;
2099
2100                 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2101                 switch (s->st_state & IPFW_STATE_TCPSTATES) {
2102                 case TH_SYN:                            /* opening */
2103                         s->st_expire = time_uptime + dyn_syn_lifetime;
2104                         break;
2105
2106                 case BOTH_SYN:                  /* move to established */
2107                 case BOTH_SYN | TH_FIN:         /* one side tries to close */
2108                 case BOTH_SYN | (TH_FIN << 8):
2109                         s->st_expire = time_uptime + dyn_ack_lifetime;
2110                         break;
2111
2112                 case BOTH_SYN | BOTH_FIN:       /* both sides closed */
2113                         if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2114                                 /* And both FINs were ACKed. */
2115                                 s->st_expire = time_uptime + dyn_fin_lifetime;
2116                         } else {
2117                                 s->st_expire = time_uptime +
2118                                     dyn_finwait_lifetime;
2119                         }
2120                         break;
2121
2122                 default:
2123 #if 0
2124                         /*
2125                          * reset or some invalid combination, but can also
2126                          * occur if we use keep-state the wrong way.
2127                          */
2128                         if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2129                                 kprintf("invalid state: 0x%x\n", s->st_state);
2130 #endif
2131                         s->st_expire = time_uptime + dyn_rst_lifetime;
2132                         break;
2133                 }
2134         } else if (pkt->proto == IPPROTO_UDP) {
2135                 s->st_expire = time_uptime + dyn_udp_lifetime;
2136         } else {
2137                 /* other protocols */
2138                 s->st_expire = time_uptime + dyn_short_lifetime;
2139         }
2140 }
2141
2142 /*
2143  * Lookup a state.
2144  */
2145 static struct ipfw_state *
2146 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2147     int *match_direction, const struct tcphdr *tcp)
2148 {
2149         struct ipfw_state *key, *s;
2150         int dir = MATCH_NONE;
2151
2152         key = &ctx->ipfw_state_tmpkey;
2153         ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2154             pkt->dst_ip, pkt->dst_port, pkt->proto);
2155         s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2156         if (s == NULL)
2157                 goto done; /* not found. */
2158         if (IPFW_STATE_ISDEAD(s)) {
2159                 ipfw_state_remove(ctx, s);
2160                 s = NULL;
2161                 goto done;
2162         }
2163         if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2164                 /* TCP ports recycling is too fast. */
2165                 ctx->ipfw_sts_tcprecycled++;
2166                 ipfw_state_remove(ctx, s);
2167                 s = NULL;
2168                 goto done;
2169         }
2170
2171         if (s->st_swap == key->st_swap) {
2172                 dir = MATCH_FORWARD;
2173         } else {
2174                 KASSERT((s->st_swap & key->st_swap) == 0,
2175                     ("found mismatch state"));
2176                 dir = MATCH_REVERSE;
2177         }
2178
2179         /* Update this state. */
2180         ipfw_state_update(pkt, dir, tcp, s);
2181
2182         if (s->st_track != NULL) {
2183                 /* This track has been used. */
2184                 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2185         }
2186 done:
2187         if (match_direction)
2188                 *match_direction = dir;
2189         return (s);
2190 }
2191
2192 static struct ipfw_state *
2193 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2194     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2195 {
2196         struct ipfw_state *s;
2197         size_t sz;
2198
2199         KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2200             ("invalid state type %u", type));
2201
2202         sz = sizeof(struct ipfw_state);
2203         if (IPFW_ISXLAT(type))
2204                 sz = sizeof(struct ipfw_xlat);
2205
2206         s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2207         if (s == NULL) {
2208                 ctx->ipfw_sts_nomem++;
2209                 return (NULL);
2210         }
2211
2212         ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2213             id->dst_ip, id->dst_port, id->proto);
2214
2215         s->st_rule = rule;
2216         s->st_type = type;
2217         if (IPFW_ISXLAT(type)) {
2218                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2219
2220                 x->xlat_dir = MATCH_NONE;
2221                 x->xlat_pcpu = -1;
2222         }
2223
2224         /*
2225          * Update this state:
2226          * Set st_expire and st_state.
2227          */
2228         ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2229
2230         return (s);
2231 }
2232
2233 static struct ipfw_state *
2234 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2235     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2236     const struct tcphdr *tcp)
2237 {
2238         struct ipfw_state *s, *dup;
2239
2240         s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2241         if (s == NULL)
2242                 return (NULL);
2243
2244         ctx->ipfw_state_cnt++;
2245         ctx->ipfw_state_loosecnt++;
2246         if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2247                 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2248                 ctx->ipfw_state_loosecnt = 0;
2249         }
2250
2251         dup = ipfw_state_link(ctx, s);
2252         if (dup != NULL)
2253                 panic("ipfw: %u state exists %p", type, dup);
2254
2255         if (t != NULL) {
2256                 /* Keep the track referenced. */
2257                 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2258                 s->st_track = t;
2259         }
2260         return (s);
2261 }
2262
2263 static boolean_t
2264 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2265 {
2266         struct ipfw_trkcnt *trk;
2267         boolean_t trk_freed = FALSE;
2268
2269         KASSERT(t->t_count != NULL, ("track anchor"));
2270         KASSERT(LIST_EMPTY(&t->t_state_list),
2271             ("invalid track is still referenced"));
2272
2273         trk = t->t_trkcnt;
2274         KASSERT(trk != NULL, ("track has no trkcnt"));
2275
2276         RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2277         TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2278         kfree(t, M_IPFW);
2279
2280         /*
2281          * fdrop() style reference counting.
2282          * See kern/kern_descrip.c fdrop().
2283          */
2284         for (;;) {
2285                 int refs = trk->tc_refs;
2286
2287                 cpu_ccfence();
2288                 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2289                 if (refs == 1) {
2290                         IPFW_TRKCNT_TOKGET;
2291                         if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2292                                 KASSERT(trk->tc_count == 0,
2293                                     ("%d states reference this trkcnt",
2294                                      trk->tc_count));
2295                                 RB_REMOVE(ipfw_trkcnt_tree,
2296                                     &ipfw_gd.ipfw_trkcnt_tree, trk);
2297
2298                                 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2299                                     ("invalid trkcnt cnt %d",
2300                                      ipfw_gd.ipfw_trkcnt_cnt));
2301                                 ipfw_gd.ipfw_trkcnt_cnt--;
2302                                 IPFW_TRKCNT_TOKREL;
2303
2304                                 if (ctx->ipfw_trkcnt_spare == NULL)
2305                                         ctx->ipfw_trkcnt_spare = trk;
2306                                 else
2307                                         kfree(trk, M_IPFW);
2308                                 trk_freed = TRUE;
2309                                 break; /* done! */
2310                         }
2311                         IPFW_TRKCNT_TOKREL;
2312                         /* retry */
2313                 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2314                         break; /* done! */
2315                 }
2316                 /* retry */
2317         }
2318         return (trk_freed);
2319 }
2320
2321 static void
2322 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2323 {
2324         struct ipfw_track *t, *tn;
2325
2326         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2327                 if (t->t_count == NULL) /* anchor */
2328                         continue;
2329                 if (rule != NULL && t->t_rule != rule)
2330                         continue;
2331                 ipfw_track_free(ctx, t);
2332         }
2333 }
2334
2335 static boolean_t
2336 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2337     boolean_t reap)
2338 {
2339         struct ipfw_state *s, *sn;
2340         boolean_t ret = FALSE;
2341
2342         KASSERT(t->t_count != NULL, ("track anchor"));
2343
2344         if (LIST_EMPTY(&t->t_state_list))
2345                 return (FALSE);
2346
2347         /*
2348          * Do not expire more than once per second, it is useless.
2349          */
2350         if (t->t_lastexp == time_uptime)
2351                 return (FALSE);
2352         t->t_lastexp = time_uptime;
2353
2354         LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2355                 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2356                         KASSERT(s->st_track == t,
2357                             ("state track %p does not match %p",
2358                              s->st_track, t));
2359                         ipfw_state_del(ctx, s);
2360                         ret = TRUE;
2361                 }
2362         }
2363         return (ret);
2364 }
2365
2366 static __inline struct ipfw_trkcnt *
2367 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2368 {
2369         struct ipfw_trkcnt *trk;
2370
2371         if (ctx->ipfw_trkcnt_spare != NULL) {
2372                 trk = ctx->ipfw_trkcnt_spare;
2373                 ctx->ipfw_trkcnt_spare = NULL;
2374         } else {
2375                 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2376                     M_INTWAIT | M_NULLOK);
2377         }
2378         return (trk);
2379 }
2380
2381 static void
2382 ipfw_track_expire_done(struct ipfw_context *ctx)
2383 {
2384
2385         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2386             ("trackexp is not in progress"));
2387         ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2388         callout_reset(&ctx->ipfw_trackto_ch, hz,
2389             ipfw_track_expire_ipifunc, NULL);
2390 }
2391
2392 static void
2393 ipfw_track_expire_more(struct ipfw_context *ctx)
2394 {
2395         struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2396
2397         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2398             ("trackexp is not in progress"));
2399         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2400             ("trackexp more did not finish"));
2401         netisr_sendmsg_oncpu(nm);
2402 }
2403
2404 static int
2405 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2406     int scan_max, int expire_max)
2407 {
2408         struct ipfw_track *t;
2409         int scanned = 0, expired = 0;
2410         boolean_t reap = FALSE;
2411
2412         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2413             ("trackexp is not in progress"));
2414
2415         if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2416                 reap = TRUE;
2417
2418         while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2419                 if (scanned++ >= scan_max) {
2420                         ipfw_track_expire_more(ctx);
2421                         return (expired);
2422                 }
2423
2424                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2425                 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2426
2427                 if (t->t_count == NULL) /* anchor */
2428                         continue;
2429
2430                 ipfw_track_state_expire(ctx, t, reap);
2431                 if (!LIST_EMPTY(&t->t_state_list)) {
2432                         /* There are states referencing this track. */
2433                         continue;
2434                 }
2435
2436                 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2437                         /* Expired. */
2438                         if (ipfw_track_free(ctx, t)) {
2439                                 if (++expired >= expire_max) {
2440                                         ipfw_track_expire_more(ctx);
2441                                         return (expired);
2442                                 }
2443                         }
2444                 }
2445         }
2446         TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2447         ipfw_track_expire_done(ctx);
2448         return (expired);
2449 }
2450
2451 static int
2452 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2453 {
2454         struct ipfw_track *anchor;
2455
2456         KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2457             ("trackexp is in progress"));
2458         ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2459
2460         if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2461                 ipfw_track_expire_done(ctx);
2462                 return (0);
2463         }
2464
2465         /*
2466          * Do not expire more than once per second, it is useless.
2467          */
2468         if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2469             ctx->ipfw_track_lastexp == time_uptime) {
2470                 ipfw_track_expire_done(ctx);
2471                 return (0);
2472         }
2473         ctx->ipfw_track_lastexp = time_uptime;
2474
2475         anchor = &ctx->ipfw_trackexp_anch;
2476         TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2477         return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2478 }
2479
2480 static void
2481 ipfw_track_expire_more_dispatch(netmsg_t nm)
2482 {
2483         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2484         struct ipfw_track *anchor;
2485
2486         ASSERT_NETISR_NCPUS(mycpuid);
2487         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2488             ("trackexp is not in progress"));
2489
2490         /* Reply ASAP */
2491         netisr_replymsg(&nm->base, 0);
2492
2493         anchor = &ctx->ipfw_trackexp_anch;
2494         if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2495                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2496                 ipfw_track_expire_done(ctx);
2497                 return;
2498         }
2499         ipfw_track_expire_loop(ctx, anchor,
2500             ipfw_track_scan_max, ipfw_track_expire_max);
2501 }
2502
2503 static void
2504 ipfw_track_expire_dispatch(netmsg_t nm)
2505 {
2506         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2507
2508         ASSERT_NETISR_NCPUS(mycpuid);
2509
2510         /* Reply ASAP */
2511         crit_enter();
2512         netisr_replymsg(&nm->base, 0);
2513         crit_exit();
2514
2515         if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2516                 /* Running; done. */
2517                 return;
2518         }
2519         ipfw_track_expire_start(ctx,
2520             ipfw_track_scan_max, ipfw_track_expire_max);
2521 }
2522
2523 static void
2524 ipfw_track_expire_ipifunc(void *dummy __unused)
2525 {
2526         struct netmsg_base *msg;
2527
2528         KKASSERT(mycpuid < netisr_ncpus);
2529         msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2530
2531         crit_enter();
2532         if (msg->lmsg.ms_flags & MSGF_DONE)
2533                 netisr_sendmsg_oncpu(msg);
2534         crit_exit();
2535 }
2536
2537 static int
2538 ipfw_track_reap(struct ipfw_context *ctx)
2539 {
2540         struct ipfw_track *t, *anchor;
2541         int expired;
2542
2543         if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2544                 /*
2545                  * Kick start track expiring.  Ignore scan limit,
2546                  * we are short of tracks.
2547                  */
2548                 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2549                 expired = ipfw_track_expire_start(ctx, INT_MAX,
2550                     ipfw_track_reap_max);
2551                 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2552                 return (expired);
2553         }
2554
2555         /*
2556          * Tracks are being expired.
2557          */
2558
2559         if (RB_EMPTY(&ctx->ipfw_track_tree))
2560                 return (0);
2561
2562         expired = 0;
2563         anchor = &ctx->ipfw_trackexp_anch;
2564         while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2565                 /*
2566                  * Ignore scan limit; we are short of tracks.
2567                  */
2568
2569                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2570                 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2571
2572                 if (t->t_count == NULL) /* anchor */
2573                         continue;
2574
2575                 ipfw_track_state_expire(ctx, t, TRUE);
2576                 if (!LIST_EMPTY(&t->t_state_list)) {
2577                         /* There are states referencing this track. */
2578                         continue;
2579                 }
2580
2581                 if (ipfw_track_free(ctx, t)) {
2582                         if (++expired >= ipfw_track_reap_max) {
2583                                 ipfw_track_expire_more(ctx);
2584                                 break;
2585                         }
2586                 }
2587         }
2588         /*
2589          * NOTE:
2590          * Leave the anchor on the list, even if the end of the list has
2591          * been reached.  ipfw_track_expire_more_dispatch() will handle
2592          * the removal.
2593          */
2594         return (expired);
2595 }
2596
2597 static struct ipfw_track *
2598 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2599     uint16_t limit_mask, struct ip_fw *rule)
2600 {
2601         struct ipfw_track *key, *t, *dup;
2602         struct ipfw_trkcnt *trk, *ret;
2603         boolean_t do_expire = FALSE;
2604
2605         KASSERT(rule->track_ruleid != 0,
2606             ("rule %u has no track ruleid", rule->rulenum));
2607
2608         key = &ctx->ipfw_track_tmpkey;
2609         key->t_proto = id->proto;
2610         key->t_addrs = 0;
2611         key->t_ports = 0;
2612         key->t_rule = rule;
2613         if (limit_mask & DYN_SRC_ADDR)
2614                 key->t_saddr = id->src_ip;
2615         if (limit_mask & DYN_DST_ADDR)
2616                 key->t_daddr = id->dst_ip;
2617         if (limit_mask & DYN_SRC_PORT)
2618                 key->t_sport = id->src_port;
2619         if (limit_mask & DYN_DST_PORT)
2620                 key->t_dport = id->dst_port;
2621
2622         t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2623         if (t != NULL)
2624                 goto done;
2625
2626         t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2627         if (t == NULL) {
2628                 ctx->ipfw_tks_nomem++;
2629                 return (NULL);
2630         }
2631
2632         t->t_key = key->t_key;
2633         t->t_rule = rule;
2634         t->t_lastexp = 0;
2635         LIST_INIT(&t->t_state_list);
2636
2637         if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2638                 time_t globexp, uptime;
2639
2640                 trk = NULL;
2641                 do_expire = TRUE;
2642
2643                 /*
2644                  * Do not expire globally more than once per second,
2645                  * it is useless.
2646                  */
2647                 uptime = time_uptime;
2648                 globexp = ipfw_gd.ipfw_track_globexp;
2649                 if (globexp != uptime &&
2650                     atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2651                     globexp, uptime)) {
2652                         int cpu;
2653
2654                         /* Expire tracks on other CPUs. */
2655                         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2656                                 if (cpu == mycpuid)
2657                                         continue;
2658                                 lwkt_send_ipiq(globaldata_find(cpu),
2659                                     ipfw_track_expire_ipifunc, NULL);
2660                         }
2661                 }
2662         } else {
2663                 trk = ipfw_trkcnt_alloc(ctx);
2664         }
2665         if (trk == NULL) {
2666                 struct ipfw_trkcnt *tkey;
2667
2668                 tkey = &ctx->ipfw_trkcnt_tmpkey;
2669                 key = NULL; /* tkey overlaps key */
2670
2671                 tkey->tc_key = t->t_key;
2672                 tkey->tc_ruleid = rule->track_ruleid;
2673
2674                 IPFW_TRKCNT_TOKGET;
2675                 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2676                     tkey);
2677                 if (trk == NULL) {
2678                         IPFW_TRKCNT_TOKREL;
2679                         if (do_expire) {
2680                                 ctx->ipfw_tks_reap++;
2681                                 if (ipfw_track_reap(ctx) > 0) {
2682                                         if (ipfw_gd.ipfw_trkcnt_cnt <
2683                                             ipfw_track_max) {
2684                                                 trk = ipfw_trkcnt_alloc(ctx);
2685                                                 if (trk != NULL)
2686                                                         goto install;
2687                                                 ctx->ipfw_tks_cntnomem++;
2688                                         } else {
2689                                                 ctx->ipfw_tks_overflow++;
2690                                         }
2691                                 } else {
2692                                         ctx->ipfw_tks_reapfailed++;
2693                                         ctx->ipfw_tks_overflow++;
2694                                 }
2695                         } else {
2696                                 ctx->ipfw_tks_cntnomem++;
2697                         }
2698                         kfree(t, M_IPFW);
2699                         return (NULL);
2700                 }
2701                 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2702                     ("invalid trkcnt refs %d", trk->tc_refs));
2703                 atomic_add_int(&trk->tc_refs, 1);
2704                 IPFW_TRKCNT_TOKREL;
2705         } else {
2706 install:
2707                 trk->tc_key = t->t_key;
2708                 trk->tc_ruleid = rule->track_ruleid;
2709                 trk->tc_refs = 0;
2710                 trk->tc_count = 0;
2711                 trk->tc_expire = 0;
2712                 trk->tc_rulenum = rule->rulenum;
2713
2714                 IPFW_TRKCNT_TOKGET;
2715                 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2716                     trk);
2717                 if (ret != NULL) {
2718                         KASSERT(ret->tc_refs > 0 &&
2719                             ret->tc_refs < netisr_ncpus,
2720                             ("invalid trkcnt refs %d", ret->tc_refs));
2721                         KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2722                             ("trkcnt spare was installed"));
2723                         ctx->ipfw_trkcnt_spare = trk;
2724                         trk = ret;
2725                 } else {
2726                         ipfw_gd.ipfw_trkcnt_cnt++;
2727                 }
2728                 atomic_add_int(&trk->tc_refs, 1);
2729                 IPFW_TRKCNT_TOKREL;
2730         }
2731         t->t_count = &trk->tc_count;
2732         t->t_trkcnt = trk;
2733
2734         dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2735         if (dup != NULL)
2736                 panic("ipfw: track exists");
2737         TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2738 done:
2739         t->t_expire = time_uptime + dyn_short_lifetime;
2740         return (t);
2741 }
2742
2743 /*
2744  * Install state for rule type cmd->o.opcode
2745  *
2746  * Returns NULL if state is not installed because of errors or because
2747  * states limitations are enforced.
2748  */
2749 static struct ipfw_state *
2750 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2751     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2752 {
2753         struct ipfw_state *s;
2754         struct ipfw_track *t;
2755         int count, diff;
2756
2757         if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2758             (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2759                 boolean_t overflow = TRUE;
2760
2761                 ctx->ipfw_sts_reap++;
2762                 if (ipfw_state_reap(ctx, diff) == 0)
2763                         ctx->ipfw_sts_reapfailed++;
2764                 if (ipfw_state_cntsync() < ipfw_state_max)
2765                         overflow = FALSE;
2766
2767                 if (overflow) {
2768                         time_t globexp, uptime;
2769                         int cpu;
2770
2771                         /*
2772                          * Do not expire globally more than once per second,
2773                          * it is useless.
2774                          */
2775                         uptime = time_uptime;
2776                         globexp = ipfw_gd.ipfw_state_globexp;
2777                         if (globexp == uptime ||
2778                             !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2779                             globexp, uptime)) {
2780                                 ctx->ipfw_sts_overflow++;
2781                                 return (NULL);
2782                         }
2783
2784                         /* Expire states on other CPUs. */
2785                         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2786                                 if (cpu == mycpuid)
2787                                         continue;
2788                                 lwkt_send_ipiq(globaldata_find(cpu),
2789                                     ipfw_state_expire_ipifunc, NULL);
2790                         }
2791                         ctx->ipfw_sts_overflow++;
2792                         return (NULL);
2793                 }
2794         }
2795
2796         switch (cmd->o.opcode) {
2797         case O_KEEP_STATE: /* bidir rule */
2798         case O_REDIRECT:
2799                 s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2800                     tcp);
2801                 if (s == NULL)
2802                         return (NULL);
2803                 break;
2804
2805         case O_LIMIT: /* limit number of sessions */
2806                 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2807                 if (t == NULL)
2808                         return (NULL);
2809
2810                 if (*t->t_count >= cmd->conn_limit) {
2811                         if (!ipfw_track_state_expire(ctx, t, TRUE))
2812                                 return (NULL);
2813                 }
2814                 for (;;) {
2815                         count = *t->t_count;
2816                         if (count >= cmd->conn_limit)
2817                                 return (NULL);
2818                         if (atomic_cmpset_int(t->t_count, count, count + 1))
2819                                 break;
2820                 }
2821
2822                 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2823                 if (s == NULL) {
2824                         /* Undo damage. */
2825                         atomic_subtract_int(t->t_count, 1);
2826                         return (NULL);
2827                 }
2828                 break;
2829
2830         default:
2831                 panic("unknown state type %u\n", cmd->o.opcode);
2832         }
2833
2834         if (s->st_type == O_REDIRECT) {
2835                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2836                 ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2837
2838                 x->xlat_addr = r->addr.s_addr;
2839                 x->xlat_port = r->port;
2840                 x->xlat_ifp = args->m->m_pkthdr.rcvif;
2841                 x->xlat_dir = MATCH_FORWARD;
2842                 KKASSERT(x->xlat_ifp != NULL);
2843         }
2844         return (s);
2845 }
2846
2847 static int
2848 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2849     const struct in_addr *in)
2850 {
2851         struct radix_node_head *rnh;
2852         struct sockaddr_in sin;
2853         struct ipfw_tblent *te;
2854
2855         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2856         rnh = ctx->ipfw_tables[tableid];
2857         if (rnh == NULL)
2858                 return (0); /* no match */
2859
2860         memset(&sin, 0, sizeof(sin));
2861         sin.sin_family = AF_INET;
2862         sin.sin_len = sizeof(sin);
2863         sin.sin_addr = *in;
2864
2865         te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2866         if (te == NULL)
2867                 return (0); /* no match */
2868
2869         te->te_use++;
2870         te->te_lastuse = time_second;
2871         return (1); /* match */
2872 }
2873
2874 /*
2875  * Transmit a TCP packet, containing either a RST or a keepalive.
2876  * When flags & TH_RST, we are sending a RST packet, because of a
2877  * "reset" action matched the packet.
2878  * Otherwise we are sending a keepalive, and flags & TH_
2879  *
2880  * Only {src,dst}_{ip,port} of "id" are used.
2881  */
2882 static void
2883 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2884 {
2885         struct mbuf *m;
2886         struct ip *ip;
2887         struct tcphdr *tcp;
2888         struct route sro;       /* fake route */
2889
2890         MGETHDR(m, M_NOWAIT, MT_HEADER);
2891         if (m == NULL)
2892                 return;
2893         m->m_pkthdr.rcvif = NULL;
2894         m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2895         m->m_data += max_linkhdr;
2896
2897         ip = mtod(m, struct ip *);
2898         bzero(ip, m->m_len);
2899         tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2900         ip->ip_p = IPPROTO_TCP;
2901         tcp->th_off = 5;
2902
2903         /*
2904          * Assume we are sending a RST (or a keepalive in the reverse
2905          * direction), swap src and destination addresses and ports.
2906          */
2907         ip->ip_src.s_addr = htonl(id->dst_ip);
2908         ip->ip_dst.s_addr = htonl(id->src_ip);
2909         tcp->th_sport = htons(id->dst_port);
2910         tcp->th_dport = htons(id->src_port);
2911         if (flags & TH_RST) {   /* we are sending a RST */
2912                 if (flags & TH_ACK) {
2913                         tcp->th_seq = htonl(ack);
2914                         tcp->th_ack = htonl(0);
2915                         tcp->th_flags = TH_RST;
2916                 } else {
2917                         if (flags & TH_SYN)
2918                                 seq++;
2919                         tcp->th_seq = htonl(0);
2920                         tcp->th_ack = htonl(seq);
2921                         tcp->th_flags = TH_RST | TH_ACK;
2922                 }
2923         } else {
2924                 /*
2925                  * We are sending a keepalive. flags & TH_SYN determines
2926                  * the direction, forward if set, reverse if clear.
2927                  * NOTE: seq and ack are always assumed to be correct
2928                  * as set by the caller. This may be confusing...
2929                  */
2930                 if (flags & TH_SYN) {
2931                         /*
2932                          * we have to rewrite the correct addresses!
2933                          */
2934                         ip->ip_dst.s_addr = htonl(id->dst_ip);
2935                         ip->ip_src.s_addr = htonl(id->src_ip);
2936                         tcp->th_dport = htons(id->dst_port);
2937                         tcp->th_sport = htons(id->src_port);
2938                 }
2939                 tcp->th_seq = htonl(seq);
2940                 tcp->th_ack = htonl(ack);
2941                 tcp->th_flags = TH_ACK;
2942         }
2943
2944         /*
2945          * set ip_len to the payload size so we can compute
2946          * the tcp checksum on the pseudoheader
2947          * XXX check this, could save a couple of words ?
2948          */
2949         ip->ip_len = htons(sizeof(struct tcphdr));
2950         tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2951
2952         /*
2953          * now fill fields left out earlier
2954          */
2955         ip->ip_ttl = ip_defttl;
2956         ip->ip_len = m->m_pkthdr.len;
2957
2958         bzero(&sro, sizeof(sro));
2959         ip_rtaddr(ip->ip_dst, &sro);
2960
2961         m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2962         ip_output(m, NULL, &sro, 0, NULL, NULL);
2963         if (sro.ro_rt)
2964                 RTFREE(sro.ro_rt);
2965 }
2966
2967 /*
2968  * Send a reject message, consuming the mbuf passed as an argument.
2969  */
2970 static void
2971 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2972 {
2973         if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2974                 /* We need the IP header in host order for icmp_error(). */
2975                 if (args->eh != NULL) {
2976                         struct ip *ip = mtod(args->m, struct ip *);
2977
2978                         ip->ip_len = ntohs(ip->ip_len);
2979                         ip->ip_off = ntohs(ip->ip_off);
2980                 }
2981                 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2982         } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2983                 struct tcphdr *const tcp =
2984                     L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2985
2986                 if ((tcp->th_flags & TH_RST) == 0) {
2987                         send_pkt(&args->f_id, ntohl(tcp->th_seq),
2988                                  ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2989                 }
2990                 m_freem(args->m);
2991         } else {
2992                 m_freem(args->m);
2993         }
2994         args->m = NULL;
2995 }
2996
2997 /*
2998  * Given an ip_fw *, lookup_next_rule will return a pointer
2999  * to the next rule, which can be either the jump
3000  * target (for skipto instructions) or the next one in the list (in
3001  * all other cases including a missing jump target).
3002  * The result is also written in the "next_rule" field of the rule.
3003  * Backward jumps are not allowed, so start looking from the next
3004  * rule...
3005  *
3006  * This never returns NULL -- in case we do not have an exact match,
3007  * the next rule is returned. When the ruleset is changed,
3008  * pointers are flushed so we are always correct.
3009  */
3010 static struct ip_fw *
3011 lookup_next_rule(struct ip_fw *me)
3012 {
3013         struct ip_fw *rule = NULL;
3014         ipfw_insn *cmd;
3015
3016         /* look for action, in case it is a skipto */
3017         cmd = ACTION_PTR(me);
3018         if (cmd->opcode == O_LOG)
3019                 cmd += F_LEN(cmd);
3020         if (cmd->opcode == O_SKIPTO) {
3021                 for (rule = me->next; rule; rule = rule->next) {
3022                         if (rule->rulenum >= cmd->arg1)
3023                                 break;
3024                 }
3025         }
3026         if (rule == NULL)                       /* failure or not a skipto */
3027                 rule = me->next;
3028         me->next_rule = rule;
3029         return rule;
3030 }
3031
3032 static int
3033 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3034                 enum ipfw_opcodes opcode, uid_t uid)
3035 {
3036         struct in_addr src_ip, dst_ip;
3037         struct inpcbinfo *pi;
3038         boolean_t wildcard;
3039         struct inpcb *pcb;
3040
3041         if (fid->proto == IPPROTO_TCP) {
3042                 wildcard = FALSE;
3043                 pi = &tcbinfo[mycpuid];
3044         } else if (fid->proto == IPPROTO_UDP) {
3045                 wildcard = TRUE;
3046                 pi = &udbinfo[mycpuid];
3047         } else {
3048                 return 0;
3049         }
3050
3051         /*
3052          * Values in 'fid' are in host byte order
3053          */
3054         dst_ip.s_addr = htonl(fid->dst_ip);
3055         src_ip.s_addr = htonl(fid->src_ip);
3056         if (oif) {
3057                 pcb = in_pcblookup_hash(pi,
3058                         dst_ip, htons(fid->dst_port),
3059                         src_ip, htons(fid->src_port),
3060                         wildcard, oif);
3061         } else {
3062                 pcb = in_pcblookup_hash(pi,
3063                         src_ip, htons(fid->src_port),
3064                         dst_ip, htons(fid->dst_port),
3065                         wildcard, NULL);
3066         }
3067         if (pcb == NULL || pcb->inp_socket == NULL)
3068                 return 0;
3069
3070         if (opcode == O_UID) {
3071 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3072                 return !socheckuid(pcb->inp_socket, uid);
3073 #undef socheckuid
3074         } else  {
3075                 return groupmember(uid, pcb->inp_socket->so_cred);
3076         }
3077 }
3078
3079 static int
3080 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3081 {
3082
3083         if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3084                 struct ifaddr_container *ifac;
3085                 struct ifnet *ifp;
3086
3087                 ifp = ifunit_netisr(cmd->ifname);
3088                 if (ifp == NULL)
3089                         return (0);
3090
3091                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3092                         struct ifaddr *ia = ifac->ifa;
3093
3094                         if (ia->ifa_addr == NULL)
3095                                 continue;
3096                         if (ia->ifa_addr->sa_family != AF_INET)
3097                                 continue;
3098
3099                         cmd->mask.s_addr = INADDR_ANY;
3100                         if (cmd->o.arg1 & IPFW_IFIP_NET) {
3101                                 cmd->mask = ((struct sockaddr_in *)
3102                                     ia->ifa_netmask)->sin_addr;
3103                         }
3104                         if (cmd->mask.s_addr == INADDR_ANY)
3105                                 cmd->mask.s_addr = INADDR_BROADCAST;
3106
3107                         cmd->addr =
3108                             ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3109                         cmd->addr.s_addr &= cmd->mask.s_addr;
3110
3111                         cmd->o.arg1 |= IPFW_IFIP_VALID;
3112                         break;
3113                 }
3114                 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3115                         return (0);
3116         }
3117         return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3118 }
3119
3120 static void
3121 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3122     struct in_addr *old_addr, uint16_t *old_port)
3123 {
3124         struct ip *ip = mtod(m, struct ip *);
3125         struct in_addr *addr;
3126         uint16_t *port, *csum, dlen = 0;
3127         uint8_t udp = 0;
3128         boolean_t pseudo = FALSE;
3129
3130         if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3131                 addr = &ip->ip_src;
3132                 switch (ip->ip_p) {
3133                 case IPPROTO_TCP:
3134                         port = &L3HDR(struct tcphdr, ip)->th_sport;
3135                         csum = &L3HDR(struct tcphdr, ip)->th_sum;
3136                         break;
3137                 case IPPROTO_UDP:
3138                         port = &L3HDR(struct udphdr, ip)->uh_sport;
3139                         csum = &L3HDR(struct udphdr, ip)->uh_sum;
3140                         udp = 1;
3141                         break;
3142                 default:
3143                         panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3144                 }
3145         } else {
3146                 addr = &ip->ip_dst;
3147                 switch (ip->ip_p) {
3148                 case IPPROTO_TCP:
3149                         port = &L3HDR(struct tcphdr, ip)->th_dport;
3150                         csum = &L3HDR(struct tcphdr, ip)->th_sum;
3151                         break;
3152                 case IPPROTO_UDP:
3153                         port = &L3HDR(struct udphdr, ip)->uh_dport;
3154                         csum = &L3HDR(struct udphdr, ip)->uh_sum;
3155                         udp = 1;
3156                         break;
3157                 default:
3158                         panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3159                 }
3160         }
3161         if (old_addr != NULL)
3162                 *old_addr = *addr;
3163         if (old_port != NULL) {
3164                 if (x->xlat_port != 0)
3165                         *old_port = *port;
3166                 else
3167                         *old_port = 0;
3168         }
3169
3170         if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3171                 if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3172                         dlen = ip->ip_len - (ip->ip_hl << 2);
3173                 pseudo = TRUE;
3174         }
3175
3176         if (!pseudo) {
3177                 const uint16_t *oaddr, *naddr;
3178
3179                 oaddr = (const uint16_t *)&addr->s_addr;
3180                 naddr = (const uint16_t *)&x->xlat_addr;
3181
3182                 ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3183                     oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3184                 *csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3185                     oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3186         }
3187         addr->s_addr = x->xlat_addr;
3188
3189         if (x->xlat_port != 0) {
3190                 if (!pseudo) {
3191                         *csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3192                             udp);
3193                 }
3194                 *port = x->xlat_port;
3195         }
3196
3197         if (pseudo) {
3198                 *csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3199                     htons(dlen + ip->ip_p));
3200         }
3201 }
3202
3203 static void
3204 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3205 {
3206         struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3207         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3208         struct mbuf *m = nm->m;
3209         struct ipfw_xlat *x = nm->arg1;
3210         struct ip_fw *rule = x->xlat_rule;
3211
3212         ASSERT_NETISR_NCPUS(mycpuid);
3213         KASSERT(rule->cpuid == mycpuid,
3214             ("rule does not belong to cpu%d", mycpuid));
3215         KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3216             ("mbuf does not have ipfw continue rule"));
3217
3218         KASSERT(ctx->ipfw_cont_rule == NULL,
3219             ("pending ipfw continue rule"));
3220         KASSERT(ctx->ipfw_cont_xlat == NULL,
3221             ("pending ipfw continue xlat"));
3222         ctx->ipfw_cont_rule = rule;
3223         ctx->ipfw_cont_xlat = x;
3224
3225         if (nm->arg2 == 0)
3226                 ip_input(m);
3227         else
3228                 ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3229
3230         /* May not be cleared, if ipfw was unload/disabled. */
3231         ctx->ipfw_cont_rule = NULL;
3232         ctx->ipfw_cont_xlat = NULL;
3233
3234         /*
3235          * This state is no longer used; decrement its xlat_crefs,
3236          * so this state can be deleted.
3237          */
3238         x->xlat_crefs--;
3239         /*
3240          * This rule is no longer used; decrement its cross_refs,
3241          * so this rule can be deleted.
3242          *
3243          * NOTE:
3244          * Decrement cross_refs in the last step of this function,
3245          * so that the module could be unloaded safely.
3246          */
3247         rule->cross_refs--;
3248 }
3249
3250 static void
3251 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3252     uint32_t flags)
3253 {
3254         struct netmsg_genpkt *nm;
3255
3256         KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3257             x->xlat_pcpu, cpuid));
3258
3259         /*
3260          * Bump cross_refs to prevent this rule and its siblings
3261          * from being deleted, while this mbuf is inflight.  The
3262          * cross_refs of the sibling rule on the target cpu will
3263          * be decremented, once this mbuf is going to be filtered
3264          * on the target cpu.
3265          */
3266         x->xlat_rule->cross_refs++;
3267         /*
3268          * Bump xlat_crefs to prevent this state and its paired
3269          * state from being deleted, while this mbuf is inflight.
3270          * The xlat_crefs of the paired state on the target cpu
3271          * will be decremented, once this mbuf is going to be
3272          * filtered on the target cpu.
3273          */
3274         x->xlat_crefs++;
3275
3276         m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3277         if (flags & IPFW_XLATE_INSERT)
3278                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3279         if (flags & IPFW_XLATE_FORWARD)
3280                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3281
3282         if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3283                 struct ip *ip = mtod(m, struct ip *);
3284
3285                 /*
3286                  * NOTE:
3287                  * ip_input() expects ip_len/ip_off are in network
3288                  * byte order.
3289                  */
3290                 ip->ip_len = htons(ip->ip_len);
3291                 ip->ip_off = htons(ip->ip_off);
3292         }
3293
3294         nm = &m->m_hdr.mh_genmsg;
3295         netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3296             ipfw_ip_xlate_dispatch);
3297         nm->m = m;
3298         nm->arg1 = x->xlat_pair;
3299         nm->arg2 = 0;
3300         if (flags & IPFW_XLATE_OUTPUT)
3301                 nm->arg2 = 1;
3302         netisr_sendmsg(&nm->base, cpuid);
3303 }
3304
3305 static struct mbuf *
3306 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3307     struct ip_fw_local *local, struct ip **ip0)
3308 {
3309         struct ip *ip = mtod(m, struct ip *);
3310         struct tcphdr *tcp;
3311         struct udphdr *udp;
3312
3313         /*
3314          * Collect parameters into local variables for faster matching.
3315          */
3316         if (hlen == 0) {        /* do not grab addresses for non-ip pkts */
3317                 local->proto = args->f_id.proto = 0;    /* mark f_id invalid */
3318                 goto done;
3319         }
3320
3321         local->proto = args->f_id.proto = ip->ip_p;
3322         local->src_ip = ip->ip_src;
3323         local->dst_ip = ip->ip_dst;
3324         if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3325                 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3326                 local->ip_len = ntohs(ip->ip_len);
3327         } else {
3328                 local->offset = ip->ip_off & IP_OFFMASK;
3329                 local->ip_len = ip->ip_len;
3330         }
3331
3332 #define PULLUP_TO(len)                                  \
3333 do {                                                    \
3334         if (m->m_len < (len)) {                         \
3335                 args->m = m = m_pullup(m, (len));       \
3336                 if (m == NULL) {                        \
3337                         ip = NULL;                      \
3338                         goto done;                      \
3339                 }                                       \
3340                 ip = mtod(m, struct ip *);              \
3341         }                                               \
3342 } while (0)
3343
3344         if (local->offset == 0) {
3345                 switch (local->proto) {
3346                 case IPPROTO_TCP:
3347                         PULLUP_TO(hlen + sizeof(struct tcphdr));
3348                         local->tcp = tcp = L3HDR(struct tcphdr, ip);
3349                         local->dst_port = tcp->th_dport;
3350                         local->src_port = tcp->th_sport;
3351                         args->f_id.flags = tcp->th_flags;
3352                         break;
3353
3354                 case IPPROTO_UDP:
3355                         PULLUP_TO(hlen + sizeof(struct udphdr));
3356                         udp = L3HDR(struct udphdr, ip);
3357                         local->dst_port = udp->uh_dport;
3358                         local->src_port = udp->uh_sport;
3359                         break;
3360
3361                 case IPPROTO_ICMP:
3362                         PULLUP_TO(hlen + 4);    /* type, code and checksum. */
3363                         args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3364                         break;
3365
3366                 default:
3367                         break;
3368                 }
3369         }
3370
3371 #undef PULLUP_TO
3372
3373         args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3374         args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3375         args->f_id.src_port = local->src_port = ntohs(local->src_port);
3376         args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3377 done:
3378         *ip0 = ip;
3379         return (m);
3380 }
3381
3382 static struct mbuf *
3383 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3384     struct ip_fw_local *local, struct ip **ip0)
3385 {
3386         struct ip *ip = mtod(m, struct ip *);
3387
3388         ip->ip_len = htons(ip->ip_len);
3389         ip->ip_off = htons(ip->ip_off);
3390
3391         m->m_flags &= ~M_HASH;
3392         ip_hashfn(&m, 0);
3393         args->m = m;
3394         if (m == NULL) {
3395                 *ip0 = NULL;
3396                 return (NULL);
3397         }
3398         KASSERT(m->m_flags & M_HASH, ("no hash"));
3399
3400         /* 'm' might be changed by ip_hashfn(). */
3401         ip = mtod(m, struct ip *);
3402         ip->ip_len = ntohs(ip->ip_len);
3403         ip->ip_off = ntohs(ip->ip_off);
3404
3405         return (ipfw_setup_local(m, hlen, args, local, ip0));
3406 }
3407
3408 /*
3409  * The main check routine for the firewall.
3410  *
3411  * All arguments are in args so we can modify them and return them
3412  * back to the caller.
3413  *
3414  * Parameters:
3415  *
3416  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
3417  *              Starts with the IP header.
3418  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
3419  *      args->oif       Outgoing interface, or NULL if packet is incoming.
3420  *              The incoming interface is in the mbuf. (in)
3421  *
3422  *      args->rule      Pointer to the last matching rule (in/out)
3423  *      args->f_id      Addresses grabbed from the packet (out)
3424  *
3425  * Return value:
3426  *
3427  *      If the packet was denied/rejected and has been dropped, args->m is
3428  *      NULL upon return.
3429  *
3430  *      IP_FW_DENY      the packet must be dropped.
3431  *      IP_FW_PASS      The packet is to be accepted and routed normally.
3432  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
3433  *      IP_FW_TEE       Tee the packet to port (args->cookie)
3434  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
3435  *      IP_FW_CONTINUE  Continue processing on another cpu.
3436  */
3437 static int
3438 ipfw_chk(struct ip_fw_args *args)
3439 {
3440         /*
3441          * Local variables hold state during the processing of a packet.
3442          *
3443          * IMPORTANT NOTE: to speed up the processing of rules, there
3444          * are some assumption on the values of the variables, which
3445          * are documented here. Should you change them, please check
3446          * the implementation of the various instructions to make sure
3447          * that they still work.
3448          *
3449          * args->eh     The MAC header. It is non-null for a layer2
3450          *      packet, it is NULL for a layer-3 packet.
3451          *
3452          * m | args->m  Pointer to the mbuf, as received from the caller.
3453          *      It may change if ipfw_chk() does an m_pullup, or if it
3454          *      consumes the packet because it calls send_reject().
3455          *      XXX This has to change, so that ipfw_chk() never modifies
3456          *      or consumes the buffer.
3457          * ip   is simply an alias of the value of m, and it is kept
3458          *      in sync with it (the packet is  supposed to start with
3459          *      the ip header).
3460          */
3461         struct mbuf *m = args->m;
3462         struct ip *ip = mtod(m, struct ip *);
3463
3464         /*
3465          * oif | args->oif      If NULL, ipfw_chk has been called on the
3466          *      inbound path (ether_input, ip_input).
3467          *      If non-NULL, ipfw_chk has been called on the outbound path
3468          *      (ether_output, ip_output).
3469          */
3470         struct ifnet *oif = args->oif;
3471
3472         struct ip_fw *f = NULL;         /* matching rule */
3473         int retval = IP_FW_PASS;
3474         struct m_tag *mtag;
3475         struct divert_info *divinfo;
3476         struct ipfw_state *s;
3477
3478         /*
3479          * hlen The length of the IPv4 header.
3480          *      hlen >0 means we have an IPv4 packet.
3481          */
3482         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
3483
3484         struct ip_fw_local lc;
3485
3486         /*
3487          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3488          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
3489          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3490          */
3491         int dyn_dir = MATCH_UNKNOWN;
3492         struct ip_fw *dyn_f = NULL;
3493         int cpuid = mycpuid;
3494         struct ipfw_context *ctx;
3495
3496         ASSERT_NETISR_NCPUS(cpuid);
3497         ctx = ipfw_ctx[cpuid];
3498
3499         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3500                 return IP_FW_PASS;      /* accept */
3501
3502         if (args->eh == NULL ||         /* layer 3 packet */
3503             (m->m_pkthdr.len >= sizeof(struct ip) &&
3504              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3505                 hlen = ip->ip_hl << 2;
3506
3507         memset(&lc, 0, sizeof(lc));
3508
3509         m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3510         if (m == NULL)
3511                 goto pullup_failed;
3512
3513         if (args->rule) {
3514                 /*
3515                  * Packet has already been tagged. Look for the next rule
3516                  * to restart processing.
3517                  *
3518                  * If fw_one_pass != 0 then just accept it.
3519                  * XXX should not happen here, but optimized out in
3520                  * the caller.
3521                  */
3522                 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3523                         return IP_FW_PASS;
3524                 args->flags &= ~IP_FWARG_F_CONT;
3525
3526                 /* This rule is being/has been flushed */
3527                 if (ipfw_flushing)
3528                         return IP_FW_DENY;
3529
3530                 KASSERT(args->rule->cpuid == cpuid,
3531                         ("rule used on cpu%d", cpuid));
3532
3533                 /* This rule was deleted */
3534                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3535                         return IP_FW_DENY;
3536
3537                 if (args->xlat != NULL) {
3538                         struct ipfw_xlat *x = args->xlat;
3539
3540                         /* This xlat is being deleted. */
3541                         if (x->xlat_invalid)
3542                                 return IP_FW_DENY;
3543
3544                         f = args->rule;
3545
3546                         dyn_f = f;
3547                         dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3548                             MATCH_FORWARD : MATCH_REVERSE;
3549
3550                         if (args->flags & IP_FWARG_F_XLATINS) {
3551                                 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3552                                     ("not slave %u state", x->xlat_type));
3553                                 s = ipfw_state_link(ctx, &x->xlat_st);
3554                                 if (s != NULL) {
3555                                         ctx->ipfw_xlate_conflicts++;
3556                                         if (IPFW_STATE_ISDEAD(s)) {
3557                                                 ipfw_state_remove(ctx, s);
3558                                                 s = ipfw_state_link(ctx,
3559                                                     &x->xlat_st);
3560                                         }
3561                                         if (s != NULL) {
3562                                                 if (bootverbose) {
3563                                                         kprintf("ipfw: "
3564                                                         "slave %u state "
3565                                                         "conflicts %u state\n",
3566                                                         x->xlat_type,
3567                                                         s->st_type);
3568                                                 }
3569                                                 ipfw_xlat_invalidate(x);
3570                                                 return IP_FW_DENY;
3571                                         }
3572                                         ctx->ipfw_xlate_cresolved++;
3573                                 }
3574                         } else {
3575                                 ipfw_state_update(&args->f_id, dyn_dir,
3576                                     lc.tcp, &x->xlat_st);
3577                         }
3578                 } else {
3579                         /* TODO: setup dyn_f, dyn_dir */
3580
3581                         f = args->rule->next_rule;
3582                         if (f == NULL)
3583                                 f = lookup_next_rule(args->rule);
3584                 }
3585         } else {
3586                 /*
3587                  * Find the starting rule. It can be either the first
3588                  * one, or the one after divert_rule if asked so.
3589                  */
3590                 int skipto;
3591
3592                 KKASSERT((args->flags &
3593                     (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3594                 KKASSERT(args->xlat == NULL);
3595
3596                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3597                 if (mtag != NULL) {
3598                         divinfo = m_tag_data(mtag);
3599                         skipto = divinfo->skipto;
3600                 } else {
3601                         skipto = 0;
3602                 }
3603
3604                 f = ctx->ipfw_layer3_chain;
3605                 if (args->eh == NULL && skipto != 0) {
3606                         /* No skipto during rule flushing */
3607                         if (ipfw_flushing)
3608                                 return IP_FW_DENY;
3609
3610                         if (skipto >= IPFW_DEFAULT_RULE)
3611                                 return IP_FW_DENY; /* invalid */
3612
3613                         while (f && f->rulenum <= skipto)
3614                                 f = f->next;
3615                         if (f == NULL)  /* drop packet */
3616                                 return IP_FW_DENY;
3617                 } else if (ipfw_flushing) {
3618                         /* Rules are being flushed; skip to default rule */
3619                         f = ctx->ipfw_default_rule;
3620                 }
3621         }
3622         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3623                 m_tag_delete(m, mtag);
3624
3625         /*
3626          * Now scan the rules, and parse microinstructions for each rule.
3627          */
3628         for (; f; f = f->next) {
3629                 int l, cmdlen;
3630                 ipfw_insn *cmd;
3631                 int skip_or; /* skip rest of OR block */
3632
3633 again:
3634                 if (ctx->ipfw_set_disable & (1 << f->set)) {
3635                         args->xlat = NULL;
3636                         continue;
3637                 }
3638
3639                 if (args->xlat != NULL) {
3640                         args->xlat = NULL;
3641                         l = f->cmd_len - f->act_ofs;
3642                         cmd = ACTION_PTR(f);
3643                 } else {
3644                         l = f->cmd_len;
3645                         cmd = f->cmd;
3646                 }
3647
3648                 skip_or = 0;
3649                 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3650                         int match;
3651
3652                         /*
3653                          * check_body is a jump target used when we find a
3654                          * CHECK_STATE, and need to jump to the body of
3655                          * the target rule.
3656                          */
3657 check_body:
3658                         cmdlen = F_LEN(cmd);
3659                         /*
3660                          * An OR block (insn_1 || .. || insn_n) has the
3661                          * F_OR bit set in all but the last instruction.
3662                          * The first match will set "skip_or", and cause
3663                          * the following instructions to be skipped until
3664                          * past the one with the F_OR bit clear.
3665                          */
3666                         if (skip_or) {          /* skip this instruction */
3667                                 if ((cmd->len & F_OR) == 0)
3668                                         skip_or = 0;    /* next one is good */
3669                                 continue;
3670                         }
3671                         match = 0; /* set to 1 if we succeed */
3672
3673                         switch (cmd->opcode) {
3674                         /*
3675                          * The first set of opcodes compares the packet's
3676                          * fields with some pattern, setting 'match' if a
3677                          * match is found. At the end of the loop there is
3678                          * logic to deal with F_NOT and F_OR flags associated
3679                          * with the opcode.
3680                          */
3681                         case O_NOP:
3682                                 match = 1;
3683                                 break;
3684
3685                         case O_FORWARD_MAC:
3686                                 kprintf("ipfw: opcode %d unimplemented\n",
3687                                         cmd->opcode);
3688                                 break;
3689
3690                         case O_GID:
3691                         case O_UID:
3692                                 /*
3693                                  * We only check offset == 0 && proto != 0,
3694                                  * as this ensures that we have an IPv4
3695                                  * packet with the ports info.
3696                                  */
3697                                 if (lc.offset!=0)
3698                                         break;
3699
3700                                 match = ipfw_match_uid(&args->f_id, oif,
3701                                         cmd->opcode,
3702                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3703                                 break;
3704
3705                         case O_RECV:
3706                                 match = iface_match(m->m_pkthdr.rcvif,
3707                                     (ipfw_insn_if *)cmd);
3708                                 break;
3709
3710                         case O_XMIT:
3711                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3712                                 break;
3713
3714                         case O_VIA:
3715                                 match = iface_match(oif ? oif :
3716                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3717                                 break;
3718
3719                         case O_MACADDR2:
3720                                 if (args->eh != NULL) { /* have MAC header */
3721                                         uint32_t *want = (uint32_t *)
3722                                                 ((ipfw_insn_mac *)cmd)->addr;
3723                                         uint32_t *mask = (uint32_t *)
3724                                                 ((ipfw_insn_mac *)cmd)->mask;
3725                                         uint32_t *hdr = (uint32_t *)args->eh;
3726
3727                                         match =
3728                                         (want[0] == (hdr[0] & mask[0]) &&
3729                                          want[1] == (hdr[1] & mask[1]) &&
3730                                          want[2] == (hdr[2] & mask[2]));
3731                                 }
3732                                 break;
3733
3734                         case O_MAC_TYPE:
3735                                 if (args->eh != NULL) {
3736                                         uint16_t t =
3737                                             ntohs(args->eh->ether_type);
3738                                         uint16_t *p =
3739                                             ((ipfw_insn_u16 *)cmd)->ports;
3740                                         int i;
3741
3742                                         /* Special vlan handling */
3743                                         if (m->m_flags & M_VLANTAG)
3744                                                 t = ETHERTYPE_VLAN;
3745
3746                                         for (i = cmdlen - 1; !match && i > 0;
3747                                              i--, p += 2) {
3748                                                 match =
3749                                                 (t >= p[0] && t <= p[1]);
3750                                         }
3751                                 }
3752                                 break;
3753
3754                         case O_FRAG:
3755                                 match = (hlen > 0 && lc.offset != 0);
3756                                 break;
3757
3758                         case O_IPFRAG:
3759                                 if (hlen > 0) {
3760                                         uint16_t off;
3761
3762                                         if (args->eh != NULL)
3763                                                 off = ntohs(ip->ip_off);
3764                                         else
3765                                                 off = ip->ip_off;
3766                                         if (off & (IP_MF | IP_OFFMASK))
3767                                                 match = 1;
3768                                 }
3769                                 break;
3770
3771                         case O_IN:      /* "out" is "not in" */
3772                                 match = (oif == NULL);
3773                                 break;
3774
3775                         case O_LAYER2:
3776                                 match = (args->eh != NULL);
3777                                 break;
3778
3779                         case O_PROTO:
3780                                 /*
3781                                  * We do not allow an arg of 0 so the
3782                                  * check of "proto" only suffices.
3783                                  */
3784                                 match = (lc.proto == cmd->arg1);
3785                                 break;
3786
3787                         case O_IP_SRC:
3788                                 match = (hlen > 0 &&
3789                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3790                                     lc.src_ip.s_addr);
3791                                 break;
3792
3793                         case O_IP_SRC_MASK:
3794                                 match = (hlen > 0 &&
3795                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3796                                      (lc.src_ip.s_addr &
3797                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3798                                 break;
3799
3800                         case O_IP_SRC_ME:
3801                                 if (hlen > 0) {
3802                                         struct ifnet *tif;
3803
3804                                         tif = INADDR_TO_IFP(&lc.src_ip);
3805                                         match = (tif != NULL);
3806                                 }
3807                                 break;
3808
3809                         case O_IP_SRC_TABLE:
3810                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3811                                     &lc.src_ip);
3812                                 break;
3813
3814                         case O_IP_SRC_IFIP:
3815                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3816                                     &lc.src_ip);
3817                                 break;
3818
3819                         case O_IP_DST_SET:
3820                         case O_IP_SRC_SET:
3821                                 if (hlen > 0) {
3822                                         uint32_t *d = (uint32_t *)(cmd + 1);
3823                                         uint32_t addr =
3824                                             cmd->opcode == O_IP_DST_SET ?
3825                                                 args->f_id.dst_ip :
3826                                                 args->f_id.src_ip;
3827
3828                                         if (addr < d[0])
3829                                                 break;
3830                                         addr -= d[0]; /* subtract base */
3831                                         match =
3832                                         (addr < cmd->arg1) &&
3833                                          (d[1 + (addr >> 5)] &
3834                                           (1 << (addr & 0x1f)));
3835                                 }
3836                                 break;
3837
3838                         case O_IP_DST:
3839                                 match = (hlen > 0 &&
3840                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3841                                     lc.dst_ip.s_addr);
3842                                 break;
3843
3844                         case O_IP_DST_MASK:
3845                                 match = (hlen > 0) &&
3846                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3847                                      (lc.dst_ip.s_addr &
3848                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3849                                 break;
3850
3851                         case O_IP_DST_ME:
3852                                 if (hlen > 0) {
3853                                         struct ifnet *tif;
3854
3855                                         tif = INADDR_TO_IFP(&lc.dst_ip);
3856                                         match = (tif != NULL);
3857                                 }
3858                                 break;
3859
3860                         case O_IP_DST_TABLE:
3861                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3862                                     &lc.dst_ip);
3863                                 break;
3864
3865                         case O_IP_DST_IFIP:
3866                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3867                                     &lc.dst_ip);
3868                                 break;
3869
3870                         case O_IP_SRCPORT:
3871                         case O_IP_DSTPORT:
3872                                 /*
3873                                  * offset == 0 && proto != 0 is enough
3874                                  * to guarantee that we have an IPv4
3875                                  * packet with port info.
3876                                  */
3877                                 if ((lc.proto==IPPROTO_UDP ||
3878                                      lc.proto==IPPROTO_TCP)
3879                                     && lc.offset == 0) {
3880                                         uint16_t x =
3881                                             (cmd->opcode == O_IP_SRCPORT) ?
3882                                                 lc.src_port : lc.dst_port;
3883                                         uint16_t *p =
3884                                             ((ipfw_insn_u16 *)cmd)->ports;
3885                                         int i;
3886
3887                                         for (i = cmdlen - 1; !match && i > 0;
3888                                              i--, p += 2) {
3889                                                 match =
3890                                                 (x >= p[0] && x <= p[1]);
3891                                         }
3892                                 }
3893                                 break;
3894
3895                         case O_ICMPCODE:
3896                                 match = (lc.offset == 0 &&
3897                                     lc.proto==IPPROTO_ICMP &&
3898                                     icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3899                                 break;
3900
3901                         case O_ICMPTYPE:
3902                                 match = (lc.offset == 0 &&
3903                                     lc.proto==IPPROTO_ICMP &&
3904                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3905                                 break;
3906
3907                         case O_IPOPT:
3908                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3909                                 break;
3910
3911                         case O_IPVER:
3912                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3913                                 break;
3914
3915                         case O_IPTTL:
3916                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3917                                 break;
3918
3919                         case O_IPID:
3920                                 match = (hlen > 0 &&
3921                                     cmd->arg1 == ntohs(ip->ip_id));
3922                                 break;
3923
3924                         case O_IPLEN:
3925                                 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3926                                 break;
3927
3928                         case O_IPPRECEDENCE:
3929                                 match = (hlen > 0 &&
3930                                     (cmd->arg1 == (ip->ip_tos & 0xe0)));
3931                                 break;
3932
3933                         case O_IPTOS:
3934                                 match = (hlen > 0 &&
3935                                     flags_match(cmd, ip->ip_tos));
3936                                 break;
3937
3938                         case O_TCPFLAGS:
3939                                 match = (lc.proto == IPPROTO_TCP &&
3940                                     lc.offset == 0 &&
3941                                     flags_match(cmd,
3942                                         L3HDR(struct tcphdr,ip)->th_flags));
3943                                 break;
3944
3945                         case O_TCPOPTS:
3946                                 match = (lc.proto == IPPROTO_TCP &&
3947                                     lc.offset == 0 && tcpopts_match(ip, cmd));
3948                                 break;
3949
3950                         case O_TCPSEQ:
3951                                 match = (lc.proto == IPPROTO_TCP &&
3952                                     lc.offset == 0 &&
3953                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3954                                         L3HDR(struct tcphdr,ip)->th_seq);
3955                                 break;
3956
3957                         case O_TCPACK:
3958                                 match = (lc.proto == IPPROTO_TCP &&
3959                                     lc.offset == 0 &&
3960                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3961                                         L3HDR(struct tcphdr,ip)->th_ack);
3962                                 break;
3963
3964                         case O_TCPWIN:
3965                                 match = (lc.proto == IPPROTO_TCP &&
3966                                     lc.offset == 0 &&
3967                                     cmd->arg1 ==
3968                                         L3HDR(struct tcphdr,ip)->th_win);
3969                                 break;
3970
3971                         case O_ESTAB:
3972                                 /* reject packets which have SYN only */
3973                                 /* XXX should i also check for TH_ACK ? */
3974                                 match = (lc.proto == IPPROTO_TCP &&
3975                                     lc.offset == 0 &&
3976                                     (L3HDR(struct tcphdr,ip)->th_flags &
3977                                      (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3978                                 break;
3979
3980                         case O_LOG:
3981                                 if (fw_verbose) {
3982                                         ipfw_log(ctx, f, hlen, args->eh, m,
3983                                             oif);
3984                                 }
3985                                 match = 1;
3986                                 break;
3987
3988                         case O_PROB:
3989                                 match = (krandom() <
3990                                         ((ipfw_insn_u32 *)cmd)->d[0]);
3991                                 break;
3992
3993                         /*
3994                          * The second set of opcodes represents 'actions',
3995                          * i.e. the terminal part of a rule once the packet
3996                          * matches all previous patterns.
3997                          * Typically there is only one action for each rule,
3998                          * and the opcode is stored at the end of the rule
3999                          * (but there are exceptions -- see below).
4000                          *
4001                          * In general, here we set retval and terminate the
4002                          * outer loop (would be a 'break 3' in some language,
4003                          * but we need to do a 'goto done').
4004                          *
4005                          * Exceptions:
4006                          * O_COUNT and O_SKIPTO actions:
4007                          *   instead of terminating, we jump to the next rule
4008                          *   ('goto next_rule', equivalent to a 'break 2'),
4009                          *   or to the SKIPTO target ('goto again' after
4010                          *   having set f, cmd and l), respectively.
4011                          *
4012                          * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
4013                          *   are not real 'actions', and are stored right
4014                          *   before the 'action' part of the rule.
4015                          *   These opcodes try to install an entry in the
4016                          *   state tables; if successful, we continue with
4017                          *   the next opcode (match=1; break;), otherwise
4018                          *   the packet must be dropped ('goto done' after
4019                          *   setting retval).  If static rules are changed
4020                          *   during the state installation, the packet will
4021                          *   be dropped and rule's stats will not be updated
4022                          *   ('return IP_FW_DENY').
4023                          *
4024                          * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4025                          *   cause a lookup of the state table, and a jump
4026                          *   to the 'action' part of the parent rule
4027                          *   ('goto check_body') if an entry is found, or
4028                          *   (CHECK_STATE only) a jump to the next rule if
4029                          *   the entry is not found ('goto next_rule').
4030                          *   The result of the lookup is cached so that
4031                          *   further instances of these opcodes are
4032                          *   effectively NOPs.  If static rules are changed
4033                          *   during the state looking up, the packet will
4034                          *   be dropped and rule's stats will not be updated
4035                          *   ('return IP_FW_DENY').
4036                          */
4037                         case O_REDIRECT:
4038                                 if (f->cross_rules == NULL) {
4039                                         /*
4040                                          * This rule was not completely setup;
4041                                          * move on to the next rule.
4042                                          */
4043                                         goto next_rule;
4044                                 }
4045                                 /*
4046                                  * Apply redirect only on input path and
4047                                  * only to non-fragment TCP segments or
4048                                  * UDP datagrams.
4049                                  *
4050                                  * Does _not_ work with layer2 filtering.
4051                                  */
4052                                 if (oif != NULL || args->eh != NULL ||
4053                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4054                                     (lc.proto != IPPROTO_TCP &&
4055                                      lc.proto != IPPROTO_UDP))
4056                                         break;
4057                                 /* FALL THROUGH */
4058                         case O_LIMIT:
4059                         case O_KEEP_STATE:
4060                                 if (hlen == 0)
4061                                         break;
4062                                 s = ipfw_state_install(ctx, f,
4063                                     (ipfw_insn_limit *)cmd, args, lc.tcp);
4064                                 if (s == NULL) {
4065                                         retval = IP_FW_DENY;
4066                                         goto done; /* error/limit violation */
4067                                 }
4068                                 s->st_pcnt++;
4069                                 s->st_bcnt += lc.ip_len;
4070
4071                                 if (s->st_type == O_REDIRECT) {
4072                                         struct in_addr oaddr;
4073                                         uint16_t oport;
4074                                         struct ipfw_xlat *slave_x, *x;
4075                                         struct ipfw_state *dup;
4076
4077                                         x = (struct ipfw_xlat *)s;
4078                                         ipfw_xlate(x, m, &oaddr, &oport);
4079                                         m = ipfw_rehashm(m, hlen, args, &lc,
4080                                             &ip);
4081                                         if (m == NULL) {
4082                                                 ipfw_state_del(ctx, s);
4083                                                 goto pullup_failed;
4084                                         }
4085
4086                                         cpuid = netisr_hashcpu(
4087                                             m->m_pkthdr.hash);
4088
4089                                         slave_x = (struct ipfw_xlat *)
4090                                             ipfw_state_alloc(ctx, &args->f_id,
4091                                             O_REDIRECT, f->cross_rules[cpuid],
4092                                             lc.tcp);
4093                                         if (slave_x == NULL) {
4094                                                 ipfw_state_del(ctx, s);
4095                                                 retval = IP_FW_DENY;
4096                                                 goto done;
4097                                         }
4098                                         slave_x->xlat_addr = oaddr.s_addr;
4099                                         slave_x->xlat_port = oport;
4100                                         slave_x->xlat_dir = MATCH_REVERSE;
4101                                         slave_x->xlat_flags |=
4102                                             IPFW_STATE_F_XLATSRC |
4103                                             IPFW_STATE_F_XLATSLAVE;
4104
4105                                         slave_x->xlat_pair = x;
4106                                         slave_x->xlat_pcpu = mycpuid;
4107                                         x->xlat_pair = slave_x;
4108                                         x->xlat_pcpu = cpuid;
4109
4110                                         ctx->ipfw_xlated++;
4111                                         if (cpuid != mycpuid) {
4112                                                 ctx->ipfw_xlate_split++;
4113                                                 ipfw_xlate_redispatch(
4114                                                     m, cpuid, x,
4115                                                     IPFW_XLATE_INSERT |
4116                                                     IPFW_XLATE_FORWARD);
4117                                                 args->m = NULL;
4118                                                 return (IP_FW_REDISPATCH);
4119                                         }
4120
4121                                         dup = ipfw_state_link(ctx,
4122                                             &slave_x->xlat_st);
4123                                         if (dup != NULL) {
4124                                                 ctx->ipfw_xlate_conflicts++;
4125                                                 if (IPFW_STATE_ISDEAD(dup)) {
4126                                                         ipfw_state_remove(ctx,
4127                                                             dup);
4128                                                         dup = ipfw_state_link(
4129                                                         ctx, &slave_x->xlat_st);
4130                                                 }
4131                                                 if (dup != NULL) {
4132                                                         if (bootverbose) {
4133                                                             kprintf("ipfw: "
4134                                                             "slave %u state "
4135                                                             "conflicts "
4136                                                             "%u state\n",
4137                                                             x->xlat_type,
4138                                                             s->st_type);
4139                                                         }
4140                                                         ipfw_state_del(ctx, s);
4141                                                         return (IP_FW_DENY);
4142                                                 }
4143                                                 ctx->ipfw_xlate_cresolved++;
4144                                         }
4145                                 }
4146                                 match = 1;
4147                                 break;
4148
4149                         case O_PROBE_STATE:
4150                         case O_CHECK_STATE:
4151                                 /*
4152                                  * States are checked at the first keep-state or
4153                                  * check-state occurrence, with the result
4154                                  * being stored in dyn_dir.  The compiler
4155                                  * introduces a PROBE_STATE instruction for
4156                                  * us when we have a KEEP_STATE/LIMIT/RDR
4157                                  * (because PROBE_STATE needs to be run first).
4158                                  */
4159                                 s = NULL;
4160                                 if (dyn_dir == MATCH_UNKNOWN) {
4161                                         s = ipfw_state_lookup(ctx,
4162                                             &args->f_id, &dyn_dir, lc.tcp);
4163                                 }
4164                                 if (s == NULL ||
4165                                     (s->st_type == O_REDIRECT &&
4166                                      (args->eh != NULL ||
4167                                       (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4168                                       (lc.proto != IPPROTO_TCP &&
4169                                        lc.proto != IPPROTO_UDP)))) {
4170                                         /*
4171                                          * State not found. If CHECK_STATE,
4172                                          * skip to next rule, if PROBE_STATE
4173                                          * just ignore and continue with next
4174                                          * opcode.
4175                                          */
4176                                         if (cmd->opcode == O_CHECK_STATE)
4177                                                 goto next_rule;
4178                                         match = 1;
4179                                         break;
4180                                 }
4181
4182                                 s->st_pcnt++;
4183                                 s->st_bcnt += lc.ip_len;
4184
4185                                 if (s->st_type == O_REDIRECT) {
4186                                         struct ipfw_xlat *x =
4187                                             (struct ipfw_xlat *)s;
4188
4189                                         if (oif != NULL &&
4190                                             x->xlat_ifp == NULL) {
4191                                                 KASSERT(x->xlat_flags &
4192                                                     IPFW_STATE_F_XLATSLAVE,
4193                                                     ("master rdr state "
4194                                                      "missing ifp"));
4195                                                 x->xlat_ifp = oif;
4196                                         } else if (
4197                                             (oif != NULL && x->xlat_ifp!=oif) ||
4198                                             (oif == NULL &&
4199                                              x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4200                                                 retval = IP_FW_DENY;
4201                                                 goto done;
4202                                         }
4203                                         if (x->xlat_dir != dyn_dir)
4204                                                 goto skip_xlate;
4205
4206                                         ipfw_xlate(x, m, NULL, NULL);
4207                                         m = ipfw_rehashm(m, hlen, args, &lc,
4208                                             &ip);
4209                                         if (m == NULL)
4210                                                 goto pullup_failed;
4211
4212                                         cpuid = netisr_hashcpu(
4213                                             m->m_pkthdr.hash);
4214                                         if (cpuid != mycpuid) {
4215                                                 uint32_t xlate = 0;
4216
4217                                                 if (oif != NULL) {
4218                                                         xlate |=
4219                                                             IPFW_XLATE_OUTPUT;
4220                                                 }
4221                                                 if (dyn_dir == MATCH_FORWARD) {
4222                                                         xlate |=
4223                                                             IPFW_XLATE_FORWARD;
4224                                                 }
4225                                                 ipfw_xlate_redispatch(m, cpuid,
4226                                                     x, xlate);
4227                                                 args->m = NULL;
4228                                                 return (IP_FW_REDISPATCH);
4229                                         }
4230
4231                                         KKASSERT(x->xlat_pcpu == mycpuid);
4232                                         ipfw_state_update(&args->f_id, dyn_dir,
4233                                             lc.tcp, &x->xlat_pair->xlat_st);
4234                                 }
4235 skip_xlate:
4236                                 /*
4237                                  * Found a rule from a state; jump to the
4238                                  * 'action' part of the rule.
4239                                  */
4240                                 f = s->st_rule;
4241                                 KKASSERT(f->cpuid == mycpuid);
4242
4243                                 cmd = ACTION_PTR(f);
4244                                 l = f->cmd_len - f->act_ofs;
4245                                 dyn_f = f;
4246                                 goto check_body;
4247
4248                         case O_ACCEPT:
4249                                 retval = IP_FW_PASS;    /* accept */
4250                                 goto done;
4251
4252                         case O_DEFRAG:
4253                                 if (f->cross_rules == NULL) {
4254                                         /*
4255                                          * This rule was not completely setup;
4256                                          * move on to the next rule.
4257                                          */
4258                                         goto next_rule;
4259                                 }
4260
4261                                 /*
4262                                  * Don't defrag for l2 packets, output packets
4263                                  * or non-fragments.
4264                                  */
4265                                 if (oif != NULL || args->eh != NULL ||
4266                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4267                                         goto next_rule;
4268
4269                                 ctx->ipfw_frags++;
4270                                 m = ip_reass(m);
4271                                 args->m = m;
4272                                 if (m == NULL) {
4273                                         retval = IP_FW_PASS;
4274                                         goto done;
4275                                 }
4276                                 ctx->ipfw_defraged++;
4277                                 KASSERT((m->m_flags & M_HASH) == 0,
4278                                     ("hash not cleared"));
4279
4280                                 /* Update statistics */
4281                                 f->pcnt++;
4282                                 f->bcnt += lc.ip_len;
4283                                 f->timestamp = time_second;
4284
4285                                 ip = mtod(m, struct ip *);
4286                                 hlen = ip->ip_hl << 2;
4287                                 ip->ip_len += hlen;
4288
4289                                 ip->ip_len = htons(ip->ip_len);
4290                                 ip->ip_off = htons(ip->ip_off);
4291
4292                                 ip_hashfn(&m, 0);
4293                                 args->m = m;
4294                                 if (m == NULL)
4295                                         goto pullup_failed;
4296
4297                                 KASSERT(m->m_flags & M_HASH, ("no hash"));
4298                                 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4299                                 if (cpuid != mycpuid) {
4300                                         /*
4301                                          * NOTE:
4302                                          * ip_len/ip_off are in network byte
4303                                          * order.
4304                                          */
4305                                         ctx->ipfw_defrag_remote++;
4306                                         ipfw_defrag_redispatch(m, cpuid, f);
4307                                         args->m = NULL;
4308                                         return (IP_FW_REDISPATCH);
4309                                 }
4310
4311                                 /* 'm' might be changed by ip_hashfn(). */
4312                                 ip = mtod(m, struct ip *);
4313                                 ip->ip_len = ntohs(ip->ip_len);
4314                                 ip->ip_off = ntohs(ip->ip_off);
4315
4316                                 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4317                                 if (m == NULL)
4318                                         goto pullup_failed;
4319
4320                                 /* Move on. */
4321                                 goto next_rule;
4322
4323                         case O_PIPE:
4324                         case O_QUEUE:
4325                                 args->rule = f; /* report matching rule */
4326                                 args->cookie = cmd->arg1;
4327                                 retval = IP_FW_DUMMYNET;
4328                                 goto done;
4329
4330                         case O_DIVERT:
4331                         case O_TEE:
4332                                 if (args->eh) /* not on layer 2 */
4333                                         break;
4334
4335                                 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4336                                     sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4337                                 if (mtag == NULL) {
4338                                         retval = IP_FW_DENY;
4339                                         goto done;
4340                                 }
4341                                 divinfo = m_tag_data(mtag);
4342
4343                                 divinfo->skipto = f->rulenum;
4344                                 divinfo->port = cmd->arg1;
4345                                 divinfo->tee = (cmd->opcode == O_TEE);
4346                                 m_tag_prepend(m, mtag);
4347
4348                                 args->cookie = cmd->arg1;
4349                                 retval = (cmd->opcode == O_DIVERT) ?
4350                                          IP_FW_DIVERT : IP_FW_TEE;
4351                                 goto done;
4352
4353                         case O_COUNT:
4354                         case O_SKIPTO:
4355                                 f->pcnt++;      /* update stats */
4356                                 f->bcnt += lc.ip_len;
4357                                 f->timestamp = time_second;
4358                                 if (cmd->opcode == O_COUNT)
4359                                         goto next_rule;
4360                                 /* handle skipto */
4361                                 if (f->next_rule == NULL)
4362                                         lookup_next_rule(f);
4363                                 f = f->next_rule;
4364                                 goto again;
4365
4366                         case O_REJECT:
4367                                 /*
4368                                  * Drop the packet and send a reject notice
4369                                  * if the packet is not ICMP (or is an ICMP
4370                                  * query), and it is not multicast/broadcast.
4371                                  */
4372                                 if (hlen > 0 &&
4373                                     (lc.proto != IPPROTO_ICMP ||
4374                                      is_icmp_query(ip)) &&
4375                                     !(m->m_flags & (M_BCAST|M_MCAST)) &&
4376                                     !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4377                                         send_reject(args, cmd->arg1,
4378                                             lc.offset, lc.ip_len);
4379                                         retval = IP_FW_DENY;
4380                                         goto done;
4381                                 }
4382                                 /* FALLTHROUGH */
4383                         case O_DENY:
4384                                 retval = IP_FW_DENY;
4385                                 goto done;
4386
4387                         case O_FORWARD_IP:
4388                                 if (args->eh)   /* not valid on layer2 pkts */
4389                                         break;
4390                                 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4391                                         struct sockaddr_in *sin;
4392
4393                                         mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4394                                             sizeof(*sin), M_INTWAIT | M_NULLOK);
4395                                         if (mtag == NULL) {
4396                                                 retval = IP_FW_DENY;
4397                                                 goto done;
4398                                         }
4399                                         sin = m_tag_data(mtag);
4400
4401                                         /* Structure copy */
4402                                         *sin = ((ipfw_insn_sa *)cmd)->sa;
4403
4404                                         m_tag_prepend(m, mtag);
4405                                         m->m_pkthdr.fw_flags |=
4406                                                 IPFORWARD_MBUF_TAGGED;
4407                                         m->m_pkthdr.fw_flags &=
4408                                                 ~BRIDGE_MBUF_TAGGED;
4409                                 }
4410                                 retval = IP_FW_PASS;
4411                                 goto done;
4412
4413                         default:
4414                                 panic("-- unknown opcode %d", cmd->opcode);
4415                         } /* end of switch() on opcodes */
4416
4417                         if (cmd->len & F_NOT)
4418                                 match = !match;
4419
4420                         if (match) {
4421                                 if (cmd->len & F_OR)
4422                                         skip_or = 1;
4423                         } else {
4424                                 if (!(cmd->len & F_OR)) /* not an OR block, */
4425                                         break;          /* try next rule    */
4426                         }
4427
4428                 }       /* end of inner for, scan opcodes */
4429
4430 next_rule:;             /* try next rule                */
4431
4432         }               /* end of outer for, scan rules */
4433         kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4434         return IP_FW_DENY;
4435
4436 done:
4437         /* Update statistics */
4438         f->pcnt++;
4439         f->bcnt += lc.ip_len;
4440         f->timestamp = time_second;
4441         return retval;
4442
4443 pullup_failed:
4444         if (fw_verbose)
4445                 kprintf("pullup failed\n");
4446         return IP_FW_DENY;
4447 }
4448
4449 static struct mbuf *
4450 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4451 {
4452         struct m_tag *mtag;
4453         struct dn_pkt *pkt;
4454         ipfw_insn *cmd;
4455         const struct ipfw_flow_id *id;
4456         struct dn_flow_id *fid;
4457
4458         M_ASSERTPKTHDR(m);
4459
4460         mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4461             M_INTWAIT | M_NULLOK);
4462         if (mtag == NULL) {
4463                 m_freem(m);
4464                 return (NULL);
4465         }
4466         m_tag_prepend(m, mtag);
4467
4468         pkt = m_tag_data(mtag);
4469         bzero(pkt, sizeof(*pkt));
4470
4471         cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4472         if (cmd->opcode == O_LOG)
4473                 cmd += F_LEN(cmd);
4474         KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4475                 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4476
4477         pkt->dn_m = m;
4478         pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4479         pkt->ifp = fwa->oif;
4480         pkt->pipe_nr = pipe_nr;
4481
4482         pkt->cpuid = mycpuid;
4483         pkt->msgport = netisr_curport();
4484
4485         id = &fwa->f_id;
4486         fid = &pkt->id;
4487         fid->fid_dst_ip = id->dst_ip;
4488         fid->fid_src_ip = id->src_ip;
4489         fid->fid_dst_port = id->dst_port;
4490         fid->fid_src_port = id->src_port;
4491         fid->fid_proto = id->proto;
4492         fid->fid_flags = id->flags;
4493
4494         ipfw_ref_rule(fwa->rule);
4495         pkt->dn_priv = fwa->rule;
4496         pkt->dn_unref_priv = ipfw_unref_rule;
4497
4498         if (cmd->opcode == O_PIPE)
4499                 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4500
4501         m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4502         return (m);
4503 }
4504
4505 /*
4506  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4507  * These will be reconstructed on the fly as packets are matched.
4508  */
4509 static void
4510 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4511 {
4512         struct ip_fw *rule;
4513
4514         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4515                 rule->next_rule = NULL;
4516 }
4517
4518 static void
4519 ipfw_inc_static_count(struct ip_fw *rule)
4520 {
4521         /* Static rule's counts are updated only on CPU0 */
4522         KKASSERT(mycpuid == 0);
4523
4524         static_count++;
4525         static_ioc_len += IOC_RULESIZE(rule);
4526 }
4527
4528 static void
4529 ipfw_dec_static_count(struct ip_fw *rule)
4530 {
4531         int l = IOC_RULESIZE(rule);
4532
4533         /* Static rule's counts are updated only on CPU0 */
4534         KKASSERT(mycpuid == 0);
4535
4536         KASSERT(static_count > 0, ("invalid static count %u", static_count));
4537         static_count--;
4538
4539         KASSERT(static_ioc_len >= l,
4540                 ("invalid static len %u", static_ioc_len));
4541         static_ioc_len -= l;
4542 }
4543
4544 static void
4545 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4546 {
4547         if (fwmsg->sibling != NULL) {
4548                 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4549                 fwmsg->sibling->sibling = rule;
4550         }
4551         fwmsg->sibling = rule;
4552 }
4553
4554 static struct ip_fw *
4555 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4556 {
4557         struct ip_fw *rule;
4558
4559         rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4560
4561         rule->act_ofs = ioc_rule->act_ofs;
4562         rule->cmd_len = ioc_rule->cmd_len;
4563         rule->rulenum = ioc_rule->rulenum;
4564         rule->set = ioc_rule->set;
4565         rule->usr_flags = ioc_rule->usr_flags;
4566
4567         bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4568
4569         rule->refcnt = 1;
4570         rule->cpuid = mycpuid;
4571         rule->rule_flags = rule_flags;
4572
4573         return rule;
4574 }
4575
4576 static void
4577 ipfw_add_rule_dispatch(netmsg_t nmsg)
4578 {
4579         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4580         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4581         struct ip_fw *rule;
4582
4583         ASSERT_NETISR_NCPUS(mycpuid);
4584
4585         rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4586
4587         /*
4588          * Insert rule into the pre-determined position
4589          */
4590         if (fwmsg->prev_rule != NULL) {
4591                 struct ip_fw *prev, *next;
4592
4593                 prev = fwmsg->prev_rule;
4594                 KKASSERT(prev->cpuid == mycpuid);
4595
4596                 next = fwmsg->next_rule;
4597                 KKASSERT(next->cpuid == mycpuid);
4598
4599                 rule->next = next;
4600                 prev->next = rule;
4601
4602                 /*
4603                  * Move to the position on the next CPU
4604                  * before the msg is forwarded.
4605                  */
4606                 fwmsg->prev_rule = prev->sibling;
4607                 fwmsg->next_rule = next->sibling;
4608         } else {
4609                 KKASSERT(fwmsg->next_rule == NULL);
4610                 rule->next = ctx->ipfw_layer3_chain;
4611                 ctx->ipfw_layer3_chain = rule;
4612         }
4613
4614         /* Link rule CPU sibling */
4615         ipfw_link_sibling(fwmsg, rule);
4616
4617         ipfw_flush_rule_ptrs(ctx);
4618
4619         if (mycpuid == 0) {
4620                 /* Statistics only need to be updated once */
4621                 ipfw_inc_static_count(rule);
4622
4623                 /* Return the rule on CPU0 */
4624                 nmsg->lmsg.u.ms_resultp = rule;
4625         }
4626
4627         if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4628                 rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4629
4630         if (fwmsg->cross_rules != NULL) {
4631                 /* Save rules for later use. */
4632                 fwmsg->cross_rules[mycpuid] = rule;
4633         }
4634
4635         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4636 }
4637
4638 static void
4639 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4640 {
4641         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4642         struct ip_fw *rule = fwmsg->sibling;
4643         int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4644
4645         ASSERT_NETISR_NCPUS(mycpuid);
4646         KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4647             ("not crossref rule"));
4648
4649         rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4650         memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4651
4652         fwmsg->sibling = rule->sibling;
4653         netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4654 }
4655
4656 /*
4657  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4658  * then possibly create a rule number and add the rule to the list.
4659  * Update the rule_number in the input struct so the caller knows
4660  * it as well.
4661  */
4662 static void
4663 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4664 {
4665         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4666         struct netmsg_ipfw fwmsg;
4667         struct ip_fw *f, *prev, *rule;
4668
4669         ASSERT_NETISR0;
4670
4671         /*
4672          * If rulenum is 0, find highest numbered rule before the
4673          * default rule, and add rule number incremental step.
4674          */
4675         if (ioc_rule->rulenum == 0) {
4676                 int step = autoinc_step;
4677
4678                 KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4679                          step <= IPFW_AUTOINC_STEP_MAX);
4680
4681                 /*
4682                  * Locate the highest numbered rule before default
4683                  */
4684                 for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4685                         if (f->rulenum == IPFW_DEFAULT_RULE)
4686                                 break;
4687                         ioc_rule->rulenum = f->rulenum;
4688                 }
4689                 if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4690                         ioc_rule->rulenum += step;
4691         }
4692         KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4693                 ioc_rule->rulenum != 0,
4694                 ("invalid rule num %d", ioc_rule->rulenum));
4695
4696         /*
4697          * Now find the right place for the new rule in the sorted list.
4698          */
4699         for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4700              prev = f, f = f->next) {
4701                 if (f->rulenum > ioc_rule->rulenum) {
4702                         /* Found the location */
4703                         break;
4704                 }
4705         }
4706         KASSERT(f != NULL, ("no default rule?!"));
4707
4708         /*
4709          * Duplicate the rule onto each CPU.
4710          * The rule duplicated on CPU0 will be returned.
4711          */
4712         bzero(&fwmsg, sizeof(fwmsg));
4713         netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4714             ipfw_add_rule_dispatch);
4715         fwmsg.ioc_rule = ioc_rule;
4716         fwmsg.prev_rule = prev;
4717         fwmsg.next_rule = prev == NULL ? NULL : f;
4718         fwmsg.rule_flags = rule_flags;
4719         if (rule_flags & IPFW_RULE_F_CROSSREF) {
4720                 fwmsg.cross_rules = kmalloc(
4721                     sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4722                     M_WAITOK | M_ZERO);
4723         }
4724
4725         netisr_domsg_global(&fwmsg.base);
4726         KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4727
4728         rule = fwmsg.base.lmsg.u.ms_resultp;
4729         KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4730
4731         if (fwmsg.cross_rules != NULL) {
4732                 netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4733                     MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4734                 fwmsg.sibling = rule;
4735                 netisr_domsg_global(&fwmsg.base);
4736                 KKASSERT(fwmsg.sibling == NULL);
4737
4738                 kfree(fwmsg.cross_rules, M_TEMP);
4739
4740 #ifdef KLD_MODULE
4741                 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4742 #endif
4743         }
4744
4745         DPRINTF("++ installed rule %d, static count now %d\n",
4746                 rule->rulenum, static_count);
4747 }
4748
4749 /*
4750  * Free storage associated with a static rule (including derived
4751  * states/tracks).
4752  * The caller is in charge of clearing rule pointers to avoid
4753  * dangling pointers.
4754  * @return a pointer to the next entry.
4755  * Arguments are not checked, so they better be correct.
4756  */
4757 static struct ip_fw *
4758 ipfw_delete_rule(struct ipfw_context *ctx,
4759                  struct ip_fw *prev, struct ip_fw *rule)
4760 {
4761         struct ip_fw *n;
4762
4763         n = rule->next;
4764         if (prev == NULL)
4765                 ctx->ipfw_layer3_chain = n;
4766         else
4767                 prev->next = n;
4768
4769         /* Mark the rule as invalid */
4770         rule->rule_flags |= IPFW_RULE_F_INVALID;
4771         rule->next_rule = NULL;
4772         rule->sibling = NULL;
4773 #ifdef foo
4774         /* Don't reset cpuid here; keep various assertion working */
4775         rule->cpuid = -1;
4776 #endif
4777
4778         /* Statistics only need to be updated once */
4779         if (mycpuid == 0)
4780                 ipfw_dec_static_count(rule);
4781
4782         if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4783                 /* Try to free this rule */
4784                 ipfw_free_rule(rule);
4785         } else {
4786                 /* TODO: check staging area. */
4787                 if (mycpuid == 0) {
4788                         rule->next = ipfw_gd.ipfw_crossref_free;
4789                         ipfw_gd.ipfw_crossref_free = rule;
4790                 }
4791         }
4792
4793         /* Return the next rule */
4794         return n;
4795 }
4796
4797 static void
4798 ipfw_flush_dispatch(netmsg_t nmsg)
4799 {
4800         int kill_default = nmsg->lmsg.u.ms_result;
4801         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4802         struct ip_fw *rule;
4803
4804         ASSERT_NETISR_NCPUS(mycpuid);
4805
4806         /*
4807          * Flush states.
4808          */
4809         ipfw_state_flush(ctx, NULL);
4810         KASSERT(ctx->ipfw_state_cnt == 0,
4811             ("%d pcpu states remain", ctx->ipfw_state_cnt));
4812         ctx->ipfw_state_loosecnt = 0;
4813         ctx->ipfw_state_lastexp = 0;
4814
4815         /*
4816          * Flush tracks.
4817          */
4818         ipfw_track_flush(ctx, NULL);
4819         ctx->ipfw_track_lastexp = 0;
4820         if (ctx->ipfw_trkcnt_spare != NULL) {
4821                 kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4822                 ctx->ipfw_trkcnt_spare = NULL;
4823         }
4824
4825         ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4826
4827         while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4828                (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4829                 ipfw_delete_rule(ctx, NULL, rule);
4830
4831         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4832 }
4833
4834 /*
4835  * Deletes all rules from a chain (including the default rule
4836  * if the second argument is set).
4837  */
4838 static void
4839 ipfw_flush(int kill_default)
4840 {
4841         struct netmsg_base nmsg;
4842 #ifdef INVARIANTS
4843         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4844         int state_cnt;
4845 #endif
4846
4847         ASSERT_NETISR0;
4848
4849         /*
4850          * If 'kill_default' then caller has done the necessary
4851          * msgport syncing; unnecessary to do it again.
4852          */
4853         if (!kill_default) {
4854                 /*
4855                  * Let ipfw_chk() know the rules are going to
4856                  * be flushed, so it could jump directly to
4857                  * the default rule.
4858                  */
4859                 ipfw_flushing = 1;
4860                 /* XXX use priority sync */
4861                 netmsg_service_sync();
4862         }
4863
4864         /*
4865          * Press the 'flush' button
4866          */
4867         bzero(&nmsg, sizeof(nmsg));
4868         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4869             ipfw_flush_dispatch);
4870         nmsg.lmsg.u.ms_result = kill_default;
4871         netisr_domsg_global(&nmsg);
4872         ipfw_gd.ipfw_state_loosecnt = 0;
4873         ipfw_gd.ipfw_state_globexp = 0;
4874         ipfw_gd.ipfw_track_globexp = 0;
4875
4876 #ifdef INVARIANTS
4877         state_cnt = ipfw_state_cntcoll();
4878         KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4879
4880         KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4881             ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4882
4883         if (kill_default) {
4884                 KASSERT(static_count == 0,
4885                         ("%u static rules remain", static_count));
4886                 KASSERT(static_ioc_len == 0,
4887                         ("%u bytes of static rules remain", static_ioc_len));
4888         } else {
4889                 KASSERT(static_count == 1,
4890                         ("%u static rules remain", static_count));
4891                 KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4892                         ("%u bytes of static rules remain, should be %lu",
4893                          static_ioc_len,
4894                          (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4895         }
4896 #endif
4897
4898         /* Flush is done */
4899         ipfw_flushing = 0;
4900 }
4901
4902 static void
4903 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4904 {
4905         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4906         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4907         struct ip_fw *rule, *prev;
4908
4909         ASSERT_NETISR_NCPUS(mycpuid);
4910
4911         rule = dmsg->start_rule;
4912         KKASSERT(rule->cpuid == mycpuid);
4913         dmsg->start_rule = rule->sibling;
4914
4915         prev = dmsg->prev_rule;
4916         if (prev != NULL) {
4917                 KKASSERT(prev->cpuid == mycpuid);
4918
4919                 /*
4920                  * Move to the position on the next CPU
4921                  * before the msg is forwarded.
4922                  */
4923                 dmsg->prev_rule = prev->sibling;
4924         }
4925
4926         /*
4927          * flush pointers outside the loop, then delete all matching
4928          * rules.  'prev' remains the same throughout the cycle.
4929          */
4930         ipfw_flush_rule_ptrs(ctx);
4931         while (rule && rule->rulenum == dmsg->rulenum) {
4932                 if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4933                         /* Flush states generated by this rule. */
4934                         ipfw_state_flush(ctx, rule);
4935                 }
4936                 if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4937                         /* Flush tracks generated by this rule. */
4938                         ipfw_track_flush(ctx, rule);
4939                 }
4940                 rule = ipfw_delete_rule(ctx, prev, rule);
4941         }
4942
4943         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4944 }
4945
4946 static int
4947 ipfw_alt_delete_rule(uint16_t rulenum)
4948 {
4949         struct ip_fw *prev, *rule;
4950         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4951         struct netmsg_del dmsg;
4952
4953         ASSERT_NETISR0;
4954
4955         /*
4956          * Locate first rule to delete
4957          */
4958         for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4959              rule && rule->rulenum < rulenum;
4960              prev = rule, rule = rule->next)
4961                 ; /* EMPTY */
4962         if (rule->rulenum != rulenum)
4963                 return EINVAL;
4964
4965         /*
4966          * Get rid of the rule duplications on all CPUs
4967          */
4968         bzero(&dmsg, sizeof(dmsg));
4969         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4970             ipfw_alt_delete_rule_dispatch);
4971         dmsg.prev_rule = prev;
4972         dmsg.start_rule = rule;
4973         dmsg.rulenum = rulenum;
4974
4975         netisr_domsg_global(&dmsg.base);
4976         KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4977         return 0;
4978 }
4979
4980 static void
4981 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4982 {
4983         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4984         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4985         struct ip_fw *prev, *rule;
4986 #ifdef INVARIANTS
4987         int del = 0;
4988 #endif
4989
4990         ASSERT_NETISR_NCPUS(mycpuid);
4991
4992         ipfw_flush_rule_ptrs(ctx);
4993
4994         prev = NULL;
4995         rule = ctx->ipfw_layer3_chain;
4996         while (rule != NULL) {
4997                 if (rule->set == dmsg->from_set) {
4998                         if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4999                                 /* Flush states generated by this rule. */
5000                                 ipfw_state_flush(ctx, rule);
5001                         }
5002                         if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
5003                                 /* Flush tracks generated by this rule. */
5004                                 ipfw_track_flush(ctx, rule);
5005                         }
5006                         rule = ipfw_delete_rule(ctx, prev, rule);
5007 #ifdef INVARIANTS
5008                         del = 1;
5009 #endif
5010                 } else {
5011                         prev = rule;
5012                         rule = rule->next;
5013                 }
5014         }
5015         KASSERT(del, ("no match set?!"));
5016
5017         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5018 }
5019
5020 static int
5021 ipfw_alt_delete_ruleset(uint8_t set)
5022 {
5023         struct netmsg_del dmsg;
5024         int del;
5025         struct ip_fw *rule;
5026         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5027
5028         ASSERT_NETISR0;
5029
5030         /*
5031          * Check whether the 'set' exists.  If it exists,
5032          * then check whether any rules within the set will
5033          * try to create states.
5034          */
5035         del = 0;
5036         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5037                 if (rule->set == set)
5038                         del = 1;
5039         }
5040         if (!del)
5041                 return 0; /* XXX EINVAL? */
5042
5043         /*
5044          * Delete this set
5045          */
5046         bzero(&dmsg, sizeof(dmsg));
5047         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5048             ipfw_alt_delete_ruleset_dispatch);
5049         dmsg.from_set = set;
5050         netisr_domsg_global(&dmsg.base);
5051
5052         return 0;
5053 }
5054
5055 static void
5056 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5057 {
5058         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5059         struct ip_fw *rule;
5060
5061         ASSERT_NETISR_NCPUS(mycpuid);
5062
5063         rule = dmsg->start_rule;
5064         KKASSERT(rule->cpuid == mycpuid);
5065
5066         /*
5067          * Move to the position on the next CPU
5068          * before the msg is forwarded.
5069          */
5070         dmsg->start_rule = rule->sibling;
5071
5072         while (rule && rule->rulenum <= dmsg->rulenum) {
5073                 if (rule->rulenum == dmsg->rulenum)
5074                         rule->set = dmsg->to_set;
5075                 rule = rule->next;
5076         }
5077         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5078 }
5079
5080 static int
5081 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5082 {
5083         struct netmsg_del dmsg;
5084         struct netmsg_base *nmsg;
5085         struct ip_fw *rule;
5086         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5087
5088         ASSERT_NETISR0;
5089
5090         /*
5091          * Locate first rule to move
5092          */
5093         for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5094              rule = rule->next) {
5095                 if (rule->rulenum == rulenum && rule->set != set)
5096                         break;
5097         }
5098         if (rule == NULL || rule->rulenum > rulenum)
5099                 return 0; /* XXX error? */
5100
5101         bzero(&dmsg, sizeof(dmsg));
5102         nmsg = &dmsg.base;
5103         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5104             ipfw_alt_move_rule_dispatch);
5105         dmsg.start_rule = rule;
5106         dmsg.rulenum = rulenum;
5107         dmsg.to_set = set;
5108
5109         netisr_domsg_global(nmsg);
5110         KKASSERT(dmsg.start_rule == NULL);
5111         return 0;
5112 }
5113
5114 static void
5115 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5116 {
5117         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5118         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5119         struct ip_fw *rule;
5120
5121         ASSERT_NETISR_NCPUS(mycpuid);
5122
5123         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5124                 if (rule->set == dmsg->from_set)
5125                         rule->set = dmsg->to_set;
5126         }
5127         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5128 }
5129
5130 static int
5131 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5132 {
5133         struct netmsg_del dmsg;
5134         struct netmsg_base *nmsg;
5135
5136         ASSERT_NETISR0;
5137
5138         bzero(&dmsg, sizeof(dmsg));
5139         nmsg = &dmsg.base;
5140         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5141             ipfw_alt_move_ruleset_dispatch);
5142         dmsg.from_set = from_set;
5143         dmsg.to_set = to_set;
5144
5145         netisr_domsg_global(nmsg);
5146         return 0;
5147 }
5148
5149 static void
5150 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5151 {
5152         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5153         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5154         struct ip_fw *rule;
5155
5156         ASSERT_NETISR_NCPUS(mycpuid);
5157
5158         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5159                 if (rule->set == dmsg->from_set)
5160                         rule->set = dmsg->to_set;
5161                 else if (rule->set == dmsg->to_set)
5162                         rule->set = dmsg->from_set;
5163         }
5164         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5165 }
5166
5167 static int
5168 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5169 {
5170         struct netmsg_del dmsg;
5171         struct netmsg_base *nmsg;
5172
5173         ASSERT_NETISR0;
5174
5175         bzero(&dmsg, sizeof(dmsg));
5176         nmsg = &dmsg.base;
5177         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5178             ipfw_alt_swap_ruleset_dispatch);
5179         dmsg.from_set = set1;
5180         dmsg.to_set = set2;
5181
5182         netisr_domsg_global(nmsg);
5183         return 0;
5184 }
5185
5186 /*
5187  * Remove all rules with given number, and also do set manipulation.
5188  *
5189  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5190  * the next 8 bits are the new set, the top 8 bits are the command:
5191  *
5192  *      0       delete rules with given number
5193  *      1       delete rules with given set number
5194  *      2       move rules with given number to new set
5195  *      3       move rules with given set number to new set
5196  *      4       swap sets with given numbers
5197  */
5198 static int
5199 ipfw_ctl_alter(uint32_t arg)
5200 {
5201         uint16_t rulenum;
5202         uint8_t cmd, new_set;
5203         int error = 0;
5204
5205         ASSERT_NETISR0;
5206
5207         rulenum = arg & 0xffff;
5208         cmd = (arg >> 24) & 0xff;
5209         new_set = (arg >> 16) & 0xff;
5210
5211         if (cmd > 4)
5212                 return EINVAL;
5213         if (new_set >= IPFW_DEFAULT_SET)
5214                 return EINVAL;
5215         if (cmd == 0 || cmd == 2) {
5216                 if (rulenum == IPFW_DEFAULT_RULE)
5217                         return EINVAL;
5218         } else {
5219                 if (rulenum >= IPFW_DEFAULT_SET)
5220                         return EINVAL;
5221         }
5222
5223         switch (cmd) {
5224         case 0: /* delete rules with given number */
5225                 error = ipfw_alt_delete_rule(rulenum);
5226                 break;
5227
5228         case 1: /* delete all rules with given set number */
5229                 error = ipfw_alt_delete_ruleset(rulenum);
5230                 break;
5231
5232         case 2: /* move rules with given number to new set */
5233                 error = ipfw_alt_move_rule(rulenum, new_set);
5234                 break;
5235
5236         case 3: /* move rules with given set number to new set */
5237                 error = ipfw_alt_move_ruleset(rulenum, new_set);
5238                 break;
5239
5240         case 4: /* swap two sets */
5241                 error = ipfw_alt_swap_ruleset(rulenum, new_set);
5242                 break;
5243         }
5244         return error;
5245 }
5246
5247 /*
5248  * Clear counters for a specific rule.
5249  */
5250 static void
5251 clear_counters(struct ip_fw *rule, int log_only)
5252 {
5253         ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5254
5255         if (log_only == 0) {
5256                 rule->bcnt = rule->pcnt = 0;
5257                 rule->timestamp = 0;
5258         }
5259         if (l->o.opcode == O_LOG)
5260                 l->log_left = l->max_log;
5261 }
5262
5263 static void
5264 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5265 {
5266         struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5267         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5268         struct ip_fw *rule;
5269
5270         ASSERT_NETISR_NCPUS(mycpuid);
5271
5272         if (zmsg->rulenum == 0) {
5273                 KKASSERT(zmsg->start_rule == NULL);
5274
5275                 ctx->ipfw_norule_counter = 0;
5276                 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5277                         clear_counters(rule, zmsg->log_only);
5278         } else {
5279                 struct ip_fw *start = zmsg->start_rule;
5280
5281                 KKASSERT(start->cpuid == mycpuid);
5282                 KKASSERT(start->rulenum == zmsg->rulenum);
5283
5284                 /*
5285                  * We can have multiple rules with the same number, so we
5286                  * need to clear them all.
5287                  */
5288                 for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5289                      rule = rule->next)
5290                         clear_counters(rule, zmsg->log_only);
5291
5292                 /*
5293                  * Move to the position on the next CPU
5294                  * before the msg is forwarded.
5295                  */
5296                 zmsg->start_rule = start->sibling;
5297         }
5298         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5299 }
5300
5301 /*
5302  * Reset some or all counters on firewall rules.
5303  * @arg frwl is null to clear all entries, or contains a specific
5304  * rule number.
5305  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5306  */
5307 static int
5308 ipfw_ctl_zero_entry(int rulenum, int log_only)
5309 {
5310         struct netmsg_zent zmsg;
5311         struct netmsg_base *nmsg;
5312         const char *msg;
5313         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5314
5315         ASSERT_NETISR0;
5316
5317         bzero(&zmsg, sizeof(zmsg));
5318         nmsg = &zmsg.base;
5319         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5320             ipfw_zero_entry_dispatch);
5321         zmsg.log_only = log_only;
5322
5323         if (rulenum == 0) {
5324                 msg = log_only ? "ipfw: All logging counts reset.\n"
5325                                : "ipfw: Accounting cleared.\n";
5326         } else {
5327                 struct ip_fw *rule;
5328
5329                 /*
5330                  * Locate the first rule with 'rulenum'
5331                  */
5332                 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5333                         if (rule->rulenum == rulenum)
5334                                 break;
5335                 }
5336                 if (rule == NULL) /* we did not find any matching rules */
5337                         return (EINVAL);
5338                 zmsg.start_rule = rule;
5339                 zmsg.rulenum = rulenum;
5340
5341                 msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5342                                : "ipfw: Entry %d cleared.\n";
5343         }
5344         netisr_domsg_global(nmsg);
5345         KKASSERT(zmsg.start_rule == NULL);
5346
5347         if (fw_verbose)
5348                 log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5349         return (0);
5350 }
5351
5352 /*
5353  * Check validity of the structure before insert.
5354  * Fortunately rules are simple, so this mostly need to check rule sizes.
5355  */
5356 static int
5357 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5358 {
5359         int l, cmdlen = 0;
5360         int have_action = 0;
5361         ipfw_insn *cmd;
5362
5363         *rule_flags = 0;
5364
5365         /* Check for valid size */
5366         if (size < sizeof(*rule)) {
5367                 kprintf("ipfw: rule too short\n");
5368                 return EINVAL;
5369         }
5370         l = IOC_RULESIZE(rule);
5371         if (l != size) {
5372                 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5373                 return EINVAL;
5374         }
5375
5376         /* Check rule number */
5377         if (rule->rulenum == IPFW_DEFAULT_RULE) {
5378                 kprintf("ipfw: invalid rule number\n");
5379                 return EINVAL;
5380         }
5381
5382         /*
5383          * Now go for the individual checks. Very simple ones, basically only
5384          * instruction sizes.
5385          */
5386         for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5387              l -= cmdlen, cmd += cmdlen) {
5388                 cmdlen = F_LEN(cmd);
5389                 if (cmdlen > l) {
5390                         kprintf("ipfw: opcode %d size truncated\n",
5391                                 cmd->opcode);
5392                         return EINVAL;
5393                 }
5394
5395                 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5396
5397                 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5398                     IPFW_ISXLAT(cmd->opcode)) {
5399                         /* This rule will generate states. */
5400                         *rule_flags |= IPFW_RULE_F_GENSTATE;
5401                         if (cmd->opcode == O_LIMIT)
5402                                 *rule_flags |= IPFW_RULE_F_GENTRACK;
5403                 }
5404                 if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5405                         *rule_flags |= IPFW_RULE_F_CROSSREF;
5406                 if (cmd->opcode == O_IP_SRC_IFIP ||
5407                     cmd->opcode == O_IP_DST_IFIP) {
5408                         *rule_flags |= IPFW_RULE_F_DYNIFADDR;
5409                         cmd->arg1 &= IPFW_IFIP_SETTINGS;
5410                 }
5411
5412                 switch (cmd->opcode) {
5413                 case O_NOP:
5414                 case O_PROBE_STATE:
5415                 case O_KEEP_STATE:
5416                 case O_PROTO:
5417                 case O_IP_SRC_ME:
5418                 case O_IP_DST_ME:
5419                 case O_LAYER2:
5420                 case O_IN:
5421                 case O_FRAG:
5422                 case O_IPFRAG:
5423                 case O_IPOPT:
5424                 case O_IPLEN:
5425                 case O_IPID:
5426                 case O_IPTOS:
5427                 case O_IPPRECEDENCE:
5428                 case O_IPTTL:
5429                 case O_IPVER:
5430                 case O_TCPWIN:
5431                 case O_TCPFLAGS:
5432                 case O_TCPOPTS:
5433                 case O_ESTAB:
5434                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5435                                 goto bad_size;
5436                         break;
5437
5438                 case O_IP_SRC_TABLE:
5439                 case O_IP_DST_TABLE:
5440                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5441                                 goto bad_size;
5442                         if (cmd->arg1 >= ipfw_table_max) {
5443                                 kprintf("ipfw: invalid table id %u, max %d\n",
5444                                     cmd->arg1, ipfw_table_max);
5445                                 return EINVAL;
5446                         }
5447                         break;
5448
5449                 case O_IP_SRC_IFIP:
5450                 case O_IP_DST_IFIP:
5451                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5452                                 goto bad_size;
5453                         break;
5454
5455                 case O_ICMPCODE:
5456                 case O_ICMPTYPE:
5457                         if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5458                                 goto bad_size;
5459                         break;
5460
5461                 case O_UID:
5462                 case O_GID:
5463                 case O_IP_SRC:
5464                 case O_IP_DST:
5465                 case O_TCPSEQ:
5466                 case O_TCPACK:
5467                 case O_PROB:
5468                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5469                                 goto bad_size;
5470                         break;
5471
5472                 case O_LIMIT:
5473                         if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5474                                 goto bad_size;
5475                         break;
5476                 case O_REDIRECT:
5477                         if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5478                                 goto bad_size;
5479                         break;
5480
5481                 case O_LOG:
5482                         if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5483                                 goto bad_size;
5484
5485                         ((ipfw_insn_log *)cmd)->log_left =
5486                             ((ipfw_insn_log *)cmd)->max_log;
5487
5488                         break;
5489
5490                 case O_IP_SRC_MASK:
5491                 case O_IP_DST_MASK:
5492                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5493                                 goto bad_size;
5494                         if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5495                                 kprintf("ipfw: opcode %d, useless rule\n",
5496                                         cmd->opcode);
5497                                 return EINVAL;
5498                         }
5499                         break;
5500
5501                 case O_IP_SRC_SET:
5502                 case O_IP_DST_SET:
5503                         if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5504                                 kprintf("ipfw: invalid set size %d\n",
5505                                         cmd->arg1);
5506                                 return EINVAL;
5507                         }
5508                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5509                             (cmd->arg1+31)/32 )
5510                                 goto bad_size;
5511                         break;
5512
5513                 case O_MACADDR2:
5514                         if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5515                                 goto bad_size;
5516                         break;
5517
5518                 case O_MAC_TYPE:
5519                 case O_IP_SRCPORT:
5520                 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5521                         if (cmdlen < 2 || cmdlen > 31)
5522                                 goto bad_size;
5523                         break;
5524
5525                 case O_RECV:
5526                 case O_XMIT:
5527                 case O_VIA:
5528                         if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5529                                 goto bad_size;
5530                         break;
5531
5532                 case O_PIPE:
5533                 case O_QUEUE:
5534                         if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5535                                 goto bad_size;
5536                         goto check_action;
5537
5538                 case O_FORWARD_IP:
5539                         if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5540                                 goto bad_size;
5541                         } else {
5542                                 in_addr_t fwd_addr;
5543
5544                                 fwd_addr = ((ipfw_insn_sa *)cmd)->
5545                                            sa.sin_addr.s_addr;
5546                                 if (IN_MULTICAST(ntohl(fwd_addr))) {
5547                                         kprintf("ipfw: try forwarding to "
5548                                                 "multicast address\n");
5549                                         return EINVAL;
5550                                 }
5551                         }
5552                         goto check_action;
5553
5554                 case O_FORWARD_MAC: /* XXX not implemented yet */
5555                 case O_CHECK_STATE:
5556                 case O_COUNT:
5557                 case O_ACCEPT:
5558                 case O_DENY:
5559                 case O_REJECT:
5560                 case O_SKIPTO:
5561                 case O_DIVERT:
5562                 case O_TEE:
5563                 case O_DEFRAG:
5564                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5565                                 goto bad_size;
5566 check_action:
5567                         if (have_action) {
5568                                 kprintf("ipfw: opcode %d, multiple actions"
5569                                         " not allowed\n",
5570                                         cmd->opcode);
5571                                 return EINVAL;
5572                         }
5573                         have_action = 1;
5574                         if (l != cmdlen) {
5575                                 kprintf("ipfw: opcode %d, action must be"
5576                                         " last opcode\n",
5577                                         cmd->opcode);
5578                                 return EINVAL;
5579                         }
5580                         break;
5581                 default:
5582                         kprintf("ipfw: opcode %d, unknown opcode\n",
5583                                 cmd->opcode);
5584                         return EINVAL;
5585                 }
5586         }
5587         if (have_action == 0) {
5588                 kprintf("ipfw: missing action\n");
5589                 return EINVAL;
5590         }
5591         return 0;
5592
5593 bad_size:
5594         kprintf("ipfw: opcode %d size %d wrong\n",
5595                 cmd->opcode, cmdlen);
5596         return EINVAL;
5597 }
5598
5599 static int
5600 ipfw_ctl_add_rule(struct sockopt *sopt)
5601 {
5602         struct ipfw_ioc_rule *ioc_rule;
5603         size_t size;
5604         uint32_t rule_flags;
5605         int error;
5606
5607         ASSERT_NETISR0;
5608         
5609         size = sopt->sopt_valsize;
5610         if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5611             size < sizeof(*ioc_rule)) {
5612                 return EINVAL;
5613         }
5614         if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5615                 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5616                                           IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5617         }
5618         ioc_rule = sopt->sopt_val;
5619
5620         error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5621         if (error)
5622                 return error;
5623
5624         ipfw_add_rule(ioc_rule, rule_flags);
5625
5626         if (sopt->sopt_dir == SOPT_GET)
5627                 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5628         return 0;
5629 }
5630
5631 static void *
5632 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5633     struct ipfw_ioc_rule *ioc_rule)
5634 {
5635         const struct ip_fw *sibling;
5636 #ifdef INVARIANTS
5637         int i;
5638 #endif
5639
5640         ASSERT_NETISR0;
5641         KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5642
5643         ioc_rule->act_ofs = rule->act_ofs;
5644         ioc_rule->cmd_len = rule->cmd_len;
5645         ioc_rule->rulenum = rule->rulenum;
5646         ioc_rule->set = rule->set;
5647         ioc_rule->usr_flags = rule->usr_flags;
5648
5649         ioc_rule->set_disable = ctx->ipfw_set_disable;
5650         ioc_rule->static_count = static_count;
5651         ioc_rule->static_len = static_ioc_len;
5652
5653         /*
5654          * Visit (read-only) all of the rule's duplications to get
5655          * the necessary statistics
5656          */
5657 #ifdef INVARIANTS
5658         i = 0;
5659 #endif
5660         ioc_rule->pcnt = 0;
5661         ioc_rule->bcnt = 0;
5662         ioc_rule->timestamp = 0;
5663         for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5664                 ioc_rule->pcnt += sibling->pcnt;
5665                 ioc_rule->bcnt += sibling->bcnt;
5666                 if (sibling->timestamp > ioc_rule->timestamp)
5667                         ioc_rule->timestamp = sibling->timestamp;
5668 #ifdef INVARIANTS
5669                 ++i;
5670 #endif
5671         }
5672         KASSERT(i == netisr_ncpus,
5673             ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5674
5675         bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5676
5677         return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5678 }
5679
5680 static boolean_t
5681 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5682 {
5683         struct ipfw_ioc_flowid *ioc_id;
5684
5685         if (trk->tc_expire == 0) {
5686                 /* Not a scanned one. */
5687                 return (FALSE);
5688         }
5689
5690         ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5691             0 : trk->tc_expire - time_uptime;
5692         ioc_state->pcnt = 0;
5693         ioc_state->bcnt = 0;
5694
5695         ioc_state->dyn_type = O_LIMIT_PARENT;
5696         ioc_state->count = trk->tc_count;
5697
5698         ioc_state->rulenum = trk->tc_rulenum;
5699
5700         ioc_id = &ioc_state->id;
5701         ioc_id->type = ETHERTYPE_IP;
5702         ioc_id->u.ip.proto = trk->tc_proto;
5703         ioc_id->u.ip.src_ip = trk->tc_saddr;
5704         ioc_id->u.ip.dst_ip = trk->tc_daddr;
5705         ioc_id->u.ip.src_port = trk->tc_sport;
5706         ioc_id->u.ip.dst_port = trk->tc_dport;
5707
5708         return (TRUE);
5709 }
5710
5711 static boolean_t
5712 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5713 {
5714         struct ipfw_ioc_flowid *ioc_id;
5715
5716         if (IPFW_STATE_SCANSKIP(s))
5717                 return (FALSE);
5718
5719         ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5720             0 : s->st_expire - time_uptime;
5721         ioc_state->pcnt = s->st_pcnt;
5722         ioc_state->bcnt = s->st_bcnt;
5723
5724         ioc_state->dyn_type = s->st_type;
5725         ioc_state->count = 0;
5726
5727         ioc_state->rulenum = s->st_rule->rulenum;
5728
5729         ioc_id = &ioc_state->id;
5730         ioc_id->type = ETHERTYPE_IP;
5731         ioc_id->u.ip.proto = s->st_proto;
5732         ipfw_key_4tuple(&s->st_key,
5733             &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5734             &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5735
5736         if (IPFW_ISXLAT(s->st_type)) {
5737                 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5738
5739                 if (x->xlat_port == 0)
5740                         ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5741                 else
5742                         ioc_state->xlat_port = ntohs(x->xlat_port);
5743                 ioc_state->xlat_addr = ntohl(x->xlat_addr);
5744
5745                 ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5746                 ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5747         }
5748
5749         return (TRUE);
5750 }
5751
5752 static void
5753 ipfw_state_copy_dispatch(netmsg_t nmsg)
5754 {
5755         struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5756         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5757         const struct ipfw_state *s;
5758         const struct ipfw_track *t;
5759
5760         ASSERT_NETISR_NCPUS(mycpuid);
5761         KASSERT(nm->state_cnt < nm->state_cntmax,
5762             ("invalid state count %d, max %d",
5763              nm->state_cnt, nm->state_cntmax));
5764
5765         TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5766                 if (ipfw_state_copy(s, nm->ioc_state)) {
5767                         nm->ioc_state++;
5768                         nm->state_cnt++;
5769                         if (nm->state_cnt == nm->state_cntmax)
5770                                 goto done;
5771                 }
5772         }
5773
5774         /*
5775          * Prepare tracks in the global track tree for userland.
5776          */
5777         TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5778                 struct ipfw_trkcnt *trk;
5779
5780                 if (t->t_count == NULL) /* anchor */
5781                         continue;
5782                 trk = t->t_trkcnt;
5783
5784                 /*
5785                  * Only one netisr can run this function at
5786                  * any time, and only this function accesses
5787                  * trkcnt's tc_expire, so this is safe w/o
5788                  * ipfw_gd.ipfw_trkcnt_token.
5789                  */
5790                 if (trk->tc_expire > t->t_expire)
5791                         continue;
5792                 trk->tc_expire = t->t_expire;
5793         }
5794
5795         /*
5796          * Copy tracks in the global track tree to userland in
5797          * the last netisr.
5798          */
5799         if (mycpuid == netisr_ncpus - 1) {
5800                 struct ipfw_trkcnt *trk;
5801
5802                 KASSERT(nm->state_cnt < nm->state_cntmax,
5803                     ("invalid state count %d, max %d",
5804                      nm->state_cnt, nm->state_cntmax));
5805
5806                 IPFW_TRKCNT_TOKGET;
5807                 RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5808                         if (ipfw_track_copy(trk, nm->ioc_state)) {
5809                                 nm->ioc_state++;
5810                                 nm->state_cnt++;
5811                                 if (nm->state_cnt == nm->state_cntmax) {
5812                                         IPFW_TRKCNT_TOKREL;
5813                                         goto done;
5814                                 }
5815                         }
5816                 }
5817                 IPFW_TRKCNT_TOKREL;
5818         }
5819 done:
5820         if (nm->state_cnt == nm->state_cntmax) {
5821                 /* No more space; done. */
5822                 netisr_replymsg(&nm->base, 0);
5823         } else {
5824                 netisr_forwardmsg(&nm->base, mycpuid + 1);
5825         }
5826 }
5827
5828 static int
5829 ipfw_ctl_get_rules(struct sockopt *sopt)
5830 {
5831         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5832         struct ip_fw *rule;
5833         void *bp;
5834         size_t size;
5835         int state_cnt;
5836
5837         ASSERT_NETISR0;
5838
5839         /*
5840          * pass up a copy of the current rules. Static rules
5841          * come first (the last of which has number IPFW_DEFAULT_RULE),
5842          * followed by a possibly empty list of states.
5843          */
5844
5845         size = static_ioc_len;  /* size of static rules */
5846
5847         /*
5848          * Size of the states.
5849          * XXX take tracks as state for userland compat.
5850          */
5851         state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5852         state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5853         size += state_cnt * sizeof(struct ipfw_ioc_state);
5854
5855         if (sopt->sopt_valsize < size) {
5856                 /* short length, no need to return incomplete rules */
5857                 /* XXX: if superuser, no need to zero buffer */
5858                 bzero(sopt->sopt_val, sopt->sopt_valsize); 
5859                 return 0;
5860         }
5861         bp = sopt->sopt_val;
5862
5863         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5864                 bp = ipfw_copy_rule(ctx, rule, bp);
5865
5866         if (state_cnt) {
5867                 struct netmsg_cpstate nm;
5868 #ifdef INVARIANTS
5869                 size_t old_size = size;
5870 #endif
5871
5872                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5873                     MSGF_PRIORITY, ipfw_state_copy_dispatch);
5874                 nm.ioc_state = bp;
5875                 nm.state_cntmax = state_cnt;
5876                 nm.state_cnt = 0;
5877                 netisr_domsg_global(&nm.base);
5878
5879                 /*
5880                  * The # of states may be shrinked after the snapshot
5881                  * of the state count was taken.  To give user a correct
5882                  * state count, nm->state_cnt is used to recalculate
5883                  * the actual size.
5884                  */
5885                 size = static_ioc_len +
5886                     (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5887                 KKASSERT(size <= old_size);
5888         }
5889
5890         sopt->sopt_valsize = size;
5891         return 0;
5892 }
5893
5894 static void
5895 ipfw_set_disable_dispatch(netmsg_t nmsg)
5896 {
5897         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5898
5899         ASSERT_NETISR_NCPUS(mycpuid);
5900
5901         ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5902         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5903 }
5904
5905 static void
5906 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5907 {
5908         struct netmsg_base nmsg;
5909         uint32_t set_disable;
5910
5911         ASSERT_NETISR0;
5912
5913         /* IPFW_DEFAULT_SET is always enabled */
5914         enable |= (1 << IPFW_DEFAULT_SET);
5915         set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5916
5917         bzero(&nmsg, sizeof(nmsg));
5918         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5919             ipfw_set_disable_dispatch);
5920         nmsg.lmsg.u.ms_result32 = set_disable;
5921
5922         netisr_domsg_global(&nmsg);
5923 }
5924
5925 static void
5926 ipfw_table_create_dispatch(netmsg_t nm)
5927 {
5928         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5929         int tblid = nm->lmsg.u.ms_result;
5930
5931         ASSERT_NETISR_NCPUS(mycpuid);
5932
5933         if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5934             rn_cpumaskhead(mycpuid), 32))
5935                 panic("ipfw: create table%d failed", tblid);
5936
5937         netisr_forwardmsg(&nm->base, mycpuid + 1);
5938 }
5939
5940 static int
5941 ipfw_table_create(struct sockopt *sopt)
5942 {
5943         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5944         struct ipfw_ioc_table *tbl;
5945         struct netmsg_base nm;
5946
5947         ASSERT_NETISR0;
5948
5949         if (sopt->sopt_valsize != sizeof(*tbl))
5950                 return (EINVAL);
5951
5952         tbl = sopt->sopt_val;
5953         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5954                 return (EINVAL);
5955
5956         if (ctx->ipfw_tables[tbl->tableid] != NULL)
5957                 return (EEXIST);
5958
5959         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5960             ipfw_table_create_dispatch);
5961         nm.lmsg.u.ms_result = tbl->tableid;
5962         netisr_domsg_global(&nm);
5963
5964         return (0);
5965 }
5966
5967 static void
5968 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5969 {
5970         struct radix_node *ret;
5971
5972         ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5973         if (ret != rn)
5974                 panic("deleted other table entry");
5975         kfree(ret, M_IPFW);
5976 }
5977
/*
 * Tree-walk callback: delete and free the visited entry.  'xrnh' is the
 * radix head the entry lives in.  Always returns 0.
 */
static int
ipfw_table_killent(struct radix_node *rn, void *xrnh)
{
	struct radix_node_head *rnh = xrnh;

	ipfw_table_killrn(rnh, rn);
	return (0);
}
5985
5986 static void
5987 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5988     int destroy)
5989 {
5990         struct radix_node_head *rnh;
5991
5992         ASSERT_NETISR_NCPUS(mycpuid);
5993
5994         rnh = ctx->ipfw_tables[tableid];
5995         rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5996         if (destroy) {
5997                 Free(rnh);
5998                 ctx->ipfw_tables[tableid] = NULL;
5999         }
6000 }
6001
6002 static void
6003 ipfw_table_flush_dispatch(netmsg_t nmsg)
6004 {
6005         struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
6006         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6007
6008         ASSERT_NETISR_NCPUS(mycpuid);
6009
6010         ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
6011         netisr_forwardmsg(&nm->base, mycpuid + 1);
6012 }
6013
6014 static void
6015 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
6016 {
6017         int i;
6018
6019         ASSERT_NETISR_NCPUS(mycpuid);
6020
6021         for (i = 0; i < ipfw_table_max; ++i) {
6022                 if (ctx->ipfw_tables[i] != NULL)
6023                         ipfw_table_flush_oncpu(ctx, i, destroy);
6024         }
6025 }
6026
6027 static void
6028 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6029 {
6030         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6031
6032         ASSERT_NETISR_NCPUS(mycpuid);
6033
6034         ipfw_table_flushall_oncpu(ctx, 0);
6035         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6036 }
6037
6038 static int
6039 ipfw_table_flush(struct sockopt *sopt)
6040 {
6041         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6042         struct ipfw_ioc_table *tbl;
6043         struct netmsg_tblflush nm;
6044
6045         ASSERT_NETISR0;
6046
6047         if (sopt->sopt_valsize != sizeof(*tbl))
6048                 return (EINVAL);
6049
6050         tbl = sopt->sopt_val;
6051         if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6052                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6053                     MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6054                 netisr_domsg_global(&nm.base);
6055                 return (0);
6056         }
6057
6058         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6059                 return (EINVAL);
6060
6061         if (ctx->ipfw_tables[tbl->tableid] == NULL)
6062                 return (ENOENT);
6063
6064         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6065             ipfw_table_flush_dispatch);
6066         nm.tableid = tbl->tableid;
6067         nm.destroy = 0;
6068         if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6069                 nm.destroy = 1;
6070         netisr_domsg_global(&nm.base);
6071
6072         return (0);
6073 }
6074
6075 static int
6076 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6077 {
6078         int *cnt = xcnt;
6079
6080         (*cnt)++;
6081         return (0);
6082 }
6083
6084 static int
6085 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6086 {
6087         struct ipfw_table_cp *cp = xcp;
6088         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6089         struct ipfw_ioc_tblent *ioc_te;
6090 #ifdef INVARIANTS
6091         int cnt;
6092 #endif
6093
6094         KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6095             cp->te_idx, cp->te_cnt));
6096         ioc_te = &cp->te[cp->te_idx];
6097
6098         if (te->te_nodes->rn_mask != NULL) {
6099                 memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6100                     *te->te_nodes->rn_mask);
6101         } else {
6102                 ioc_te->netmask.sin_len = 0;
6103         }
6104         memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6105
6106         ioc_te->use = te->te_use;
6107         ioc_te->last_used = te->te_lastuse;
6108 #ifdef INVARIANTS
6109         cnt = 1;
6110 #endif
6111
6112         while ((te = te->te_sibling) != NULL) {
6113 #ifdef INVARIANTS
6114                 ++cnt;
6115 #endif
6116                 ioc_te->use += te->te_use;
6117                 if (te->te_lastuse > ioc_te->last_used)
6118                         ioc_te->last_used = te->te_lastuse;
6119         }
6120         KASSERT(cnt == netisr_ncpus,
6121             ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6122
6123         cp->te_idx++;
6124
6125         return (0);
6126 }
6127
6128 static int
6129 ipfw_table_get(struct sockopt *sopt)
6130 {
6131         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6132         struct radix_node_head *rnh;
6133         struct ipfw_ioc_table *tbl;
6134         struct ipfw_ioc_tblcont *cont;
6135         struct ipfw_table_cp cp;
6136         int cnt = 0, sz;
6137
6138         ASSERT_NETISR0;
6139
6140         if (sopt->sopt_valsize < sizeof(*tbl))
6141                 return (EINVAL);
6142
6143         tbl = sopt->sopt_val;
6144         if (tbl->tableid < 0) {
6145                 struct ipfw_ioc_tbllist *list;
6146                 int i;
6147
6148                 /*
6149                  * List available table ids.
6150                  */
6151                 for (i = 0; i < ipfw_table_max; ++i) {
6152                         if (ctx->ipfw_tables[i] != NULL)
6153                                 ++cnt;
6154                 }
6155
6156                 sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6157                 if (sopt->sopt_valsize < sz) {
6158                         bzero(sopt->sopt_val, sopt->sopt_valsize);
6159                         return (E2BIG);
6160                 }
6161                 list = sopt->sopt_val;
6162                 list->tablecnt = cnt;
6163
6164                 cnt = 0;
6165                 for (i = 0; i < ipfw_table_max; ++i) {
6166                         if (ctx->ipfw_tables[i] != NULL) {
6167                                 KASSERT(cnt < list->tablecnt,
6168                                     ("invalid idx %d, cnt %d",
6169                                      cnt, list->tablecnt));
6170                                 list->tables[cnt++] = i;
6171                         }
6172                 }
6173                 sopt->sopt_valsize = sz;
6174                 return (0);
6175         } else if (tbl->tableid >= ipfw_table_max) {
6176                 return (EINVAL);
6177         }
6178
6179         rnh = ctx->ipfw_tables[tbl->tableid];
6180         if (rnh == NULL)
6181                 return (ENOENT);
6182         rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6183
6184         sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6185         if (sopt->sopt_valsize < sz) {
6186                 bzero(sopt->sopt_val, sopt->sopt_valsize);
6187                 return (E2BIG);
6188         }
6189         cont = sopt->sopt_val;
6190         cont->entcnt = cnt;
6191
6192         cp.te = cont->ent;
6193         cp.te_idx = 0;
6194         cp.te_cnt = cnt;
6195         rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6196
6197         sopt->sopt_valsize = sz;
6198         return (0);
6199 }
6200
6201 static void
6202 ipfw_table_add_dispatch(netmsg_t nmsg)
6203 {
6204         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6205         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6206         struct radix_node_head *rnh;
6207         struct ipfw_tblent *te;
6208
6209         ASSERT_NETISR_NCPUS(mycpuid);
6210
6211         rnh = ctx->ipfw_tables[nm->tableid];
6212
6213         te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6214         te->te_nodes->rn_key = (char *)&te->te_key;
6215         memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6216
6217         if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6218             te->te_nodes) == NULL) {
6219                 if (mycpuid == 0) {
6220                         kfree(te, M_IPFW);
6221                         netisr_replymsg(&nm->base, EEXIST);
6222                         return;
6223                 }
6224                 panic("rnh_addaddr failed");
6225         }
6226
6227         /* Link siblings. */
6228         if (nm->sibling != NULL)
6229                 nm->sibling->te_sibling = te;
6230         nm->sibling = te;
6231
6232         netisr_forwardmsg(&nm->base, mycpuid + 1);
6233 }
6234
6235 static void
6236 ipfw_table_del_dispatch(netmsg_t nmsg)
6237 {
6238         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6239         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6240         struct radix_node_head *rnh;
6241         struct radix_node *rn;
6242
6243         ASSERT_NETISR_NCPUS(mycpuid);
6244
6245         rnh = ctx->ipfw_tables[nm->tableid];
6246         rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6247         if (rn == NULL) {
6248                 if (mycpuid == 0) {
6249                         netisr_replymsg(&nm->base, ESRCH);
6250                         return;
6251                 }
6252                 panic("rnh_deladdr failed");
6253         }
6254         kfree(rn, M_IPFW);
6255
6256         netisr_forwardmsg(&nm->base, mycpuid + 1);
6257 }
6258
6259 static int
6260 ipfw_table_alt(struct sockopt *sopt)
6261 {
6262         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6263         struct ipfw_ioc_tblcont *tbl;
6264         struct ipfw_ioc_tblent *te;
6265         struct sockaddr_in key0;
6266         struct sockaddr *netmask = NULL, *key;
6267         struct netmsg_tblent nm;
6268
6269         ASSERT_NETISR0;
6270
6271         if (sopt->sopt_valsize != sizeof(*tbl))
6272                 return (EINVAL);
6273         tbl = sopt->sopt_val;
6274
6275         if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
6276                 return (EINVAL);
6277         if (tbl->entcnt != 1)
6278                 return (EINVAL);
6279
6280         if (ctx->ipfw_tables[tbl->tableid] == NULL)
6281                 return (ENOENT);
6282         te = &tbl->ent[0];
6283
6284         if (te->key.sin_family != AF_INET ||
6285             te->key.sin_port != 0 ||
6286             te->key.sin_len != sizeof(struct sockaddr_in))
6287                 return (EINVAL);
6288         key = (struct sockaddr *)&te->key;
6289
6290         if (te->netmask.sin_len != 0) {
6291                 if (te->netmask.sin_port != 0 ||
6292                     te->netmask.sin_len > sizeof(struct sockaddr_in))
6293                         return (EINVAL);
6294                 netmask = (struct sockaddr *)&te->netmask;
6295                 sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6296                 key = (struct sockaddr *)&key0;
6297         }
6298
6299         if (sopt->sopt_name == IP_FW_TBL_ADD) {
6300                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6301                     MSGF_PRIORITY, ipfw_table_add_dispatch);
6302         } else {
6303                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6304                     MSGF_PRIORITY, ipfw_table_del_dispatch);
6305         }
6306         nm.key = key;
6307         nm.netmask = netmask;
6308         nm.tableid = tbl->tableid;
6309         nm.sibling = NULL;
6310         return (netisr_domsg_global(&nm.base));
6311 }
6312
6313 static int
6314 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6315 {
6316         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6317
6318         te->te_use = 0;
6319         te->te_lastuse = 0;
6320         return (0);
6321 }
6322
6323 static void
6324 ipfw_table_zero_dispatch(netmsg_t nmsg)
6325 {
6326         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6327         struct radix_node_head *rnh;
6328
6329         ASSERT_NETISR_NCPUS(mycpuid);
6330
6331         rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6332         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6333
6334         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6335 }
6336
6337 static void
6338 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6339 {
6340         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6341         int i;
6342
6343         ASSERT_NETISR_NCPUS(mycpuid);
6344
6345         for (i = 0; i < ipfw_table_max; ++i) {
6346                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6347
6348                 if (rnh != NULL)
6349                         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6350         }
6351         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6352 }
6353
6354 static int
6355 ipfw_table_zero(struct sockopt *sopt)
6356 {
6357         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6358         struct netmsg_base nm;
6359         struct ipfw_ioc_table *tbl;
6360
6361         ASSERT_NETISR0;
6362
6363         if (sopt->sopt_valsize != sizeof(*tbl))
6364                 return (EINVAL);
6365         tbl = sopt->sopt_val;
6366
6367         if (tbl->tableid < 0) {
6368                 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6369                     ipfw_table_zeroall_dispatch);
6370                 netisr_domsg_global(&nm);
6371                 return (0);
6372         } else if (tbl->tableid >= ipfw_table_max) {
6373                 return (EINVAL);
6374         } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6375                 return (ENOENT);
6376         }
6377
6378         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6379             ipfw_table_zero_dispatch);
6380         nm.lmsg.u.ms_result = tbl->tableid;
6381         netisr_domsg_global(&nm);
6382
6383         return (0);
6384 }
6385
6386 static int
6387 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6388 {
6389         struct netmsg_tblexp *nm = xnm;
6390         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6391
6392         if (te->te_expired) {
6393                 ipfw_table_killrn(nm->rnh, rn);
6394                 nm->expcnt++;
6395         }
6396         return (0);
6397 }
6398
6399 static void
6400 ipfw_table_expire_dispatch(netmsg_t nmsg)
6401 {
6402         struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6403         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6404         struct radix_node_head *rnh;
6405
6406         ASSERT_NETISR_NCPUS(mycpuid);
6407
6408         rnh = ctx->ipfw_tables[nm->tableid];
6409         nm->rnh = rnh;
6410         rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6411
6412         KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6413             ("not all expired addresses (%d) were deleted (%d)",
6414              nm->cnt * (mycpuid + 1), nm->expcnt));
6415
6416         netisr_forwardmsg(&nm->base, mycpuid + 1);
6417 }
6418
6419 static void
6420 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6421 {
6422         struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6423         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6424         int i;
6425
6426         ASSERT_NETISR_NCPUS(mycpuid);
6427
6428         for (i = 0; i < ipfw_table_max; ++i) {
6429                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6430
6431                 if (rnh == NULL)
6432                         continue;
6433                 nm->rnh = rnh;
6434                 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6435         }
6436
6437         KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6438             ("not all expired addresses (%d) were deleted (%d)",
6439              nm->cnt * (mycpuid + 1), nm->expcnt));
6440
6441         netisr_forwardmsg(&nm->base, mycpuid + 1);
6442 }
6443
6444 static int
6445 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6446 {
6447         struct netmsg_tblexp *nm = xnm;
6448         struct ipfw_tblent *te;
6449         time_t lastuse;
6450
6451         te = (struct ipfw_tblent *)rn;
6452         lastuse = te->te_lastuse;
6453
6454         while ((te = te->te_sibling) != NULL) {
6455                 if (te->te_lastuse > lastuse)
6456                         lastuse = te->te_lastuse;
6457         }
6458         if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6459                 /* Not expired */
6460                 return (0);
6461         }
6462
6463         te = (struct ipfw_tblent *)rn;
6464         te->te_expired = 1;
6465         while ((te = te->te_sibling) != NULL)
6466                 te->te_expired = 1;
6467         nm->cnt++;
6468
6469         return (0);
6470 }
6471
6472 static int
6473 ipfw_table_expire(struct sockopt *sopt)
6474 {
6475         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6476         struct netmsg_tblexp nm;
6477         struct ipfw_ioc_tblexp *tbl;
6478         struct radix_node_head *rnh;
6479
6480         ASSERT_NETISR0;
6481
6482         if (sopt->sopt_valsize != sizeof(*tbl))
6483                 return (EINVAL);
6484         tbl = sopt->sopt_val;
6485         tbl->expcnt = 0;
6486
6487         nm.expcnt = 0;
6488         nm.cnt = 0;
6489         nm.expire = tbl->expire;
6490
6491         if (tbl->tableid < 0) {
6492                 int i;
6493
6494                 for (i = 0; i < ipfw_table_max; ++i) {
6495                         rnh = ctx->ipfw_tables[i];
6496                         if (rnh == NULL)
6497                                 continue;
6498                         rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6499                 }
6500                 if (nm.cnt == 0) {
6501                         /* No addresses can be expired. */
6502                         return (0);
6503                 }
6504                 tbl->expcnt = nm.cnt;
6505
6506                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6507                     MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6508                 nm.tableid = -1;
6509                 netisr_domsg_global(&nm.base);
6510                 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6511                     ("not all expired addresses (%d) were deleted (%d)",
6512                      nm.cnt * netisr_ncpus, nm.expcnt));
6513
6514                 return (0);
6515         } else if (tbl->tableid >= ipfw_table_max) {
6516                 return (EINVAL);
6517         }
6518
6519         rnh = ctx->ipfw_tables[tbl->tableid];
6520         if (rnh == NULL)
6521                 return (ENOENT);
6522         rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6523         if (nm.cnt == 0) {
6524                 /* No addresses can be expired. */
6525                 return (0);
6526         }
6527         tbl->expcnt = nm.cnt;
6528
6529         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6530             ipfw_table_expire_dispatch);
6531         nm.tableid = tbl->tableid;
6532         netisr_domsg_global(&nm.base);
6533         KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6534             ("not all expired addresses (%d) were deleted (%d)",
6535              nm.cnt * netisr_ncpus, nm.expcnt));
6536         return (0);
6537 }
6538
6539 static void
6540 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6541 {
6542         struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6543
6544         KKASSERT((rule->rule_flags &
6545             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6546             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6547         ipfw_free_rule(rule);
6548
6549         netisr_replymsg(&nmsg->base, 0);
6550 }
6551
6552 static void
6553 ipfw_crossref_reap(void)
6554 {
6555         struct ip_fw *rule, *prev = NULL;
6556
6557         ASSERT_NETISR0;
6558
6559         rule = ipfw_gd.ipfw_crossref_free;
6560         while (rule != NULL) {
6561                 uint64_t inflight = 0;
6562                 int i;
6563
6564                 for (i = 0; i < netisr_ncpus; ++i)
6565                         inflight += rule->cross_rules[i]->cross_refs;
6566                 if (inflight == 0) {
6567                         struct ip_fw *f = rule;
6568
6569                         /*
6570                          * Unlink.
6571                          */
6572                         rule = rule->next;
6573                         if (prev != NULL)
6574                                 prev->next = rule;
6575                         else
6576                                 ipfw_gd.ipfw_crossref_free = rule;
6577
6578                         /*
6579                          * Free.
6580                          */
6581                         for (i = 1; i < netisr_ncpus; ++i) {
6582                                 struct netmsg_base nm;
6583
6584                                 netmsg_init(&nm, NULL, &curthread->td_msgport,
6585                                     MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6586                                 nm.lmsg.u.ms_resultp = f->cross_rules[i];
6587                                 netisr_domsg(&nm, i);
6588                         }
6589                         KKASSERT((f->rule_flags &
6590                             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6591                             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6592                         ipfw_unref_rule(f);
6593                 } else {
6594                         prev = rule;
6595                         rule = rule->next;
6596                 }
6597         }
6598
6599         if (ipfw_gd.ipfw_crossref_free != NULL) {
6600                 callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6601                     ipfw_crossref_timeo, NULL);
6602         }
6603 }
6604
6605 /*
6606  * {set|get}sockopt parser.
6607  */
6608 static int
6609 ipfw_ctl(struct sockopt *sopt)
6610 {
6611         int error, rulenum;
6612         uint32_t *masks;
6613         size_t size;
6614
6615         ASSERT_NETISR0;
6616
6617         error = 0;
6618
6619         switch (sopt->sopt_name) {
6620         case IP_FW_GET:
6621                 error = ipfw_ctl_get_rules(sopt);
6622                 break;
6623
6624         case IP_FW_FLUSH:
6625                 ipfw_flush(0 /* keep default rule */);
6626                 break;
6627
6628         case IP_FW_ADD:
6629                 error = ipfw_ctl_add_rule(sopt);
6630                 break;
6631
6632         case IP_FW_DEL:
6633                 /*
6634                  * IP_FW_DEL is used for deleting single rules or sets,
6635                  * and (ab)used to atomically manipulate sets.
6636                  * Argument size is used to distinguish between the two:
6637                  *    sizeof(uint32_t)
6638                  *      delete single rule or set of rules,
6639                  *      or reassign rules (or sets) to a different set.
6640                  *    2 * sizeof(uint32_t)
6641                  *      atomic disable/enable sets.
6642                  *      first uint32_t contains sets to be disabled,
6643                  *      second uint32_t contains sets to be enabled.
6644                  */
6645                 masks = sopt->sopt_val;
6646                 size = sopt->sopt_valsize;
6647                 if (size == sizeof(*masks)) {
6648                         /*
6649                          * Delete or reassign static rule
6650                          */
6651                         error = ipfw_ctl_alter(masks[0]);
6652                 } else if (size == (2 * sizeof(*masks))) {
6653                         /*
6654                          * Set enable/disable
6655                          */
6656                         ipfw_ctl_set_disable(masks[0], masks[1]);
6657                 } else {
6658                         error = EINVAL;
6659                 }
6660                 break;
6661
6662         case IP_FW_ZERO:
6663         case IP_FW_RESETLOG: /* argument is an int, the rule number */
6664                 rulenum = 0;
6665
6666                 if (sopt->sopt_val != 0) {
6667                     error = soopt_to_kbuf(sopt, &rulenum,
6668                             sizeof(int), sizeof(int));
6669                     if (error)
6670                         break;
6671                 }
6672                 error = ipfw_ctl_zero_entry(rulenum,
6673                         sopt->sopt_name == IP_FW_RESETLOG);
6674                 break;
6675
6676         case IP_FW_TBL_CREATE:
6677                 error = ipfw_table_create(sopt);
6678                 break;
6679
6680         case IP_FW_TBL_ADD:
6681         case IP_FW_TBL_DEL:
6682                 error = ipfw_table_alt(sopt);
6683                 break;
6684
6685         case IP_FW_TBL_FLUSH:
6686         case IP_FW_TBL_DESTROY:
6687                 error = ipfw_table_flush(sopt);
6688                 break;
6689
6690         case IP_FW_TBL_GET:
6691                 error = ipfw_table_get(sopt);
6692                 break;
6693
6694         case IP_FW_TBL_ZERO:
6695                 error = ipfw_table_zero(sopt);
6696                 break;
6697
6698         case IP_FW_TBL_EXPIRE:
6699                 error = ipfw_table_expire(sopt);
6700                 break;
6701
6702         default:
6703                 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6704                 error = EINVAL;
6705         }
6706
6707         ipfw_crossref_reap();
6708         return error;
6709 }
6710
6711 static void
6712 ipfw_keepalive_done(struct ipfw_context *ctx)
6713 {
6714
6715         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6716             ("keepalive is not in progress"));
6717         ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6718         callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6719             ipfw_keepalive, NULL);
6720 }
6721
6722 static void
6723 ipfw_keepalive_more(struct ipfw_context *ctx)
6724 {
6725         struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6726
6727         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6728             ("keepalive is not in progress"));
6729         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6730             ("keepalive more did not finish"));
6731         netisr_sendmsg_oncpu(nm);
6732 }
6733
6734 static void
6735 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6736 {
6737         struct ipfw_state *s;
6738         int scanned = 0, expired = 0, kept = 0;
6739
6740         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6741             ("keepalive is not in progress"));
6742
6743         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6744                 uint32_t ack_rev, ack_fwd;
6745                 struct ipfw_flow_id id;
6746                 uint8_t send_dir;
6747
6748                 if (scanned++ >= ipfw_state_scan_max) {
6749                         ipfw_keepalive_more(ctx);
6750                         return;
6751                 }
6752
6753                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6754                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6755
6756                 /*
6757                  * NOTE:
6758                  * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6759                  * on slave xlat.
6760                  */
6761                 if (s->st_type == O_ANCHOR)
6762                         continue;
6763
6764                 if (IPFW_STATE_ISDEAD(s)) {
6765                         ipfw_state_remove(ctx, s);
6766                         if (++expired >= ipfw_state_expire_max) {
6767                                 ipfw_keepalive_more(ctx);
6768                                 return;
6769                         }
6770                         continue;
6771                 }
6772
6773                 /*
6774                  * Keep alive processing
6775                  */
6776
6777                 if (s->st_proto != IPPROTO_TCP)
6778                         continue;
6779                 if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6780                         continue;
6781                 if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6782                     s->st_expire))
6783                         continue;       /* too early */
6784
6785                 ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6786                     &id.dst_ip, &id.dst_port);
6787                 ack_rev = s->st_ack_rev;
6788                 ack_fwd = s->st_ack_fwd;
6789
6790 #define SEND_FWD        0x1
6791 #define SEND_REV        0x2
6792
6793                 if (IPFW_ISXLAT(s->st_type)) {
6794                         const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6795
6796                         if (x->xlat_dir == MATCH_FORWARD)
6797                                 send_dir = SEND_FWD;
6798                         else
6799                                 send_dir = SEND_REV;
6800                 } else {
6801                         send_dir = SEND_FWD | SEND_REV;
6802                 }
6803
6804                 if (send_dir & SEND_REV)
6805                         send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6806                 if (send_dir & SEND_FWD)
6807                         send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6808
6809 #undef SEND_FWD
6810 #undef SEND_REV
6811
6812                 if (++kept >= ipfw_keepalive_max) {
6813                         ipfw_keepalive_more(ctx);
6814                         return;
6815                 }
6816         }
6817         TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6818         ipfw_keepalive_done(ctx);
6819 }
6820
6821 static void
6822 ipfw_keepalive_more_dispatch(netmsg_t nm)
6823 {
6824         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6825         struct ipfw_state *anchor;
6826
6827         ASSERT_NETISR_NCPUS(mycpuid);
6828         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6829             ("keepalive is not in progress"));
6830
6831         /* Reply ASAP */
6832         netisr_replymsg(&nm->base, 0);
6833
6834         anchor = &ctx->ipfw_keepalive_anch;
6835         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6836                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6837                 ipfw_keepalive_done(ctx);
6838                 return;
6839         }
6840         ipfw_keepalive_loop(ctx, anchor);
6841 }
6842
6843 /*
6844  * This procedure is only used to handle keepalives. It is invoked
6845  * every dyn_keepalive_period
6846  */
6847 static void
6848 ipfw_keepalive_dispatch(netmsg_t nm)
6849 {
6850         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6851         struct ipfw_state *anchor;
6852
6853         ASSERT_NETISR_NCPUS(mycpuid);
6854         KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6855             ("keepalive is in progress"));
6856         ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6857
6858         /* Reply ASAP */
6859         crit_enter();
6860         netisr_replymsg(&nm->base, 0);
6861         crit_exit();
6862
6863         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6864                 ipfw_keepalive_done(ctx);
6865                 return;
6866         }
6867
6868         anchor = &ctx->ipfw_keepalive_anch;
6869         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6870         ipfw_keepalive_loop(ctx, anchor);
6871 }
6872
6873 /*
6874  * This procedure is only used to handle keepalives. It is invoked
6875  * every dyn_keepalive_period
6876  */
6877 static void
6878 ipfw_keepalive(void *dummy __unused)
6879 {
6880         struct netmsg_base *msg;
6881
6882         KKASSERT(mycpuid < netisr_ncpus);
6883         msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6884
6885         crit_enter();
6886         if (msg->lmsg.ms_flags & MSGF_DONE)
6887                 netisr_sendmsg_oncpu(msg);
6888         crit_exit();
6889 }
6890
6891 static void
6892 ipfw_ip_input_dispatch(netmsg_t nmsg)
6893 {
6894         struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6895         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6896         struct mbuf *m = nm->m;
6897         struct ip_fw *rule = nm->arg1;
6898
6899         ASSERT_NETISR_NCPUS(mycpuid);
6900         KASSERT(rule->cpuid == mycpuid,
6901             ("rule does not belong to cpu%d", mycpuid));
6902         KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6903             ("mbuf does not have ipfw continue rule"));
6904
6905         KASSERT(ctx->ipfw_cont_rule == NULL,
6906             ("pending ipfw continue rule"));
6907         ctx->ipfw_cont_rule = rule;
6908         ip_input(m);
6909
6910         /* May not be cleared, if ipfw was unload/disabled. */
6911         ctx->ipfw_cont_rule = NULL;
6912
6913         /*
6914          * This rule is no longer used; decrement its cross_refs,
6915          * so this rule can be deleted.
6916          */
6917         rule->cross_refs--;
6918 }
6919
6920 static void
6921 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6922 {
6923         struct netmsg_genpkt *nm;
6924
6925         KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6926
6927         /*
6928          * NOTE:
6929          * Bump cross_refs to prevent this rule and its siblings
6930          * from being deleted, while this mbuf is inflight.  The
6931          * cross_refs of the sibling rule on the target cpu will
6932          * be decremented, once this mbuf is going to be filtered
6933          * on the target cpu.
6934          */
6935         rule->cross_refs++;
6936         m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6937
6938         nm = &m->m_hdr.mh_genmsg;
6939         netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6940             ipfw_ip_input_dispatch);
6941         nm->m = m;
6942         nm->arg1 = rule->cross_rules[cpuid];
6943         netisr_sendmsg(&nm->base, cpuid);
6944 }
6945
6946 static void
6947 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6948 {
6949
6950         args->flags = 0;
6951         args->rule = NULL;
6952         args->xlat = NULL;
6953
6954         if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6955                 struct m_tag *mtag;
6956
6957                 /* Extract info from dummynet tag */
6958                 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6959                 KKASSERT(mtag != NULL);
6960                 args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6961                 KKASSERT(args->rule != NULL);
6962
6963                 m_tag_delete(m, mtag);
6964                 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6965         } else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6966                 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6967
6968                 KKASSERT(ctx->ipfw_cont_rule != NULL);
6969                 args->rule = ctx->ipfw_cont_rule;
6970                 ctx->ipfw_cont_rule = NULL;
6971
6972                 if (ctx->ipfw_cont_xlat != NULL) {
6973                         args->xlat = ctx->ipfw_cont_xlat;
6974                         ctx->ipfw_cont_xlat = NULL;
6975                         if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6976                                 args->flags |= IP_FWARG_F_XLATINS;
6977                                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6978                         }
6979                         if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6980                                 args->flags |= IP_FWARG_F_XLATFWD;
6981                                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6982                         }
6983                 }
6984                 KKASSERT((m->m_pkthdr.fw_flags &
6985                     (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6986
6987                 args->flags |= IP_FWARG_F_CONT;
6988                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6989         }
6990
6991         args->eh = NULL;
6992         args->oif = oif;
6993         args->m = m;
6994 }
6995
6996 static int
6997 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6998 {
6999         struct ip_fw_args args;
7000         struct mbuf *m = *m0;
7001         int tee = 0, error = 0, ret;
7002
7003         ipfw_init_args(&args, m, NULL);
7004
7005         ret = ipfw_chk(&args);
7006         m = args.m;
7007         if (m == NULL) {
7008                 if (ret != IP_FW_REDISPATCH)
7009                         error = EACCES;
7010                 goto back;
7011         }
7012
7013         switch (ret) {
7014         case IP_FW_PASS:
7015                 break;
7016
7017         case IP_FW_DENY:
7018                 m_freem(m);
7019                 m = NULL;
7020                 error = EACCES;
7021                 break;
7022
7023         case IP_FW_DUMMYNET:
7024                 /* Send packet to the appropriate pipe */
7025                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
7026                 break;
7027
7028         case IP_FW_TEE:
7029                 tee = 1;
7030                 /* FALL THROUGH */
7031
7032         case IP_FW_DIVERT:
7033                 /*
7034                  * Must clear bridge tag when changing
7035                  */
7036                 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
7037                 if (ip_divert_p != NULL) {
7038                         m = ip_divert_p(m, tee, 1);
7039                 } else {
7040                         m_freem(m);
7041                         m = NULL;
7042                         /* not sure this is the right error msg */
7043                         error = EACCES;
7044                 }
7045                 break;
7046
7047         default:
7048                 panic("unknown ipfw return value: %d", ret);
7049         }
7050 back:
7051         *m0 = m;
7052         return error;
7053 }
7054
7055 static int
7056 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7057 {
7058         struct ip_fw_args args;
7059         struct mbuf *m = *m0;
7060         int tee = 0, error = 0, ret;
7061
7062         ipfw_init_args(&args, m, ifp);
7063
7064         ret = ipfw_chk(&args);
7065         m = args.m;
7066         if (m == NULL) {
7067                 if (ret != IP_FW_REDISPATCH)
7068                         error = EACCES;
7069                 goto back;
7070         }
7071
7072         switch (ret) {
7073         case IP_FW_PASS:
7074                 break;
7075
7076         case IP_FW_DENY:
7077                 m_freem(m);
7078                 m = NULL;
7079                 error = EACCES;
7080                 break;
7081
7082         case IP_FW_DUMMYNET:
7083                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7084                 break;
7085
7086         case IP_FW_TEE:
7087                 tee = 1;
7088                 /* FALL THROUGH */
7089
7090         case IP_FW_DIVERT:
7091                 if (ip_divert_p != NULL) {
7092                         m = ip_divert_p(m, tee, 0);
7093                 } else {
7094                         m_freem(m);
7095                         m = NULL;
7096                         /* not sure this is the right error msg */
7097                         error = EACCES;
7098                 }
7099                 break;
7100
7101         default:
7102                 panic("unknown ipfw return value: %d", ret);
7103         }
7104 back:
7105         *m0 = m;
7106         return error;
7107 }
7108
7109 static void
7110 ipfw_hook(void)
7111 {
7112         struct pfil_head *pfh;
7113
7114         ASSERT_NETISR0;
7115
7116         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7117         if (pfh == NULL)
7118                 return;
7119
7120         pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7121         pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7122 }
7123
7124 static void
7125 ipfw_dehook(void)
7126 {
7127         struct pfil_head *pfh;
7128
7129         ASSERT_NETISR0;
7130
7131         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7132         if (pfh == NULL)
7133                 return;
7134
7135         pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7136         pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7137 }
7138
7139 static int
7140 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7141 {
7142         int dyn_cnt;
7143
7144         dyn_cnt = ipfw_state_cntcoll();
7145         dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7146
7147         return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7148 }
7149
7150 static int
7151 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7152 {
7153         int state_cnt;
7154
7155         state_cnt = ipfw_state_cntcoll();
7156         return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7157 }
7158
7159 static int
7160 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7161 {
7162         int state_max, error;
7163
7164         state_max = ipfw_state_max;
7165         error = sysctl_handle_int(oidp, &state_max, 0, req);
7166         if (error || req->newptr == NULL)
7167                 return (error);
7168
7169         if (state_max < 1)
7170                 return (EINVAL);
7171
7172         ipfw_state_max_set(state_max);
7173         return (0);
7174 }
7175
7176 static int
7177 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7178 {
7179         int dyn_max, error;
7180
7181         dyn_max = ipfw_state_max + ipfw_track_max;
7182
7183         error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7184         if (error || req->newptr == NULL)
7185                 return (error);
7186
7187         if (dyn_max < 2)
7188                 return (EINVAL);
7189
7190         ipfw_state_max_set(dyn_max / 2);
7191         ipfw_track_max = dyn_max / 2;
7192         return (0);
7193 }
7194
7195 static void
7196 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7197 {
7198         int enable = nmsg->lmsg.u.ms_result;
7199
7200         ASSERT_NETISR0;
7201
7202         if (fw_enable == enable)
7203                 goto reply;
7204
7205         fw_enable = enable;
7206         if (fw_enable)
7207                 ipfw_hook();
7208         else
7209                 ipfw_dehook();
7210 reply:
7211         netisr_replymsg(&nmsg->base, 0);
7212 }
7213
7214 static int
7215 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7216 {
7217         struct netmsg_base nmsg;
7218         int enable, error;
7219
7220         enable = fw_enable;
7221         error = sysctl_handle_int(oidp, &enable, 0, req);
7222         if (error || req->newptr == NULL)
7223                 return error;
7224
7225         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7226             ipfw_sysctl_enable_dispatch);
7227         nmsg.lmsg.u.ms_result = enable;
7228
7229         return netisr_domsg(&nmsg, 0);
7230 }
7231
7232 static int
7233 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7234 {
7235         return sysctl_int_range(oidp, arg1, arg2, req,
7236                IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7237 }
7238
7239 static int
7240 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7241 {
7242
7243         return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7244 }
7245
7246 static int
7247 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7248 {
7249         u_long stat = 0;
7250         int cpu, error;
7251
7252         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7253                 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7254
7255         error = sysctl_handle_long(oidp, &stat, 0, req);
7256         if (error || req->newptr == NULL)
7257                 return (error);
7258
7259         /* Zero out this stat. */
7260         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7261                 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7262         return (0);
7263 }
7264
7265 static void
7266 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7267 {
7268         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7269         struct ipfw_context *ctx;
7270         struct ip_fw *def_rule;
7271
7272         ASSERT_NETISR_NCPUS(mycpuid);
7273
7274         ctx = kmalloc(__offsetof(struct ipfw_context,
7275             ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7276
7277         RB_INIT(&ctx->ipfw_state_tree);
7278         TAILQ_INIT(&ctx->ipfw_state_list);
7279
7280         RB_INIT(&ctx->ipfw_track_tree);
7281         TAILQ_INIT(&ctx->ipfw_track_list);
7282
7283         callout_init_mp(&ctx->ipfw_stateto_ch);
7284         netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7285             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7286         ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7287         netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7288             MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7289
7290         callout_init_mp(&ctx->ipfw_trackto_ch);
7291         netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7292             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7293         netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7294             MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7295
7296         callout_init_mp(&ctx->ipfw_keepalive_ch);
7297         netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7298             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7299         ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7300         netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7301             MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7302
7303         callout_init_mp(&ctx->ipfw_xlatreap_ch);
7304         netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7305             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7306         TAILQ_INIT(&ctx->ipfw_xlatreap);
7307
7308         ipfw_ctx[mycpuid] = ctx;
7309
7310         def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7311
7312         def_rule->act_ofs = 0;
7313         def_rule->rulenum = IPFW_DEFAULT_RULE;
7314         def_rule->cmd_len = 1;
7315         def_rule->set = IPFW_DEFAULT_SET;
7316
7317         def_rule->cmd[0].len = 1;
7318 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7319         def_rule->cmd[0].opcode = O_ACCEPT;
7320 #else
7321         if (filters_default_to_accept)
7322                 def_rule->cmd[0].opcode = O_ACCEPT;
7323         else
7324                 def_rule->cmd[0].opcode = O_DENY;
7325 #endif
7326
7327         def_rule->refcnt = 1;
7328         def_rule->cpuid = mycpuid;
7329
7330         /* Install the default rule */
7331         ctx->ipfw_default_rule = def_rule;
7332         ctx->ipfw_layer3_chain = def_rule;
7333
7334         /* Link rule CPU sibling */
7335         ipfw_link_sibling(fwmsg, def_rule);
7336
7337         /* Statistics only need to be updated once */
7338         if (mycpuid == 0)
7339                 ipfw_inc_static_count(def_rule);
7340
7341         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7342 }
7343
7344 static void
7345 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7346 {
7347
7348         crit_enter();
7349         /* Reply ASAP */
7350         netisr_replymsg(&nmsg->base, 0);
7351         crit_exit();
7352         ipfw_crossref_reap();
7353 }
7354
7355 static void
7356 ipfw_crossref_timeo(void *dummy __unused)
7357 {
7358         struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7359
7360         KKASSERT(mycpuid == 0);
7361
7362         crit_enter();
7363         if (msg->lmsg.ms_flags & MSGF_DONE)
7364                 netisr_sendmsg_oncpu(msg);
7365         crit_exit();
7366 }
7367
7368 static void
7369 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7370 {
7371         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7372         struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7373         struct ip_fw *f;
7374
7375         ASSERT_NETISR_NCPUS(mycpuid);
7376
7377         for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7378                 int l, cmdlen;
7379                 ipfw_insn *cmd;
7380
7381                 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7382                         continue;
7383
7384                 for (l = f->cmd_len, cmd = f->cmd; l > 0;
7385                      l -= cmdlen, cmd += cmdlen) {
7386                         cmdlen = F_LEN(cmd);
7387                         if (cmd->opcode == O_IP_SRC_IFIP ||
7388                             cmd->opcode == O_IP_DST_IFIP) {
7389                                 if (strncmp(ifp->if_xname,
7390                                     ((ipfw_insn_ifip *)cmd)->ifname,
7391                                     IFNAMSIZ) == 0)
7392                                         cmd->arg1 &= ~IPFW_IFIP_VALID;
7393                         }
7394                 }
7395         }
7396         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7397 }
7398
7399 static void
7400 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7401     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7402 {
7403         struct netmsg_base nm;
7404
7405         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7406             ipfw_ifaddr_dispatch);
7407         nm.lmsg.u.ms_resultp = ifp;
7408         netisr_domsg_global(&nm);
7409 }
7410
7411 static void
7412 ipfw_init_dispatch(netmsg_t nmsg)
7413 {
7414         struct netmsg_ipfw fwmsg;
7415         int error = 0, cpu;
7416
7417         ASSERT_NETISR0;
7418
7419         if (IPFW_LOADED) {
7420                 kprintf("IP firewall already loaded\n");
7421                 error = EEXIST;
7422                 goto reply;
7423         }
7424
7425         if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7426                 ipfw_table_max = UINT16_MAX;
7427
7428         /* Initialize global track tree. */
7429         RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7430         IPFW_TRKCNT_TOKINIT;
7431
7432         /* GC for freed crossref rules. */
7433         callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7434         netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7435             MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7436
7437         ipfw_state_max_set(ipfw_state_max);
7438         ipfw_state_headroom = 8 * netisr_ncpus;
7439
7440         bzero(&fwmsg, sizeof(fwmsg));
7441         netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7442             ipfw_ctx_init_dispatch);
7443         netisr_domsg_global(&fwmsg.base);
7444
7445         ip_fw_chk_ptr = ipfw_chk;
7446         ip_fw_ctl_ptr = ipfw_ctl;
7447         ip_fw_dn_io_ptr = ipfw_dummynet_io;
7448
7449         kprintf("ipfw2 initialized, default to %s, logging ",
7450                 ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7451                 O_ACCEPT ? "accept" : "deny");
7452
7453 #ifdef IPFIREWALL_VERBOSE
7454         fw_verbose = 1;
7455 #endif
7456 #ifdef IPFIREWALL_VERBOSE_LIMIT
7457         verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7458 #endif
7459         if (fw_verbose == 0) {
7460                 kprintf("disabled\n");
7461         } else if (verbose_limit == 0) {
7462                 kprintf("unlimited\n");
7463         } else {
7464                 kprintf("limited to %d packets/entry by default\n",
7465                         verbose_limit);
7466         }
7467
7468         ip_fw_loaded = 1;
7469         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7470                 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7471                     ipfw_state_expire_ipifunc, NULL, cpu);
7472                 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7473                     ipfw_track_expire_ipifunc, NULL, cpu);
7474                 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7475                     ipfw_keepalive, NULL, cpu);
7476         }
7477
7478         if (fw_enable)
7479                 ipfw_hook();
7480
7481         ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7482             NULL, EVENTHANDLER_PRI_ANY);
7483         if (ipfw_ifaddr_event == NULL)
7484                 kprintf("ipfw: ifaddr_event register failed\n");
7485
7486 reply:
7487         netisr_replymsg(&nmsg->base, error);
7488 }
7489
7490 static int
7491 ipfw_init(void)
7492 {
7493         struct netmsg_base smsg;
7494
7495         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7496             ipfw_init_dispatch);
7497         return netisr_domsg(&smsg, 0);
7498 }
7499
7500 #ifdef KLD_MODULE
7501
7502 static void
7503 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7504 {
7505         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7506
7507         ASSERT_NETISR_NCPUS(mycpuid);
7508
7509         callout_cancel(&ctx->ipfw_stateto_ch);
7510         callout_cancel(&ctx->ipfw_trackto_ch);
7511         callout_cancel(&ctx->ipfw_keepalive_ch);
7512         callout_cancel(&ctx->ipfw_xlatreap_ch);
7513
7514         crit_enter();
7515         netisr_dropmsg(&ctx->ipfw_stateexp_more);
7516         netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7517         netisr_dropmsg(&ctx->ipfw_trackexp_more);
7518         netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7519         netisr_dropmsg(&ctx->ipfw_keepalive_more);
7520         netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7521         netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7522         crit_exit();
7523
7524         ipfw_table_flushall_oncpu(ctx, 1);
7525
7526         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7527 }
7528
7529 static void
7530 ipfw_fini_dispatch(netmsg_t nmsg)
7531 {
7532         struct netmsg_base nm;
7533         int error = 0, cpu;
7534
7535         ASSERT_NETISR0;
7536
7537         ipfw_crossref_reap();
7538
7539         if (ipfw_gd.ipfw_refcnt != 0) {
7540                 error = EBUSY;
7541                 goto reply;
7542         }
7543
7544         ip_fw_loaded = 0;
7545         ipfw_dehook();
7546
7547         /* Synchronize any inflight state/track expire IPIs. */
7548         lwkt_synchronize_ipiqs("ipfwfini");
7549
7550         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7551             ipfw_ctx_fini_dispatch);
7552         netisr_domsg_global(&nm);
7553
7554         callout_cancel(&ipfw_gd.ipfw_crossref_ch);
7555         crit_enter();
7556         netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7557         crit_exit();
7558
7559         if (ipfw_ifaddr_event != NULL)
7560                 EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7561
7562         ip_fw_chk_ptr = NULL;
7563         ip_fw_ctl_ptr = NULL;
7564         ip_fw_dn_io_ptr = NULL;
7565         ipfw_flush(1 /* kill default rule */);
7566
7567         /* Free pre-cpu context */
7568         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7569                 kfree(ipfw_ctx[cpu], M_IPFW);
7570
7571         kprintf("IP firewall unloaded\n");
7572 reply:
7573         netisr_replymsg(&nmsg->base, error);
7574 }
7575
7576 static void
7577 ipfw_fflush_dispatch(netmsg_t nmsg)
7578 {
7579
7580         ipfw_flush(0 /* keep default rule */);
7581         ipfw_crossref_reap();
7582         netisr_replymsg(&nmsg->base, 0);
7583 }
7584
7585 static int
7586 ipfw_fini(void)
7587 {
7588         struct netmsg_base smsg;
7589         int i = 0;
7590
7591         for (;;) {
7592                 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7593                     ipfw_fflush_dispatch);
7594                 netisr_domsg(&smsg, 0);
7595
7596                 if (ipfw_gd.ipfw_refcnt == 0)
7597                         break;
7598                 kprintf("ipfw: flush pending %d\n", ++i);
7599                 tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7600         }
7601
7602         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7603             ipfw_fini_dispatch);
7604         return netisr_domsg(&smsg, 0);
7605 }
7606
7607 #endif  /* KLD_MODULE */
7608
7609 static int
7610 ipfw_modevent(module_t mod, int type, void *unused)
7611 {
7612         int err = 0;
7613
7614         switch (type) {
7615         case MOD_LOAD:
7616                 err = ipfw_init();
7617                 break;
7618
7619         case MOD_UNLOAD:
7620 #ifndef KLD_MODULE
7621                 kprintf("ipfw statically compiled, cannot unload\n");
7622                 err = EBUSY;
7623 #else
7624                 err = ipfw_fini();
7625 #endif
7626                 break;
7627         default:
7628                 break;
7629         }
7630         return err;
7631 }
7632
7633 static moduledata_t ipfwmod = {
7634         "ipfw",
7635         ipfw_modevent,
7636         0
7637 };
7638 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7639 MODULE_VERSION(ipfw, 1);