Merge branch 'vendor/PAM_PASSWDQC'
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <net/netmsg2.h>
61
62 #include <netinet/in.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in_var.h>
65 #include <netinet/in_pcb.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_icmp.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_timer.h>
72 #include <netinet/tcp_var.h>
73 #include <netinet/tcpip.h>
74 #include <netinet/udp.h>
75 #include <netinet/udp_var.h>
76 #include <netinet/ip_divert.h>
77 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
78
79 #include <net/ipfw/ip_fw2.h>
80
/*
 * Debug logging macro: prints via kprintf() when compiled with
 * IPFIREWALL_DEBUG and the fw_debug sysctl is positive; compiles to
 * nothing otherwise.  NOTE: callers must pass at least one argument
 * after fmt (plain __VA_ARGS__, no ## extension used here).
 */
#ifdef IPFIREWALL_DEBUG
#define DPRINTF(fmt, ...) \
do { \
        if (fw_debug > 0) \
                kprintf(fmt, __VA_ARGS__); \
} while (0)
#else
#define DPRINTF(fmt, ...)       ((void)0)
#endif
90
91 /*
92  * Description about per-CPU rule duplication:
93  *
94  * Module loading/unloading and all ioctl operations are serialized
95  * by netisr0, so we don't have any ordering or locking problems.
96  *
97  * Following graph shows how operation on per-CPU rule list is
98  * performed [2 CPU case]:
99  *
100  *   CPU0                 CPU1
101  *
102  * netisr0 <------------------------------------+
103  *  domsg                                       |
104  *    :                                         |
105  *    :(delete/add...)                          |
106  *    :                                         |
107  *    :         netmsg                          | netmsg
108  *  forwardmsg---------->netisr1                |
109  *                          :                   |
110  *                          :(delete/add...)    |
111  *                          :                   |
112  *                          :                   |
113  *                        replymsg--------------+
114  *
115  *
116  *
117  * Rule structure [2 CPU case]
118  *
119  *    CPU0               CPU1
120  *
121  * layer3_chain       layer3_chain
122  *     |                  |
123  *     V                  V
124  * +-------+ sibling  +-------+ sibling
125  * | rule1 |--------->| rule1 |--------->NULL
126  * +-------+          +-------+
127  *     |                  |
128  *     |next              |next
129  *     V                  V
130  * +-------+ sibling  +-------+ sibling
131  * | rule2 |--------->| rule2 |--------->NULL
132  * +-------+          +-------+
133  *
134  * ip_fw.sibling:
135  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
136  *    iterate layer3_chain in netisr0; the current rule's duplication
137  *    to the other CPUs could safely be read-only accessed through
138  *    ip_fw.sibling.
139  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
140  *    a) In netisr0 rule3 is determined to be inserted between rule1
141  *       and rule2.  To make this decision we need to iterate the
142  *       layer3_chain in netisr0.  The netmsg, which is used to insert
143  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
144  *       in netisr0 as next_rule.
145  *    b) After the insertion in netisr0 is done, we will move on to
146  *       netisr1.  But instead of relocating the rule3's position in
147  *       netisr1 by iterating the layer3_chain in netisr1, we set the
148  *       netmsg's prev_rule to rule1->sibling and next_rule to
149  *       rule2->sibling before the netmsg is forwarded to netisr1 from
150  *       netisr0.
151  */
152
153 /*
154  * Description of states and tracks.
155  *
156  * Both states and tracks are stored in per-cpu RB trees instead of
157  * per-cpu hash tables to avoid the worst case hash degeneration.
158  *
159  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
160  * measured in seconds and depending on the flags.
161  *
162  * When a packet is received, its address fields are first masked with
163  * the mask defined for the rule, then matched against the entries in
164  * the per-cpu state RB tree.  States are generated by 'keep-state'
165  * and 'limit' options.
166  *
167  * The max number of states is ipfw_state_max.  When we reach the
168  * maximum number of states we do not create anymore.  This is done to
169  * avoid consuming too much memory, but also too much time when
170  * searching on each packet.
171  *
172  * Each state holds a pointer to the parent ipfw rule of the current
173  * CPU so we know what action to perform.  States are removed when the
174  * parent rule is deleted.  XXX we should make them survive.
175  *
176  * There are some limitations with states -- we do not obey the
177  * 'randomized match', and we do not do multiple passes through the
178  * firewall.  XXX check the latter!!!
179  *
180  * States grow independently on each CPU, e.g. 2 CPU case:
181  *
182  *        CPU0                     CPU1
183  * ...................      ...................
184  * :  state RB tree  :      :  state RB tree  :
185  * :                 :      :                 :
186  * : state1   state2 :      :      state3     :
187  * :     |    |      :      :        |        :
188  * :.....|....|......:      :........|........:
189  *       |    |                      |
190  *       |    |                      |st_rule
191  *       |    |                      |
192  *       V    V                      V
193  *     +-------+                 +-------+
194  *     | rule1 |                 | rule1 |
195  *     +-------+                 +-------+
196  *
197  * Tracks are used to enforce limits on the number of sessions.  Tracks
198  * are generated by 'limit' option.
199  *
200  * The max number of tracks is ipfw_track_max.  When we reach the
201  * maximum number of tracks we do not create anymore.  This is done to
202  * avoid consuming too much memory.
203  *
204  * Tracks are organized into two layers, track counter RB tree is
205  * shared between CPUs, track RB tree is per-cpu.  States generated by
206  * 'limit' option are linked to the track in addition to the per-cpu
207  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
208  *
209  *             ..............................
210  *             :    track counter RB tree   :
211  *             :                            :
212  *             :        +-----------+       :
213  *             :        |  trkcnt1  |       :
214  *             :        |           |       :
215  *             :      +--->counter<----+    :
216  *             :      | |           |  |    :
217  *             :      | +-----------+  |    :
218  *             :......|................|....:
219  *                    |                |
220  *        CPU0        |                |         CPU1
221  * .................  |t_count         |  .................
222  * : track RB tree :  |                |  : track RB tree :
223  * :               :  |                |  :               :
224  * : +-->track1-------+                +--------track2    :
225  * : |     A       :                      :               :
226  * : |     |       :                      :               :
227  * :.|.....|.......:                      :...............:
228  *   |     +----------------+
229  *   | .................... |
230  *   | :   state RB tree  : |st_track
231  *   | :                  : |
232  *   +---state1    state2---+
233  *     :     |       |    :
234  *     :.....|.......|....:
235  *           |       |
236  *           |       |st_rule
237  *           V       V
238  *         +----------+
239  *         |   rule1  |
240  *         +----------+
241  */
242
/* Bounds and default for the rule-number auto-increment step (sysctl). */
#define IPFW_AUTOINC_STEP_MIN   1
#define IPFW_AUTOINC_STEP_MAX   1000
#define IPFW_AUTOINC_STEP_DEF   100

/* Default max # of lookup tables (tunable net.inet.ip.fw.table_max). */
#define IPFW_TABLE_MAX_DEF      64

#define IPFW_DEFAULT_RULE       65535   /* rulenum for the default rule */
#define IPFW_DEFAULT_SET        31      /* set number for the default rule */

/* Result of matching a packet against a state entry. */
#define MATCH_REVERSE           0
#define MATCH_FORWARD           1
#define MATCH_NONE              2
#define MATCH_UNKNOWN           3

/* Wrap-safe "a <= b" for time values (relies on signed subtraction). */
#define TIME_LEQ(a, b)          ((a) - (b) <= 0)

/*
 * TCP flags tracked per state.  The low byte holds flags seen in one
 * direction, the high byte the other direction (see BOTH_* below).
 */
#define IPFW_STATE_TCPFLAGS     (TH_SYN | TH_FIN | TH_RST)
#define IPFW_STATE_TCPSTATES    (IPFW_STATE_TCPFLAGS |  \
                                 (IPFW_STATE_TCPFLAGS << 8))

/* Flag seen in both directions (low byte and high byte set). */
#define BOTH_SYN                (TH_SYN | (TH_SYN << 8))
#define BOTH_FIN                (TH_FIN | (TH_FIN << 8))
#define BOTH_RST                (TH_RST | (TH_RST << 8))
/* TH_ACK here means FIN was ACKed. */
#define BOTH_FINACK             (TH_ACK | (TH_ACK << 8))

/* TCP state is closed: RST seen either way, or both FINs ACKed. */
#define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP &&        \
                                 (((s)->st_state & BOTH_RST) ||         \
                                  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))

/* Sentinel st_type for list-scan anchor entries; reuses O_NOP. */
#define O_ANCHOR                O_NOP

#define IPFW_ISXLAT(type)       ((type) == O_REDIRECT)
#define IPFW_XLAT_INVALID(s)    (IPFW_ISXLAT((s)->st_type) &&   \
                                 ((struct ipfw_xlat *)(s))->xlat_invalid)

/* mbuf private flags used to mark translated packets. */
#define IPFW_MBUF_XLATINS       FW_MBUF_PRIVATE1
#define IPFW_MBUF_XLATFWD       FW_MBUF_PRIVATE2

/* Flags for the translation paths. */
#define IPFW_XLATE_INSERT       0x0001
#define IPFW_XLATE_FORWARD      0x0002
#define IPFW_XLATE_OUTPUT       0x0004
285
/*
 * Netmsg carrying a rule-addition request from netisr0 to the other
 * netisrs.  prev_rule/next_rule are rewritten to their per-CPU
 * siblings before each forward (see the rule duplication description
 * above); sibling links the newly created per-CPU copies together.
 */
struct netmsg_ipfw {
        struct netmsg_base      base;
        const struct ipfw_ioc_rule *ioc_rule;
        struct ip_fw            *next_rule;
        struct ip_fw            *prev_rule;
        struct ip_fw            *sibling;
        uint32_t                rule_flags;
        struct ip_fw            **cross_rules;
};

/* Netmsg for rule deletion and set move/disable operations. */
struct netmsg_del {
        struct netmsg_base      base;
        struct ip_fw            *start_rule;
        struct ip_fw            *prev_rule;
        uint16_t                rulenum;
        uint8_t                 from_set;
        uint8_t                 to_set;
};

/* Netmsg to zero (or, with log_only, only reset logging of) counters. */
struct netmsg_zent {
        struct netmsg_base      base;
        struct ip_fw            *start_rule;
        uint16_t                rulenum;
        uint16_t                log_only;
};

/* Netmsg collecting per-CPU states into an ioctl output buffer. */
struct netmsg_cpstate {
        struct netmsg_base      base;
        struct ipfw_ioc_state   *ioc_state;
        int                     state_cntmax;
        int                     state_cnt;
};

/* Netmsg to add a table entry on every CPU; sibling chains the copies. */
struct netmsg_tblent {
        struct netmsg_base      base;
        struct sockaddr         *key;
        struct sockaddr         *netmask;
        struct ipfw_tblent      *sibling;
        int                     tableid;
};

/* Netmsg to flush (and optionally destroy) a table on every CPU. */
struct netmsg_tblflush {
        struct netmsg_base      base;
        int                     tableid;
        int                     destroy;
};

/* Netmsg to expire table entries not used since 'expire'. */
struct netmsg_tblexp {
        struct netmsg_base      base;
        time_t                  expire;
        int                     tableid;
        int                     cnt;
        int                     expcnt;
        struct radix_node_head  *rnh;
};

/* Cursor used while copying table entries out to userland. */
struct ipfw_table_cp {
        struct ipfw_ioc_tblent  *te;
        int                     te_idx;
        int                     te_cnt;
};
347
/*
 * Per-packet scratch data extracted from the packet headers once and
 * reused while walking the rule chain (NOTE(review): presumably filled
 * in by the packet inspection path, e.g. ipfw_chk() — confirm there).
 */
struct ip_fw_local {
        /*
         * offset       The offset of a fragment. offset != 0 means that
         *      we have a fragment at this offset of an IPv4 packet.
         *      offset == 0 means that (if this is an IPv4 packet)
         *      this is the first or only fragment.
         */
        u_short                 offset;

        /*
         * Local copies of addresses. They are only valid if we have
         * an IP packet.
         *
         * proto        The protocol. Set to 0 for non-ip packets,
         *      or to the protocol read from the packet otherwise.
         *      proto != 0 means that we have an IPv4 packet.
         *
         * src_port, dst_port   port numbers, in HOST format. Only
         *      valid for TCP and UDP packets.
         *
         * src_ip, dst_ip       ip addresses, in NETWORK format.
         *      Only valid for IPv4 packets.
         */
        uint8_t                 proto;
        uint16_t                src_port;       /* NOTE: host format    */
        uint16_t                dst_port;       /* NOTE: host format    */
        struct in_addr          src_ip;         /* NOTE: network format */
        struct in_addr          dst_ip;         /* NOTE: network format */
        uint16_t                ip_len;         /* IP packet length; byte order not shown here — TODO confirm */
        struct tcphdr           *tcp;           /* TCP header; NOTE(review): presumably NULL for non-TCP — confirm */
};
379
/* Address pair of a flow; both stored in host byte order. */
struct ipfw_addrs {
        uint32_t                addr1;  /* host byte order */
        uint32_t                addr2;  /* host byte order */
};

/* Port pair of a flow; both stored in host byte order. */
struct ipfw_ports {
        uint16_t                port1;  /* host byte order */
        uint16_t                port2;  /* host byte order */
};

/*
 * Canonical flow key used for both states and tracks.  The unions let
 * address/port pairs be compared as single 64/32-bit integers; 'swap'
 * records whether addrs/ports were swapped during canonicalization so
 * the match direction can be recovered.
 */
struct ipfw_key {
        union {
                struct ipfw_addrs addrs;
                uint64_t        value;
        } addr_u;
        union {
                struct ipfw_ports ports;
                uint32_t        value;
        } port_u;
        uint8_t                 proto;
        uint8_t                 swap;   /* IPFW_KEY_SWAP_ */
        uint16_t                rsvd2;
};

#define IPFW_KEY_SWAP_ADDRS     0x1
#define IPFW_KEY_SWAP_PORTS     0x2
#define IPFW_KEY_SWAP_ALL       (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
407
/*
 * Track counter: the CPU-shared half of the two-layer track structure
 * (see "Description of states and tracks" above).  Holds the session
 * count enforced by the 'limit' option; per-CPU ipfw_track entries
 * point at it.  __cachealign limits false sharing between CPUs.
 */
struct ipfw_trkcnt {
        RB_ENTRY(ipfw_trkcnt)   tc_rblink;
        struct ipfw_key         tc_key;
        uintptr_t               tc_ruleid;
        int                     tc_refs;        /* # of tracks referencing this */
        int                     tc_count;       /* current session count */
        time_t                  tc_expire;      /* userland get-only */
        uint16_t                tc_rulenum;     /* userland get-only */
} __cachealign;

/* Shorthand accessors for the embedded flow key. */
#define tc_addrs                tc_key.addr_u.value
#define tc_ports                tc_key.port_u.value
#define tc_proto                tc_key.proto
#define tc_saddr                tc_key.addr_u.addrs.addr1
#define tc_daddr                tc_key.addr_u.addrs.addr2
#define tc_sport                tc_key.port_u.ports.port1
#define tc_dport                tc_key.port_u.ports.port2

RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
427
struct ipfw_state;

/*
 * Per-CPU track for the 'limit' option.  t_count points into the
 * shared ipfw_trkcnt (t_trkcnt), and t_state_list links the states
 * this track spawned, mainly to ease their expiration.
 */
struct ipfw_track {
        RB_ENTRY(ipfw_track)    t_rblink;
        struct ipfw_key         t_key;
        struct ip_fw            *t_rule;
        time_t                  t_lastexp;
        LIST_HEAD(, ipfw_state) t_state_list;
        time_t                  t_expire;
        volatile int            *t_count;       /* -> t_trkcnt's counter */
        struct ipfw_trkcnt      *t_trkcnt;
        TAILQ_ENTRY(ipfw_track) t_link;
};

/* Shorthand accessors for the embedded flow key. */
#define t_addrs                 t_key.addr_u.value
#define t_ports                 t_key.port_u.value
#define t_proto                 t_key.proto
#define t_saddr                 t_key.addr_u.addrs.addr1
#define t_daddr                 t_key.addr_u.addrs.addr2
#define t_sport                 t_key.port_u.ports.port1
#define t_dport                 t_key.port_u.ports.port2

RB_HEAD(ipfw_track_tree, ipfw_track);
TAILQ_HEAD(ipfw_track_list, ipfw_track);
452
/*
 * Per-CPU dynamic state, created by 'keep-state'/'limit'/redirect
 * rules.  Keyed by the canonicalized flow key, stored in the per-CPU
 * state RB tree and on the LRU-style state list.
 */
struct ipfw_state {
        RB_ENTRY(ipfw_state)    st_rblink;
        struct ipfw_key         st_key;

        time_t                  st_expire;      /* expire time */
        struct ip_fw            *st_rule;       /* parent rule on this CPU */

        uint64_t                st_pcnt;        /* packets */
        uint64_t                st_bcnt;        /* bytes */

        /*
         * st_state:
         * State of this rule, typically a combination of TCP flags.
         *
         * st_ack_fwd/st_ack_rev:
         * Most recent ACKs in forward and reverse direction.  They
         * are used to generate keepalives.
         */
        uint32_t                st_state;
        uint32_t                st_ack_fwd;     /* host byte order */
        uint32_t                st_seq_fwd;     /* host byte order */
        uint32_t                st_ack_rev;     /* host byte order */
        uint32_t                st_seq_rev;     /* host byte order */

        uint16_t                st_flags;       /* IPFW_STATE_F_ */
        uint16_t                st_type;        /* KEEP_STATE/LIMIT/RDR */
        struct ipfw_track       *st_track;      /* non-NULL for 'limit' states */

        LIST_ENTRY(ipfw_state)  st_trklink;     /* on st_track->t_state_list */
        TAILQ_ENTRY(ipfw_state) st_link;
};

/* Shorthand accessors for the embedded flow key. */
#define st_addrs                st_key.addr_u.value
#define st_ports                st_key.port_u.value
#define st_proto                st_key.proto
#define st_swap                 st_key.swap

/* st_flags bits; F_ACKFWD..F_SEQREV mark which seq/ack fields are valid. */
#define IPFW_STATE_F_ACKFWD     0x0001
#define IPFW_STATE_F_SEQFWD     0x0002
#define IPFW_STATE_F_ACKREV     0x0004
#define IPFW_STATE_F_SEQREV     0x0008
#define IPFW_STATE_F_XLATSRC    0x0010
#define IPFW_STATE_F_XLATSLAVE  0x0020
#define IPFW_STATE_F_LINKED     0x0040

/* Skip scan anchors and xlat slave states during list scans. */
#define IPFW_STATE_SCANSKIP(s)  ((s)->st_type == O_ANCHOR ||    \
                                 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))

/* Expired or being deleted. */
#define IPFW_STATE_ISDEAD(s)    (TIME_LEQ((s)->st_expire, time_uptime) || \
                                 IPFW_XLAT_INVALID((s)))

TAILQ_HEAD(ipfw_state_list, ipfw_state);
RB_HEAD(ipfw_state_tree, ipfw_state);
507
/*
 * Translation (redirect) state.  Embeds an ipfw_state as its first
 * field so it can live in the generic state tree/list; pairs with a
 * slave state (xlat_pair) possibly on another CPU (xlat_pcpu).
 */
struct ipfw_xlat {
        struct ipfw_state       xlat_st;        /* MUST be the first field */
        uint32_t                xlat_addr;      /* network byte order */
        uint16_t                xlat_port;      /* network byte order */
        uint16_t                xlat_dir;       /* MATCH_ */
        struct ifnet            *xlat_ifp;      /* matching ifnet */
        struct ipfw_xlat        *xlat_pair;     /* paired state */
        int                     xlat_pcpu;      /* paired cpu */
        volatile int            xlat_invalid;   /* invalid, but not dtor yet */
        volatile uint64_t       xlat_crefs;     /* cross references */
        struct netmsg_base      xlat_freenm;    /* for remote free */
};

/* Shorthand accessors into the embedded ipfw_state. */
#define xlat_type               xlat_st.st_type
#define xlat_flags              xlat_st.st_flags
#define xlat_rule               xlat_st.st_rule
#define xlat_bcnt               xlat_st.st_bcnt
#define xlat_pcnt               xlat_st.st_pcnt
526
/*
 * Lookup-table entry, stored in a per-CPU radix tree; te_sibling
 * chains the per-CPU copies of the same logical entry together.
 */
struct ipfw_tblent {
        struct radix_node       te_nodes[2];
        struct sockaddr_in      te_key;
        u_long                  te_use;         /* hit counter */
        time_t                  te_lastuse;
        struct ipfw_tblent      *te_sibling;    /* copy on the next CPU */
        volatile int            te_expired;
};
535
/*
 * Per-CPU ipfw context: rule chain, state/track trees, expiration
 * machinery and statistics.  One instance per CPU (see ipfw_ctx[]).
 */
struct ipfw_context {
        struct ip_fw            *ipfw_layer3_chain;     /* rules for layer3 */
        struct ip_fw            *ipfw_default_rule;     /* default rule */
        uint64_t                ipfw_norule_counter;    /* ipfw_log(NULL) stat*/

        /*
         * ipfw_set_disable contains one bit per set value (0..31).
         * If the bit is set, all rules with the corresponding set
         * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
         * default rule and CANNOT be disabled.
         */
        uint32_t                ipfw_set_disable;

        uint8_t                 ipfw_flags;     /* IPFW_FLAG_ */

        /* Continuation point for restarting a rule walk. */
        struct ip_fw            *ipfw_cont_rule;
        struct ipfw_xlat        *ipfw_cont_xlat;

        struct ipfw_state_tree  ipfw_state_tree;
        struct ipfw_state_list  ipfw_state_list;
        int                     ipfw_state_loosecnt;
        int                     ipfw_state_cnt;

        /* Scratch keys for RB tree lookups; never linked into trees. */
        union {
                struct ipfw_state state;
                struct ipfw_track track;
                struct ipfw_trkcnt trkcnt;
        } ipfw_tmpkey;

        struct ipfw_track_tree  ipfw_track_tree;
        struct ipfw_track_list  ipfw_track_list;
        struct ipfw_trkcnt      *ipfw_trkcnt_spare;

        /* State expiration callout and its netmsgs/scan anchor. */
        struct callout          ipfw_stateto_ch;
        time_t                  ipfw_state_lastexp;
        struct netmsg_base      ipfw_stateexp_nm;
        struct netmsg_base      ipfw_stateexp_more;
        struct ipfw_state       ipfw_stateexp_anch;

        /* Track expiration callout and its netmsgs/scan anchor. */
        struct callout          ipfw_trackto_ch;
        time_t                  ipfw_track_lastexp;
        struct netmsg_base      ipfw_trackexp_nm;
        struct netmsg_base      ipfw_trackexp_more;
        struct ipfw_track       ipfw_trackexp_anch;

        /* Keepalive callout and its netmsgs/scan anchor. */
        struct callout          ipfw_keepalive_ch;
        struct netmsg_base      ipfw_keepalive_nm;
        struct netmsg_base      ipfw_keepalive_more;
        struct ipfw_state       ipfw_keepalive_anch;

        /* Deferred reaping of translation states. */
        struct callout          ipfw_xlatreap_ch;
        struct netmsg_base      ipfw_xlatreap_nm;
        struct ipfw_state_list  ipfw_xlatreap;

        /*
         * Statistics
         */
        u_long                  ipfw_sts_reap;
        u_long                  ipfw_sts_reapfailed;
        u_long                  ipfw_sts_overflow;
        u_long                  ipfw_sts_nomem;
        u_long                  ipfw_sts_tcprecycled;

        u_long                  ipfw_tks_nomem;
        u_long                  ipfw_tks_reap;
        u_long                  ipfw_tks_reapfailed;
        u_long                  ipfw_tks_overflow;
        u_long                  ipfw_tks_cntnomem;

        u_long                  ipfw_frags;
        u_long                  ipfw_defraged;
        u_long                  ipfw_defrag_remote;

        u_long                  ipfw_xlated;
        u_long                  ipfw_xlate_split;
        u_long                  ipfw_xlate_conflicts;
        u_long                  ipfw_xlate_cresolved;

        /* Last field */
        struct radix_node_head  *ipfw_tables[];
};

/* ipfw_flags bits. */
#define IPFW_FLAG_KEEPALIVE     0x01
#define IPFW_FLAG_STATEEXP      0x02
#define IPFW_FLAG_TRACKEXP      0x04
#define IPFW_FLAG_STATEREAP     0x08
#define IPFW_FLAG_TRACKREAP     0x10

/* Shorthand accessors for the lookup scratch keys. */
#define ipfw_state_tmpkey       ipfw_tmpkey.state
#define ipfw_track_tmpkey       ipfw_tmpkey.track
#define ipfw_trkcnt_tmpkey      ipfw_tmpkey.trkcnt
627
/*
 * CPU-shared global data: the shared track counter tree (protected by
 * its lwkt token) and the crossref free list handled in netisr0.
 * Fields are cacheline-aligned to limit false sharing.
 */
struct ipfw_global {
        int                     ipfw_state_loosecnt;    /* cache aligned */
        time_t                  ipfw_state_globexp __cachealign;

        struct lwkt_token       ipfw_trkcnt_token __cachealign;
        struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
        int                     ipfw_trkcnt_cnt;
        time_t                  ipfw_track_globexp;

        /* Accessed in netisr0. */
        struct ip_fw            *ipfw_crossref_free __cachealign;
        struct callout          ipfw_crossref_ch;
        struct netmsg_base      ipfw_crossref_nm;

#ifdef KLD_MODULE
        /*
         * Module can not be unloaded, if there are references to
         * certains rules of ipfw(4), e.g. dummynet(4)
         */
        int                     ipfw_refcnt __cachealign;
#endif
} __cachealign;
650
651 static struct ipfw_context      *ipfw_ctx[MAXCPU];
652
653 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
654
655 /*
656  * Following two global variables are accessed and updated only
657  * in netisr0.
658  */
659 static uint32_t static_count;   /* # of static rules */
660 static uint32_t static_ioc_len; /* bytes of static rules */
661
662 /*
663  * If 1, then ipfw static rules are being flushed,
664  * ipfw_chk() will skip to the default rule.
665  */
666 static int ipfw_flushing;
667
668 static int fw_verbose;
669 static int verbose_limit;
670
671 static int fw_debug;
672 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
673
674 static int      ipfw_table_max = IPFW_TABLE_MAX_DEF;
675
676 static int      ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
677 static int      ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
678
679 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
680
681 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
682 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
683     "Firewall statistics");
684
685 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
686     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
687 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
688     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
689     "Rule number autincrement step");
690 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
691     &fw_one_pass, 0,
692     "Only do a single pass through ipfw when using dummynet(4)");
693 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
694     &fw_debug, 0, "Enable printing of debug ip_fw statements");
695 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
696     &fw_verbose, 0, "Log matches to ipfw rules");
697 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
698     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
699 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
700     &ipfw_table_max, 0, "Max # of tables");
701
702 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
703 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
704 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
705 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
706 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
707 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
708
/*
 * Timeouts for various events in handing states.
 *
 * NOTE:
 * 1 == 0~1 second.
 * 2 == 1~2 second(s).
 *
 * We use 2 seconds for FIN lifetime, so that the states will not be
 * ripped prematurely.
 */
static uint32_t dyn_ack_lifetime = 300;
static uint32_t dyn_syn_lifetime = 20;
static uint32_t dyn_finwait_lifetime = 20;
static uint32_t dyn_fin_lifetime = 2;
static uint32_t dyn_rst_lifetime = 2;
static uint32_t dyn_udp_lifetime = 10;
static uint32_t dyn_short_lifetime = 5; /* used by tracks too */

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 */
static uint32_t dyn_keepalive_interval = 20;
static uint32_t dyn_keepalive_period = 5;
static uint32_t dyn_keepalive = 1;      /* do send keepalives */

static struct ipfw_global       ipfw_gd;        /* CPU-shared global data */
static int      ipfw_state_loosecnt_updthr;     /* loose counter update threshold */
static int      ipfw_state_max = 4096;  /* max # of states */
static int      ipfw_track_max = 4096;  /* max # of tracks */

static int      ipfw_state_headroom;    /* setup at module load time */
static int      ipfw_state_reap_min = 8;
static int      ipfw_state_expire_max = 32;
static int      ipfw_state_scan_max = 256;
static int      ipfw_keepalive_max = 8;
static int      ipfw_track_reap_max = 4;
static int      ipfw_track_expire_max = 16;
static int      ipfw_track_scan_max = 128;

/* Hook to invalidate interface-bound xlat states on address changes —
 * NOTE(review): inferred from the name; confirm at the registration site. */
static eventhandler_tag ipfw_ifaddr_event;
751
/*
 * Compat: the dyn_* count/max knobs predate the state/track split and
 * report the combined numbers via ipfw_sysctl_dyncnt()/dynmax().
 */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
    "Number of states and tracks");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
    "Max number of states and tracks");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
    "Number of states");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max number of states");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
    &ipfw_state_headroom, 0, "headroom for state reap");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
    &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
    &ipfw_track_max, 0, "Max number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
    &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
796 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
797     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
798     "I", "# of states to expire for each expire iteration");
/* Remaining batch-size knobs, all validated by ipfw_sysctl_scancnt(). */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");
811
/*
 * State statistics: each entry exports a counter at the given offset
 * into struct ipfw_context through ipfw_sysctl_stat().
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to states shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of state deleted due to fast TCP port recycling");
832
/* Track statistics, same export mechanism as the state stats above. */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reap due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failure");
853 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
854     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
855     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
856     "LU", "# of IP fragements defraged");
/* Defragmentation and address/port translation (xlat) statistics. */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts resolved on remote cpu");
881
/* Comparators for the red-black trees generated below. */
static int		ipfw_state_cmp(struct ipfw_state *,
			    struct ipfw_state *);
static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
			    struct ipfw_trkcnt *);
static int		ipfw_track_cmp(struct ipfw_track *,
			    struct ipfw_track *);

/* Generate the state, track-counter and track tree implementations. */
RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

/* Forward declarations for functions defined later in this file. */
static int		ipfw_chk(struct ip_fw_args *);
static void		ipfw_track_expire_ipifunc(void *);
static void		ipfw_state_expire_ipifunc(void *);
static void		ipfw_keepalive(void *);
static int		ipfw_state_expire_start(struct ipfw_context *,
			    int, int);
static void		ipfw_crossref_timeo(void *);
static void		ipfw_state_remove(struct ipfw_context *,
			    struct ipfw_state *);
static void		ipfw_xlat_reap_timeo(void *);
static void		ipfw_defrag_redispatch(struct mbuf *, int,
			    struct ip_fw *);

/*
 * LWKT token serializing access to the shared track-counter state in
 * ipfw_gd (presumably because the trkcnt tree is global, not per-cpu).
 */
#define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT	\
	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
915
916 static void
917 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
918     const struct sockaddr *netmask)
919 {
920         const u_char *cp1 = (const u_char *)src;
921         u_char *cp2 = (u_char *)dst;
922         const u_char *cp3 = (const u_char *)netmask;
923         u_char *cplim = cp2 + *cp3;
924         u_char *cplim2 = cp2 + *cp1;
925
926         *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
927         cp3 += 2;
928         if (cplim > cplim2)
929                 cplim = cplim2;
930         while (cp2 < cplim)
931                 *cp2++ = *cp1++ & *cp3++;
932         if (cp2 < cplim2)
933                 bzero(cp2, cplim2 - cp2);
934 }
935
/*
 * Incrementally adjust an Internet checksum after a 16-bit word of the
 * covered data changed from 'old' to 'new'.  For UDP, a checksum of 0
 * means "no checksum" and is preserved, and a computed result of 0 is
 * transmitted as all-ones per RFC 768.
 */
static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
	uint32_t sum;

	if (udp && cksum == 0)
		return (0x0000);

	sum = cksum + old - new;
	sum = (sum >> 16) + (sum & 0xffff);	/* fold the carry once */
	sum &= 0xffff;

	if (udp && sum == 0)
		return (0xFFFF);
	return (sum);
}
950
951 static __inline void
952 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
953     in_addr_t daddr, uint16_t dport, uint8_t proto)
954 {
955
956         key->proto = proto;
957         key->swap = 0;
958
959         if (saddr < daddr) {
960                 key->addr_u.addrs.addr1 = daddr;
961                 key->addr_u.addrs.addr2 = saddr;
962                 key->swap |= IPFW_KEY_SWAP_ADDRS;
963         } else {
964                 key->addr_u.addrs.addr1 = saddr;
965                 key->addr_u.addrs.addr2 = daddr;
966         }
967
968         if (sport < dport) {
969                 key->port_u.ports.port1 = dport;
970                 key->port_u.ports.port2 = sport;
971                 key->swap |= IPFW_KEY_SWAP_PORTS;
972         } else {
973                 key->port_u.ports.port1 = sport;
974                 key->port_u.ports.port2 = dport;
975         }
976
977         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
978                 key->swap |= IPFW_KEY_SWAP_PORTS;
979         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
980                 key->swap |= IPFW_KEY_SWAP_ADDRS;
981 }
982
983 static __inline void
984 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
985     in_addr_t *daddr, uint16_t *dport)
986 {
987
988         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
989                 *saddr = key->addr_u.addrs.addr2;
990                 *daddr = key->addr_u.addrs.addr1;
991         } else {
992                 *saddr = key->addr_u.addrs.addr1;
993                 *daddr = key->addr_u.addrs.addr2;
994         }
995
996         if (key->swap & IPFW_KEY_SWAP_PORTS) {
997                 *sport = key->port_u.ports.port2;
998                 *dport = key->port_u.ports.port1;
999         } else {
1000                 *sport = key->port_u.ports.port1;
1001                 *dport = key->port_u.ports.port2;
1002         }
1003 }
1004
/*
 * Total order on states for the per-cpu state RB tree.
 *
 * Keys are canonicalized by ipfw_key_build(): the larger address/port
 * is stored first and st_swap records the original direction.  Two
 * states with equal proto/addrs/ports are considered the same entry
 * when their swap flags are identical, or when they differ in *all*
 * flag bits (XOR == IPFW_KEY_SWAP_ALL) -- the latter is the same
 * connection seen from the opposite direction.
 */
static int
ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
{

	if (s1->st_proto > s2->st_proto)
		return (1);
	if (s1->st_proto < s2->st_proto)
		return (-1);

	if (s1->st_addrs > s2->st_addrs)
		return (1);
	if (s1->st_addrs < s2->st_addrs)
		return (-1);

	if (s1->st_ports > s2->st_ports)
		return (1);
	if (s1->st_ports < s2->st_ports)
		return (-1);

	if (s1->st_swap == s2->st_swap ||
	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
		return (0);

	if (s1->st_swap > s2->st_swap)
		return (1);
	else
		return (-1);
}
1033
1034 static int
1035 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1036 {
1037
1038         if (t1->tc_proto > t2->tc_proto)
1039                 return (1);
1040         if (t1->tc_proto < t2->tc_proto)
1041                 return (-1);
1042
1043         if (t1->tc_addrs > t2->tc_addrs)
1044                 return (1);
1045         if (t1->tc_addrs < t2->tc_addrs)
1046                 return (-1);
1047
1048         if (t1->tc_ports > t2->tc_ports)
1049                 return (1);
1050         if (t1->tc_ports < t2->tc_ports)
1051                 return (-1);
1052
1053         if (t1->tc_ruleid > t2->tc_ruleid)
1054                 return (1);
1055         if (t1->tc_ruleid < t2->tc_ruleid)
1056                 return (-1);
1057
1058         return (0);
1059 }
1060
1061 static int
1062 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1063 {
1064
1065         if (t1->t_proto > t2->t_proto)
1066                 return (1);
1067         if (t1->t_proto < t2->t_proto)
1068                 return (-1);
1069
1070         if (t1->t_addrs > t2->t_addrs)
1071                 return (1);
1072         if (t1->t_addrs < t2->t_addrs)
1073                 return (-1);
1074
1075         if (t1->t_ports > t2->t_ports)
1076                 return (1);
1077         if (t1->t_ports < t2->t_ports)
1078                 return (-1);
1079
1080         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1081                 return (1);
1082         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1083                 return (-1);
1084
1085         return (0);
1086 }
1087
/*
 * Link a state into this cpu's RB tree and list.
 *
 * Returns NULL on success.  If an equal key already exists in the
 * tree, returns the existing state and leaves 's' unlinked.
 */
static __inline struct ipfw_state *
ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_state *dup;

	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
	    ("state %p was linked", s));
	/* RB_INSERT returns the colliding node, or NULL if inserted. */
	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	if (dup == NULL) {
		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
		s->st_flags |= IPFW_STATE_F_LINKED;
	}
	return (dup);
}
1102
/*
 * Unlink a state from this cpu's RB tree and list; the caller still
 * owns (and must free or relink) the state afterwards.
 */
static __inline void
ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
{

	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
	    ("state %p was not linked", s));
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	s->st_flags &= ~IPFW_STATE_F_LINKED;
}
1113
/*
 * Set the global maximum number of states and derive the per-cpu
 * loose-counter update threshold from it.
 */
static void
ipfw_state_max_set(int state_max)
{

	ipfw_state_max = state_max;
	/* Allow 5% states over-allocation. */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}
1122
1123 static __inline int
1124 ipfw_state_cntcoll(void)
1125 {
1126         int cpu, state_cnt = 0;
1127
1128         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1129                 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1130         return (state_cnt);
1131 }
1132
1133 static __inline int
1134 ipfw_state_cntsync(void)
1135 {
1136         int state_cnt;
1137
1138         state_cnt = ipfw_state_cntcoll();
1139         ipfw_gd.ipfw_state_loosecnt = state_cnt;
1140         return (state_cnt);
1141 }
1142
1143 static __inline int
1144 ipfw_free_rule(struct ip_fw *rule)
1145 {
1146         KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1147         KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1148         rule->refcnt--;
1149         if (rule->refcnt == 0) {
1150                 if (rule->cross_rules != NULL)
1151                         kfree(rule->cross_rules, M_IPFW);
1152                 kfree(rule, M_IPFW);
1153                 return 1;
1154         }
1155         return 0;
1156 }
1157
/*
 * void *-taking wrapper around ipfw_free_rule().  For a KLD build it
 * also drops the module-wide reference taken by ipfw_ref_rule(), so
 * the module can be unloaded once all rule references are gone.
 */
static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	KASSERT(ipfw_gd.ipfw_refcnt > 0,
	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}
1168
/*
 * Take a reference on a rule; must run on the rule's owning cpu.
 * For a KLD build also pin the module via the global refcnt.
 */
static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}
1178
1179 /*
1180  * This macro maps an ip pointer into a layer3 header pointer of type T
1181  */
1182 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1183
1184 static __inline int
1185 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1186 {
1187         int type = L3HDR(struct icmp,ip)->icmp_type;
1188         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1189         int idx = type / 32;
1190
1191         if (idx >= idx_max)
1192                 return (0);
1193         return (cmd->d[idx] & (1 << (type % 32)));
1194 }
1195
1196 static __inline int
1197 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1198 {
1199         int code = L3HDR(struct icmp,ip)->icmp_code;
1200         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1201         int idx = code / 32;
1202
1203         if (idx >= idx_max)
1204                 return (0);
1205         return (cmd->d[idx] & (1 << (code % 32)));
1206 }
1207
1208 #define TT      ((1 << ICMP_ECHO) | \
1209                  (1 << ICMP_ROUTERSOLICIT) | \
1210                  (1 << ICMP_TSTAMP) | \
1211                  (1 << ICMP_IREQ) | \
1212                  (1 << ICMP_MASKREQ))
1213
1214 static int
1215 is_icmp_query(struct ip *ip)
1216 {
1217         int type = L3HDR(struct icmp, ip)->icmp_type;
1218
1219         return (type < 32 && (TT & (1 << type)));
1220 }
1221
1222 #undef TT
1223
1224 /*
1225  * The following checks use two arrays of 8 or 16 bits to store the
1226  * bits that we want set or clear, respectively. They are in the
1227  * low and high half of cmd->arg1 or cmd->d[0].
1228  *
1229  * We scan options and store the bits we find set. We succeed if
1230  *
1231  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1232  *
1233  * The code is sometimes optimized not to store additional variables.
1234  */
1235 static int
1236 flags_match(ipfw_insn *cmd, uint8_t bits)
1237 {
1238         u_char want_clear;
1239         bits = ~bits;
1240
1241         if (((cmd->arg1 & 0xff) & bits) != 0)
1242                 return 0; /* some bits we want set were clear */
1243
1244         want_clear = (cmd->arg1 >> 8) & 0xff;
1245         if ((want_clear & bits) != want_clear)
1246                 return 0; /* some bits we want clear were set */
1247         return 1;
1248 }
1249
1250 static int
1251 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1252 {
1253         int optlen, bits = 0;
1254         u_char *cp = (u_char *)(ip + 1);
1255         int x = (ip->ip_hl << 2) - sizeof(struct ip);
1256
1257         for (; x > 0; x -= optlen, cp += optlen) {
1258                 int opt = cp[IPOPT_OPTVAL];
1259
1260                 if (opt == IPOPT_EOL)
1261                         break;
1262
1263                 if (opt == IPOPT_NOP) {
1264                         optlen = 1;
1265                 } else {
1266                         optlen = cp[IPOPT_OLEN];
1267                         if (optlen <= 0 || optlen > x)
1268                                 return 0; /* invalid or truncated */
1269                 }
1270
1271                 switch (opt) {
1272                 case IPOPT_LSRR:
1273                         bits |= IP_FW_IPOPT_LSRR;
1274                         break;
1275
1276                 case IPOPT_SSRR:
1277                         bits |= IP_FW_IPOPT_SSRR;
1278                         break;
1279
1280                 case IPOPT_RR:
1281                         bits |= IP_FW_IPOPT_RR;
1282                         break;
1283
1284                 case IPOPT_TS:
1285                         bits |= IP_FW_IPOPT_TS;
1286                         break;
1287
1288                 default:
1289                         break;
1290                 }
1291         }
1292         return (flags_match(cmd, bits));
1293 }
1294
1295 static int
1296 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1297 {
1298         int optlen, bits = 0;
1299         struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1300         u_char *cp = (u_char *)(tcp + 1);
1301         int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1302
1303         for (; x > 0; x -= optlen, cp += optlen) {
1304                 int opt = cp[0];
1305
1306                 if (opt == TCPOPT_EOL)
1307                         break;
1308
1309                 if (opt == TCPOPT_NOP) {
1310                         optlen = 1;
1311                 } else {
1312                         optlen = cp[1];
1313                         if (optlen <= 0)
1314                                 break;
1315                 }
1316
1317                 switch (opt) {
1318                 case TCPOPT_MAXSEG:
1319                         bits |= IP_FW_TCPOPT_MSS;
1320                         break;
1321
1322                 case TCPOPT_WINDOW:
1323                         bits |= IP_FW_TCPOPT_WINDOW;
1324                         break;
1325
1326                 case TCPOPT_SACK_PERMITTED:
1327                 case TCPOPT_SACK:
1328                         bits |= IP_FW_TCPOPT_SACK;
1329                         break;
1330
1331                 case TCPOPT_TIMESTAMP:
1332                         bits |= IP_FW_TCPOPT_TS;
1333                         break;
1334
1335                 case TCPOPT_CC:
1336                 case TCPOPT_CCNEW:
1337                 case TCPOPT_CCECHO:
1338                         bits |= IP_FW_TCPOPT_CC;
1339                         break;
1340
1341                 default:
1342                         break;
1343                 }
1344         }
1345         return (flags_match(cmd, bits));
1346 }
1347
1348 static int
1349 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1350 {
1351         if (ifp == NULL)        /* no iface with this packet, match fails */
1352                 return 0;
1353
1354         /* Check by name or by IP address */
1355         if (cmd->name[0] != '\0') { /* match by name */
1356                 /* Check name */
1357                 if (cmd->p.glob) {
1358                         if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1359                                 return(1);
1360                 } else {
1361                         if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1362                                 return(1);
1363                 }
1364         } else {
1365                 struct ifaddr_container *ifac;
1366
1367                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1368                         struct ifaddr *ia = ifac->ifa;
1369
1370                         if (ia->ifa_addr == NULL)
1371                                 continue;
1372                         if (ia->ifa_addr->sa_family != AF_INET)
1373                                 continue;
1374                         if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1375                             (ia->ifa_addr))->sin_addr.s_addr)
1376                                 return(1);      /* match */
1377                 }
1378         }
1379         return(0);      /* no match, fail ... */
1380 }
1381
/*
 * SNPARGS(buf, len): argument pair for ksnprintf() -- write starting at
 * offset 'len' into 'buf', with the remaining space clamped at zero.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 *
 * Builds a human-readable description of the matched action, protocol
 * and addresses, and emits it via log(9).  Logging is rate-limited:
 * per-rule via the O_LOG instruction's max_log/log_left counters, and
 * for rule-less (bogus) packets via ctx->ipfw_norule_counter against
 * the global verbose_limit.
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;	/* non-zero: the limit that was just hit */
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1==ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
			{
				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
				int len;

				len = ksnprintf(SNPARGS(action2, 0),
				    "Forward to %s",
				    kinet_ntoa(sa->sa.sin_addr, abuf));
				if (sa->sa.sin_port) {
					ksnprintf(SNPARGS(action2, len), ":%d",
						  sa->sa.sin_port);
				}
			}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	/* Describe the packet: MAC-only, or IP with per-protocol detail. */
	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		if (eh != NULL) { /* layer 2 packets are as on the wire */
			ip_off = ntohs(ip->ip_off);
			ip_len = ntohs(ip->ip_len);
		} else {
			ip_off = ip->ip_off;
			ip_len = ip->ip_len;
		}
		offset = ip_off & IP_OFFMASK;
		/* Ports are only present in the first fragment (offset 0). */
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS
1587
1588 static void
1589 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1590 {
1591         struct ip_fw *rule = slave_x->xlat_rule;
1592
1593         KKASSERT(rule->cpuid == mycpuid);
1594
1595         /* No more cross references; free this pair now. */
1596         kfree(x, M_IPFW);
1597         kfree(slave_x, M_IPFW);
1598
1599         /* See the comment in ipfw_ip_xlate_dispatch(). */
1600         rule->cross_refs--;
1601 }
1602
/*
 * Netisr handler: scan this cpu's xlat reap queue and free any state
 * pairs whose cross references (on both the master and the slave
 * side) have dropped to zero.  Re-arms the reap callout if the queue
 * is still non-empty afterwards.
 */
static void
ipfw_xlat_reap_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *s, *ns;

	ASSERT_NETISR_NCPUS(mycpuid);

	crit_enter();
	/* Reply ASAP. */
	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
	crit_exit();

	/* TODO: limit scanning depth */
	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		struct ipfw_xlat *slave_x = x->xlat_pair;
		uint64_t crefs;

		/* Both sides must be idle before the pair may be freed. */
		crefs = slave_x->xlat_crefs + x->xlat_crefs;
		if (crefs == 0) {
			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
			ipfw_xlat_reap(x, slave_x);
		}
	}
	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		/* Some pairs are still referenced; try again in 2 ticks. */
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}
}
1633
1634 static void
1635 ipfw_xlat_reap_timeo(void *xnm)
1636 {
1637         struct netmsg_base *nm = xnm;
1638
1639         KKASSERT(mycpuid < netisr_ncpus);
1640
1641         crit_enter();
1642         if (nm->lmsg.ms_flags & MSGF_DONE)
1643                 netisr_sendmsg_oncpu(nm);
1644         crit_exit();
1645 }
1646
/*
 * Netisr handler, running on the cpu owning the slave xlat: tear
 * down an invalidated xlat pair on behalf of ipfw_state_del() which
 * ran on the master's cpu.  If either side still has cross
 * references in flight, the pair is parked on the per-cpu reap queue
 * and destroyed later by ipfw_xlat_reap_dispatch().
 */
static void
ipfw_xlat_free_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
	struct ipfw_xlat *slave_x = x->xlat_pair;
	uint64_t crefs;

	ASSERT_NETISR_NCPUS(mycpuid);

	KKASSERT(slave_x != NULL);
	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);

	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
	    ("master xlat is still linked"));
	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
		ipfw_state_unlink(ctx, &slave_x->xlat_st);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	slave_x->xlat_crefs--;

	crefs = slave_x->xlat_crefs + x->xlat_crefs;
	if (crefs == 0) {
		/* Neither side is referenced; free the pair immediately. */
		ipfw_xlat_reap(x, slave_x);
		return;
	}

	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		/* First deferred pair; start the reap callout. */
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}

	/*
	 * This pair is still referenced; defer its destruction.
	 * YYY reuse st_link.
	 */
	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
}
1685
1686 static __inline void
1687 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1688 {
1689
1690         x->xlat_invalid = 1;
1691         x->xlat_pair->xlat_invalid = 1;
1692 }
1693
/*
 * Delete a state (master xlat included, slave xlat excluded) from
 * this context: unhook it from its track and from the state
 * list/tree, then free it.  A master xlat whose slave lives on a
 * different cpu is only invalidated here; the actual teardown is
 * handed to the slave's cpu via ipfw_xlat_free_dispatch().
 */
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_xlat *x, *slave_x;
	struct netmsg_base *nm;

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
	    ("delete slave xlat"));

	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));
	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;

	/*
	 * Unhook this state.
	 */
	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}
	ipfw_state_unlink(ctx, s);

	/*
	 * Free this state.  Xlat requires special processing,
	 * since xlat are paired state and they could be on
	 * different cpus.
	 */

	if (!IPFW_ISXLAT(s->st_type)) {
		/* Not xlat; free now. */
		kfree(s, M_IPFW);
		/* Done! */
		return;
	}
	x = (struct ipfw_xlat *)s;

	if (x->xlat_pair == NULL) {
		/* Not setup yet; free now. */
		kfree(x, M_IPFW);
		/* Done! */
		return;
	}
	slave_x = x->xlat_pair;
	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);

	if (x->xlat_pcpu == mycpuid) {
		/*
		 * Paired states are on the same cpu; delete this
		 * pair now.
		 */
		KKASSERT(x->xlat_crefs == 0);
		KKASSERT(slave_x->xlat_crefs == 0);
		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
			ipfw_state_unlink(ctx, &slave_x->xlat_st);
		kfree(x, M_IPFW);
		kfree(slave_x, M_IPFW);
		return;
	}

	/*
	 * Free the paired states on the cpu owning the slave xlat.
	 */

	/*
	 * Mark the state pair invalid; completely deleting them
	 * may take some time.
	 */
	ipfw_xlat_invalidate(x);

	nm = &x->xlat_freenm;
	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
	    ipfw_xlat_free_dispatch);
	nm->lmsg.u.ms_resultp = x;

	/* See the comment in ipfw_xlate_redispatch(). */
	x->xlat_rule->cross_refs++;
	x->xlat_crefs++;

	netisr_sendmsg(nm, x->xlat_pcpu);
}
1785
1786 static void
1787 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1788 {
1789
1790         if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1791                 KKASSERT(IPFW_ISXLAT(s->st_type));
1792                 ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1793                 ipfw_state_unlink(ctx, s);
1794                 return;
1795         }
1796         ipfw_state_del(ctx, s);
1797 }
1798
/*
 * Reap states to relieve a state shortage; tries to delete up to
 * 'reap_max' (at least ipfw_state_reap_min) dead or closed-TCP
 * states.  If no expiry walk is in progress one is kick started in
 * reap mode; otherwise the ongoing walk is advanced aggressively
 * from its anchor, ignoring the scan limit.  Returns the number of
 * states deleted.
 */
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		/* Move the anchor past 's' to record our progress. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			/* Stop early once global usage is back under max. */
			if ((expired & 0xff) == 0 && 
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
1857
1858 static void
1859 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1860 {
1861         struct ipfw_state *s, *sn;
1862
1863         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1864                 if (IPFW_STATE_SCANSKIP(s))
1865                         continue;
1866                 if (rule != NULL && s->st_rule != rule)
1867                         continue;
1868                 ipfw_state_del(ctx, s);
1869         }
1870 }
1871
/*
 * The state expiry walk has finished on this cpu: clear the
 * in-progress flag and re-arm the periodic (once per second) state
 * expire callout.
 */
static void
ipfw_state_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
	callout_reset(&ctx->ipfw_stateto_ch, hz,
	    ipfw_state_expire_ipifunc, NULL);
}
1882
1883 static void
1884 ipfw_state_expire_more(struct ipfw_context *ctx)
1885 {
1886         struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1887
1888         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1889             ("stateexp is not in progress"));
1890         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1891             ("stateexp more did not finish"));
1892         netisr_sendmsg_oncpu(nm);
1893 }
1894
/*
 * Walk the state list from 'anchor', deleting dead states (and, in
 * reap mode, closed TCP states).  Scans at most 'scan_max' states
 * and deletes at most 'expire_max'; if either budget is exhausted
 * another pass is scheduled via ipfw_state_expire_more(), otherwise
 * the walk is completed with ipfw_state_expire_done().  Returns the
 * number of states deleted in this pass.
 */
static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan budget exhausted; continue later. */
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		/* Move the anchor past 's' to record our progress. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				/* Expire budget exhausted; continue later. */
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			/*
			 * When reaping, yield once global usage is back
			 * under the limit; more can be done later.
			 */
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}
1938
1939 static void
1940 ipfw_state_expire_more_dispatch(netmsg_t nm)
1941 {
1942         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1943         struct ipfw_state *anchor;
1944
1945         ASSERT_NETISR_NCPUS(mycpuid);
1946         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1947             ("statexp is not in progress"));
1948
1949         /* Reply ASAP */
1950         netisr_replymsg(&nm->base, 0);
1951
1952         anchor = &ctx->ipfw_stateexp_anch;
1953         if (ctx->ipfw_state_cnt == 0) {
1954                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1955                 ipfw_state_expire_done(ctx);
1956                 return;
1957         }
1958         ipfw_state_expire_loop(ctx, anchor,
1959             ipfw_state_scan_max, ipfw_state_expire_max);
1960 }
1961
/*
 * Begin a state expiry walk on this cpu: insert the per-cpu anchor
 * at the head of the state list and run the first loop pass.  Rate
 * limited to once per second unless reaping.  Returns the number of
 * states deleted in the first pass.
 */
static int
ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_state *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
	    ("stateexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;

	if (ctx->ipfw_state_cnt == 0) {
		ipfw_state_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
	    ctx->ipfw_state_lastexp == time_uptime) {
		ipfw_state_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_state_lastexp = time_uptime;

	anchor = &ctx->ipfw_stateexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
}
1990
/*
 * Netisr handler: periodic kick to start a state expiry walk on
 * this cpu, unless one is already running.
 */
static void
ipfw_state_expire_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
		/* Running; done. */
		return;
	}
	ipfw_state_expire_start(ctx,
	    ipfw_state_scan_max, ipfw_state_expire_max);
}
2010
2011 static void
2012 ipfw_state_expire_ipifunc(void *dummy __unused)
2013 {
2014         struct netmsg_base *msg;
2015
2016         KKASSERT(mycpuid < netisr_ncpus);
2017         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2018
2019         crit_enter();
2020         if (msg->lmsg.ms_flags & MSGF_DONE)
2021                 netisr_sendmsg_oncpu(msg);
2022         crit_exit();
2023 }
2024
/*
 * Track TCP sequence/ack numbers for a state and detect the ACK of
 * each side's FIN.  Returns FALSE when the segment is
 * out-of-sequence (the caller then leaves the state's lifetime
 * untouched); TRUE otherwise.  RST segments are always accepted.
 */
static boolean_t
ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
{
	uint32_t seq = ntohl(tcp->th_seq);
	uint32_t ack = ntohl(tcp->th_ack);

	if (tcp->th_flags & TH_RST)
		return (TRUE);

	if (dir == MATCH_FORWARD) {
		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
			/* First forward segment seen; record its seq. */
			s->st_flags |= IPFW_STATE_F_SEQFWD;
			s->st_seq_fwd = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
			s->st_seq_fwd = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
				/* First forward ack seen; record it. */
				s->st_flags |= IPFW_STATE_F_ACKFWD;
				s->st_ack_fwd = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
				s->st_ack_fwd = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/*
			 * If the reverse direction sent a FIN that is not
			 * yet ACKed, check whether this ack covers it.
			 */
			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
				s->st_state |= (TH_ACK << 8);
		}
	} else {
		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
			/* First reverse segment seen; record its seq. */
			s->st_flags |= IPFW_STATE_F_SEQREV;
			s->st_seq_rev = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
			s->st_seq_rev = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
				/* First reverse ack seen; record it. */
				s->st_flags |= IPFW_STATE_F_ACKREV;
				s->st_ack_rev= ack;
			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
				s->st_ack_rev = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/*
			 * If the forward direction sent a FIN that is not
			 * yet ACKed, check whether this ack covers it.
			 */
			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
			    s->st_ack_rev == s->st_seq_fwd + 1)
				s->st_state |= TH_ACK;
		}
	}
	return (TRUE);
}
2087
/*
 * Refresh a state's expire time and, for TCP, its connection state
 * bits, according to the packet just matched.  The lower 8 bits of
 * st_state accumulate forward-direction TCP flags, the upper 8 bits
 * reverse-direction flags.
 */
static void
ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
    const struct tcphdr *tcp, struct ipfw_state *s)
{

	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;

		/* Out-of-sequence segments do not refresh the state. */
		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
			return;

		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
		switch (s->st_state & IPFW_STATE_TCPSTATES) {
		case TH_SYN:				/* opening */
			s->st_expire = time_uptime + dyn_syn_lifetime;
			break;

		case BOTH_SYN:			/* move to established */
		case BOTH_SYN | TH_FIN:		/* one side tries to close */
		case BOTH_SYN | (TH_FIN << 8):
			s->st_expire = time_uptime + dyn_ack_lifetime;
			break;

		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
				/* And both FINs were ACKed. */
				s->st_expire = time_uptime + dyn_fin_lifetime;
			} else {
				s->st_expire = time_uptime +
				    dyn_finwait_lifetime;
			}
			break;

		default:
#if 0
			/*
			 * reset or some invalid combination, but can also
			 * occur if we use keep-state the wrong way.
			 */
			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
				kprintf("invalid state: 0x%x\n", s->st_state);
#endif
			s->st_expire = time_uptime + dyn_rst_lifetime;
			break;
		}
	} else if (pkt->proto == IPPROTO_UDP) {
		s->st_expire = time_uptime + dyn_udp_lifetime;
	} else {
		/* other protocols */
		s->st_expire = time_uptime + dyn_short_lifetime;
	}
}
2140
2141 /*
2142  * Lookup a state.
2143  */
2144 static struct ipfw_state *
2145 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2146     int *match_direction, const struct tcphdr *tcp)
2147 {
2148         struct ipfw_state *key, *s;
2149         int dir = MATCH_NONE;
2150
2151         key = &ctx->ipfw_state_tmpkey;
2152         ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2153             pkt->dst_ip, pkt->dst_port, pkt->proto);
2154         s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2155         if (s == NULL)
2156                 goto done; /* not found. */
2157         if (IPFW_STATE_ISDEAD(s)) {
2158                 ipfw_state_remove(ctx, s);
2159                 s = NULL;
2160                 goto done;
2161         }
2162         if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2163                 /* TCP ports recycling is too fast. */
2164                 ctx->ipfw_sts_tcprecycled++;
2165                 ipfw_state_remove(ctx, s);
2166                 s = NULL;
2167                 goto done;
2168         }
2169
2170         if (s->st_swap == key->st_swap) {
2171                 dir = MATCH_FORWARD;
2172         } else {
2173                 KASSERT((s->st_swap & key->st_swap) == 0,
2174                     ("found mismatch state"));
2175                 dir = MATCH_REVERSE;
2176         }
2177
2178         /* Update this state. */
2179         ipfw_state_update(pkt, dir, tcp, s);
2180
2181         if (s->st_track != NULL) {
2182                 /* This track has been used. */
2183                 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2184         }
2185 done:
2186         if (match_direction)
2187                 *match_direction = dir;
2188         return (s);
2189 }
2190
2191 static struct ipfw_state *
2192 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2193     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2194 {
2195         struct ipfw_state *s;
2196         size_t sz;
2197
2198         KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2199             ("invalid state type %u", type));
2200
2201         sz = sizeof(struct ipfw_state);
2202         if (IPFW_ISXLAT(type))
2203                 sz = sizeof(struct ipfw_xlat);
2204
2205         s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2206         if (s == NULL) {
2207                 ctx->ipfw_sts_nomem++;
2208                 return (NULL);
2209         }
2210
2211         ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2212             id->dst_ip, id->dst_port, id->proto);
2213
2214         s->st_rule = rule;
2215         s->st_type = type;
2216         if (IPFW_ISXLAT(type)) {
2217                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2218
2219                 x->xlat_dir = MATCH_NONE;
2220                 x->xlat_pcpu = -1;
2221         }
2222
2223         /*
2224          * Update this state:
2225          * Set st_expire and st_state.
2226          */
2227         ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2228
2229         return (s);
2230 }
2231
/*
 * Allocate a new state, account for it in this context, and link it
 * into the state tree/list.  If 't' is non-NULL the state is also
 * attached to that track (limit accounting).  Returns NULL when the
 * state could not be allocated.
 */
static struct ipfw_state *
ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
    const struct tcphdr *tcp)
{
	struct ipfw_state *s, *dup;

	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
	if (s == NULL)
		return (NULL);

	ctx->ipfw_state_cnt++;
	ctx->ipfw_state_loosecnt++;
	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
		/* Flush the per-cpu loose count into the global counter. */
		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
		ctx->ipfw_state_loosecnt = 0;
	}

	dup = ipfw_state_link(ctx, s);
	if (dup != NULL)
		panic("ipfw: %u state exists %p", type, dup);

	if (t != NULL) {
		/* Keep the track referenced. */
		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
		s->st_track = t;
	}
	return (s);
}
2261
/*
 * Free a track and drop its reference on the globally shared trkcnt.
 * Returns TRUE when the trkcnt itself was released as well (last
 * reference), FALSE otherwise.
 */
static boolean_t
ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
{
	struct ipfw_trkcnt *trk;
	boolean_t trk_freed = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));
	KASSERT(LIST_EMPTY(&t->t_state_list),
	    ("invalid track is still referenced"));

	trk = t->t_trkcnt;
	KASSERT(trk != NULL, ("track has no trkcnt"));

	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
	kfree(t, M_IPFW);

	/*
	 * fdrop() style reference counting.
	 * See kern/kern_descrip.c fdrop().
	 */
	for (;;) {
		int refs = trk->tc_refs;

		cpu_ccfence();
		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
		if (refs == 1) {
			/*
			 * Possibly the last reference; take the global
			 * token so the 1->0 transition and the tree
			 * removal are atomic w.r.t. other cpus.
			 */
			IPFW_TRKCNT_TOKGET;
			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
				KASSERT(trk->tc_count == 0,
				    ("%d states reference this trkcnt",
				     trk->tc_count));
				RB_REMOVE(ipfw_trkcnt_tree,
				    &ipfw_gd.ipfw_trkcnt_tree, trk);

				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
				    ("invalid trkcnt cnt %d",
				     ipfw_gd.ipfw_trkcnt_cnt));
				ipfw_gd.ipfw_trkcnt_cnt--;
				IPFW_TRKCNT_TOKREL;

				/* Cache one spare trkcnt per cpu. */
				if (ctx->ipfw_trkcnt_spare == NULL)
					ctx->ipfw_trkcnt_spare = trk;
				else
					kfree(trk, M_IPFW);
				trk_freed = TRUE;
				break; /* done! */
			}
			IPFW_TRKCNT_TOKREL;
			/* retry */
		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
			break; /* done! */
		}
		/* retry */
	}
	return (trk_freed);
}
2319
2320 static void
2321 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2322 {
2323         struct ipfw_track *t, *tn;
2324
2325         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2326                 if (t->t_count == NULL) /* anchor */
2327                         continue;
2328                 if (rule != NULL && t->t_rule != rule)
2329                         continue;
2330                 ipfw_track_free(ctx, t);
2331         }
2332 }
2333
/*
 * Delete the dead states (and, in reap mode, closed TCP states)
 * referencing the track 't'.  Rate limited to once per second.
 * Returns TRUE if at least one state was deleted.
 */
static boolean_t
ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
    boolean_t reap)
{
	struct ipfw_state *s, *sn;
	boolean_t ret = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));

	if (LIST_EMPTY(&t->t_state_list))
		return (FALSE);

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if (t->t_lastexp == time_uptime)
		return (FALSE);
	t->t_lastexp = time_uptime;

	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
		if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
			KASSERT(s->st_track == t,
			    ("state track %p does not match %p",
			     s->st_track, t));
			ipfw_state_del(ctx, s);
			ret = TRUE;
		}
	}
	return (ret);
}
2364
2365 static __inline struct ipfw_trkcnt *
2366 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2367 {
2368         struct ipfw_trkcnt *trk;
2369
2370         if (ctx->ipfw_trkcnt_spare != NULL) {
2371                 trk = ctx->ipfw_trkcnt_spare;
2372                 ctx->ipfw_trkcnt_spare = NULL;
2373         } else {
2374                 trk = kmalloc(sizeof(*trk), M_IPFW,
2375                               M_INTWAIT | M_NULLOK | M_CACHEALIGN);
2376         }
2377         return (trk);
2378 }
2379
/*
 * The track expiry walk has finished on this cpu: clear the
 * in-progress flag and re-arm the periodic (once per second) track
 * expire callout.
 */
static void
ipfw_track_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
	callout_reset(&ctx->ipfw_trackto_ch, hz,
	    ipfw_track_expire_ipifunc, NULL);
}
2390
2391 static void
2392 ipfw_track_expire_more(struct ipfw_context *ctx)
2393 {
2394         struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2395
2396         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2397             ("trackexp is not in progress"));
2398         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2399             ("trackexp more did not finish"));
2400         netisr_sendmsg_oncpu(nm);
2401 }
2402
/*
 * Walk the track list from 'anchor', expiring each track's dead
 * states and freeing tracks that are unreferenced and expired (or
 * when reaping, merely unreferenced).  Scans at most 'scan_max'
 * tracks and frees at most 'expire_max'; if either budget is
 * exhausted another pass is scheduled via ipfw_track_expire_more(),
 * otherwise the walk is completed with ipfw_track_expire_done().
 * Returns the number of tracks freed in this pass.
 */
static int
ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_track *t;
	int scanned = 0, expired = 0;
	boolean_t reap = FALSE;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
		reap = TRUE;

	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan budget exhausted; continue later. */
			ipfw_track_expire_more(ctx);
			return (expired);
		}

		/* Move the anchor past 't' to record our progress. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		ipfw_track_state_expire(ctx, t, reap);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
			/* Expired. */
			if (ipfw_track_free(ctx, t)) {
				if (++expired >= expire_max) {
					/* Expire budget exhausted. */
					ipfw_track_expire_more(ctx);
					return (expired);
				}
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
	ipfw_track_expire_done(ctx);
	return (expired);
}
2449
/*
 * Start a track expiration round on this cpu: set IPFW_FLAG_TRACKEXP,
 * rate-limit non-reap rounds to once per second, insert the per-cpu
 * anchor at the head of the track list and run the expiration loop.
 *
 * Returns the number of tracks freed by the initial loop invocation.
 */
static int
ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_track *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
	    ("trackexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;

	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* No tracks at all; nothing to expire. */
		ipfw_track_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
	    ctx->ipfw_track_lastexp == time_uptime) {
		ipfw_track_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_track_lastexp = time_uptime;

	anchor = &ctx->ipfw_trackexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
}
2478
/*
 * Netmsg handler: resume a suspended track expiration round on this
 * cpu, continuing the list walk from where the anchor was left by
 * ipfw_track_expire_loop().
 */
static void
ipfw_track_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_track *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_trackexp_anch;
	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* All tracks went away meanwhile; finish the round. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		ipfw_track_expire_done(ctx);
		return;
	}
	ipfw_track_expire_loop(ctx, anchor,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2501
/*
 * Netmsg handler: kick off a new track expiration round on this cpu,
 * unless one is already in progress.
 */
static void
ipfw_track_expire_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
		/* Running; done. */
		return;
	}
	ipfw_track_expire_start(ctx,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2521
/*
 * IPI/callout entry point: post this cpu's track expiration netmsg to
 * the local netisr, but only if the previous message has completed
 * (MSGF_DONE), so the netmsg is never double-queued.
 */
static void
ipfw_track_expire_ipifunc(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
2535
/*
 * Forcibly reclaim tracks on this cpu because we are short of them.
 * If no expiration round is running, start one in reap mode with an
 * unlimited scan budget.  If a round is already in progress, continue
 * the walk inline from the current anchor position, again ignoring
 * the scan limit and expiring states unconditionally.
 *
 * Returns the number of tracks freed.
 */
static int
ipfw_track_reap(struct ipfw_context *ctx)
{
	struct ipfw_track *t, *anchor;
	int expired;

	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
		/*
		 * Kick start track expiring.  Ignore scan limit,
		 * we are short of tracks.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
		expired = ipfw_track_expire_start(ctx, INT_MAX,
		    ipfw_track_reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
		return (expired);
	}

	/*
	 * Tracks are being expired.
	 */

	if (RB_EMPTY(&ctx->ipfw_track_tree))
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_trackexp_anch;
	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of tracks.
		 */

		/* Advance the anchor past 't'. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		/* Reap mode: expire this track's states unconditionally. */
		ipfw_track_state_expire(ctx, t, TRUE);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (ipfw_track_free(ctx, t)) {
			if (++expired >= ipfw_track_reap_max) {
				ipfw_track_expire_more(ctx);
				break;
			}
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_track_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
2595
/*
 * Look up or create the per-cpu track for 'id' + 'rule'; a track
 * accounts the sessions counted by an O_LIMIT rule.  Only the address
 * and port fields selected by 'limit_mask' participate in the key.
 * Each per-cpu track references a globally shared ipfw_trkcnt, which
 * holds the actual session counter; if no trkcnt exists for this key
 * yet, a new one is allocated, otherwise the one installed by another
 * cpu is shared.
 *
 * Returns NULL on allocation failure, or when the global track limit
 * is hit and reaping could not reclaim a trkcnt.
 */
static struct ipfw_track *
ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t limit_mask, struct ip_fw *rule)
{
	struct ipfw_track *key, *t, *dup;
	struct ipfw_trkcnt *trk, *ret;
	boolean_t do_expire = FALSE;

	KASSERT(rule->track_ruleid != 0,
	    ("rule %u has no track ruleid", rule->rulenum));

	/* Build the lookup key from the masked flow id. */
	key = &ctx->ipfw_track_tmpkey;
	key->t_proto = id->proto;
	key->t_addrs = 0;
	key->t_ports = 0;
	key->t_rule = rule;
	if (limit_mask & DYN_SRC_ADDR)
		key->t_saddr = id->src_ip;
	if (limit_mask & DYN_DST_ADDR)
		key->t_daddr = id->dst_ip;
	if (limit_mask & DYN_SRC_PORT)
		key->t_sport = id->src_port;
	if (limit_mask & DYN_DST_PORT)
		key->t_dport = id->dst_port;

	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
	if (t != NULL)
		goto done;

	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
	if (t == NULL) {
		ctx->ipfw_tks_nomem++;
		return (NULL);
	}

	t->t_key = key->t_key;
	t->t_rule = rule;
	t->t_lastexp = 0;
	LIST_INIT(&t->t_state_list);

	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
		time_t globexp, uptime;

		/* Global trkcnt limit hit; do not allocate a new one. */
		trk = NULL;
		do_expire = TRUE;

		/*
		 * Do not expire globally more than once per second,
		 * it is useless.
		 */
		uptime = time_uptime;
		globexp = ipfw_gd.ipfw_track_globexp;
		if (globexp != uptime &&
		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
		    globexp, uptime)) {
			int cpu;

			/* Expire tracks on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_track_expire_ipifunc, NULL);
			}
		}
	} else {
		trk = ipfw_trkcnt_alloc(ctx);
	}
	if (trk == NULL) {
		struct ipfw_trkcnt *tkey;

		tkey = &ctx->ipfw_trkcnt_tmpkey;
		key = NULL; /* tkey overlaps key */

		tkey->tc_key = t->t_key;
		tkey->tc_ruleid = rule->track_ruleid;

		/* Share a trkcnt installed by another cpu, if any. */
		IPFW_TRKCNT_TOKGET;
		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    tkey);
		if (trk == NULL) {
			IPFW_TRKCNT_TOKREL;
			if (do_expire) {
				/* Try reclaiming a trkcnt by reaping. */
				ctx->ipfw_tks_reap++;
				if (ipfw_track_reap(ctx) > 0) {
					if (ipfw_gd.ipfw_trkcnt_cnt <
					    ipfw_track_max) {
						trk = ipfw_trkcnt_alloc(ctx);
						if (trk != NULL)
							goto install;
						ctx->ipfw_tks_cntnomem++;
					} else {
						ctx->ipfw_tks_overflow++;
					}
				} else {
					ctx->ipfw_tks_reapfailed++;
					ctx->ipfw_tks_overflow++;
				}
			} else {
				ctx->ipfw_tks_cntnomem++;
			}
			kfree(t, M_IPFW);
			return (NULL);
		}
		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
		    ("invalid trkcnt refs %d", trk->tc_refs));
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	} else {
install:
		/* Initialize and publish the freshly allocated trkcnt. */
		trk->tc_key = t->t_key;
		trk->tc_ruleid = rule->track_ruleid;
		trk->tc_refs = 0;
		trk->tc_count = 0;
		trk->tc_expire = 0;
		trk->tc_rulenum = rule->rulenum;

		IPFW_TRKCNT_TOKGET;
		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    trk);
		if (ret != NULL) {
			/*
			 * Another cpu raced us and installed a trkcnt
			 * for the same key first: keep ours as the
			 * per-cpu spare and use the installed one.
			 */
			KASSERT(ret->tc_refs > 0 &&
			    ret->tc_refs < netisr_ncpus,
			    ("invalid trkcnt refs %d", ret->tc_refs));
			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
			    ("trkcnt spare was installed"));
			ctx->ipfw_trkcnt_spare = trk;
			trk = ret;
		} else {
			ipfw_gd.ipfw_trkcnt_cnt++;
		}
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	}
	t->t_count = &trk->tc_count;
	t->t_trkcnt = trk;

	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	if (dup != NULL)
		panic("ipfw: track exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
done:
	/* Refresh the track's lifetime on every use. */
	t->t_expire = time_uptime + dyn_short_lifetime;
	return (t);
}
2741
/*
 * Install state for rule type cmd->o.opcode (O_KEEP_STATE, O_REDIRECT
 * or O_LIMIT).
 *
 * Returns NULL if state is not installed because of errors or because
 * states limitations are enforced.
 */
static struct ipfw_state *
ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
    ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
{
	struct ipfw_state *s;
	struct ipfw_track *t;
	int count, diff;

	/*
	 * Global state limit enforcement: reap locally first; if still
	 * over the limit, ask the other cpus to expire their states (at
	 * most once per second) and fail this installation.
	 */
	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
		boolean_t overflow = TRUE;

		ctx->ipfw_sts_reap++;
		if (ipfw_state_reap(ctx, diff) == 0)
			ctx->ipfw_sts_reapfailed++;
		if (ipfw_state_cntsync() < ipfw_state_max)
			overflow = FALSE;

		if (overflow) {
			time_t globexp, uptime;
			int cpu;

			/*
			 * Do not expire globally more than once per second,
			 * it is useless.
			 */
			uptime = time_uptime;
			globexp = ipfw_gd.ipfw_state_globexp;
			if (globexp == uptime ||
			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
			    globexp, uptime)) {
				ctx->ipfw_sts_overflow++;
				return (NULL);
			}

			/* Expire states on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_state_expire_ipifunc, NULL);
			}
			ctx->ipfw_sts_overflow++;
			return (NULL);
		}
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE: /* bidir rule */
	case O_REDIRECT:
		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
		    tcp);
		if (s == NULL)
			return (NULL);
		break;

	case O_LIMIT: /* limit number of sessions */
		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
		if (t == NULL)
			return (NULL);

		/* At the limit; try expiring this track's states first. */
		if (*t->t_count >= cmd->conn_limit) {
			if (!ipfw_track_state_expire(ctx, t, TRUE))
				return (NULL);
		}
		/* Atomically claim a slot under the connection limit. */
		for (;;) {
			count = *t->t_count;
			if (count >= cmd->conn_limit)
				return (NULL);
			if (atomic_cmpset_int(t->t_count, count, count + 1))
				break;
		}

		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
		if (s == NULL) {
			/* Undo damage. */
			atomic_subtract_int(t->t_count, 1);
			return (NULL);
		}
		break;

	default:
		panic("unknown state type %u\n", cmd->o.opcode);
	}

	/* Record the translation parameters for redirect states. */
	if (s->st_type == O_REDIRECT) {
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;

		x->xlat_addr = r->addr.s_addr;
		x->xlat_port = r->port;
		x->xlat_ifp = args->m->m_pkthdr.rcvif;
		x->xlat_dir = MATCH_FORWARD;
		KKASSERT(x->xlat_ifp != NULL);
	}
	return (s);
}
2845
2846 static int
2847 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2848     const struct in_addr *in)
2849 {
2850         struct radix_node_head *rnh;
2851         struct sockaddr_in sin;
2852         struct ipfw_tblent *te;
2853
2854         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2855         rnh = ctx->ipfw_tables[tableid];
2856         if (rnh == NULL)
2857                 return (0); /* no match */
2858
2859         memset(&sin, 0, sizeof(sin));
2860         sin.sin_family = AF_INET;
2861         sin.sin_len = sizeof(sin);
2862         sin.sin_addr = *in;
2863
2864         te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2865         if (te == NULL)
2866                 return (0); /* no match */
2867
2868         te->te_use++;
2869         te->te_lastuse = time_second;
2870         return (1); /* match */
2871 }
2872
/*
 * Transmit a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because of a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN determines
 * the direction (forward if set, reverse if clear).
 *
 * Only {src,dst}_{ip,port} of "id" are used.
 */
static void
send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
{
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *tcp;
	struct route sro;	/* fake route */

	MGETHDR(m, M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
	m->m_data += max_linkhdr;

	ip = mtod(m, struct ip *);
	bzero(ip, m->m_len);
	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
	ip->ip_p = IPPROTO_TCP;
	tcp->th_off = 5;

	/*
	 * Assume we are sending a RST (or a keepalive in the reverse
	 * direction), swap src and destination addresses and ports.
	 */
	ip->ip_src.s_addr = htonl(id->dst_ip);
	ip->ip_dst.s_addr = htonl(id->src_ip);
	tcp->th_sport = htons(id->dst_port);
	tcp->th_dport = htons(id->src_port);
	if (flags & TH_RST) {	/* we are sending a RST */
		if (flags & TH_ACK) {
			/* RST in response to an ACK-bearing segment. */
			tcp->th_seq = htonl(ack);
			tcp->th_ack = htonl(0);
			tcp->th_flags = TH_RST;
		} else {
			/* A SYN consumes one sequence number. */
			if (flags & TH_SYN)
				seq++;
			tcp->th_seq = htonl(0);
			tcp->th_ack = htonl(seq);
			tcp->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * We are sending a keepalive. flags & TH_SYN determines
		 * the direction, forward if set, reverse if clear.
		 * NOTE: seq and ack are always assumed to be correct
		 * as set by the caller. This may be confusing...
		 */
		if (flags & TH_SYN) {
			/*
			 * we have to rewrite the correct addresses!
			 */
			ip->ip_dst.s_addr = htonl(id->dst_ip);
			ip->ip_src.s_addr = htonl(id->src_ip);
			tcp->th_dport = htons(id->dst_port);
			tcp->th_sport = htons(id->src_port);
		}
		tcp->th_seq = htonl(seq);
		tcp->th_ack = htonl(ack);
		tcp->th_flags = TH_ACK;
	}

	/*
	 * set ip_len to the payload size so we can compute
	 * the tcp checksum on the pseudoheader
	 * XXX check this, could save a couple of words ?
	 */
	ip->ip_len = htons(sizeof(struct tcphdr));
	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);

	/*
	 * now fill fields left out earlier
	 */
	ip->ip_ttl = ip_defttl;
	ip->ip_len = m->m_pkthdr.len;

	bzero(&sro, sizeof(sro));
	ip_rtaddr(ip->ip_dst, &sro);

	/* Mark as generated by ipfw so it is not filtered again. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
	ip_output(m, NULL, &sro, 0, NULL, NULL);
	if (sro.ro_rt)
		RTFREE(sro.ro_rt);
}
2965
/*
 * Send a reject message, consuming the mbuf passed as an argument.
 * 'code' selects an ICMP unreachable code, or ICMP_REJECT_RST for a
 * TCP RST.  In all cases args->m is consumed and cleared.
 */
static void
send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
{
	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
		/* We need the IP header in host order for icmp_error(). */
		if (args->eh != NULL) {
			struct ip *ip = mtod(args->m, struct ip *);

			ip->ip_len = ntohs(ip->ip_len);
			ip->ip_off = ntohs(ip->ip_off);
		}
		/* icmp_error() consumes the mbuf. */
		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
		struct tcphdr *const tcp =
		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));

		/* Never answer a RST with another RST. */
		if ((tcp->th_flags & TH_RST) == 0) {
			send_pkt(&args->f_id, ntohl(tcp->th_seq),
				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
		}
		m_freem(args->m);
	} else {
		/* Fragment or non-TCP packet: just drop it. */
		m_freem(args->m);
	}
	args->m = NULL;
}
2995
2996 /*
2997  * Given an ip_fw *, lookup_next_rule will return a pointer
2998  * to the next rule, which can be either the jump
2999  * target (for skipto instructions) or the next one in the list (in
3000  * all other cases including a missing jump target).
3001  * The result is also written in the "next_rule" field of the rule.
3002  * Backward jumps are not allowed, so start looking from the next
3003  * rule...
3004  *
3005  * This never returns NULL -- in case we do not have an exact match,
3006  * the next rule is returned. When the ruleset is changed,
3007  * pointers are flushed so we are always correct.
3008  */
3009 static struct ip_fw *
3010 lookup_next_rule(struct ip_fw *me)
3011 {
3012         struct ip_fw *rule = NULL;
3013         ipfw_insn *cmd;
3014
3015         /* look for action, in case it is a skipto */
3016         cmd = ACTION_PTR(me);
3017         if (cmd->opcode == O_LOG)
3018                 cmd += F_LEN(cmd);
3019         if (cmd->opcode == O_SKIPTO) {
3020                 for (rule = me->next; rule; rule = rule->next) {
3021                         if (rule->rulenum >= cmd->arg1)
3022                                 break;
3023                 }
3024         }
3025         if (rule == NULL)                       /* failure or not a skipto */
3026                 rule = me->next;
3027         me->next_rule = rule;
3028         return rule;
3029 }
3030
3031 static int
3032 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3033                 enum ipfw_opcodes opcode, uid_t uid)
3034 {
3035         struct in_addr src_ip, dst_ip;
3036         struct inpcbinfo *pi;
3037         boolean_t wildcard;
3038         struct inpcb *pcb;
3039
3040         if (fid->proto == IPPROTO_TCP) {
3041                 wildcard = FALSE;
3042                 pi = &tcbinfo[mycpuid];
3043         } else if (fid->proto == IPPROTO_UDP) {
3044                 wildcard = TRUE;
3045                 pi = &udbinfo[mycpuid];
3046         } else {
3047                 return 0;
3048         }
3049
3050         /*
3051          * Values in 'fid' are in host byte order
3052          */
3053         dst_ip.s_addr = htonl(fid->dst_ip);
3054         src_ip.s_addr = htonl(fid->src_ip);
3055         if (oif) {
3056                 pcb = in_pcblookup_hash(pi,
3057                         dst_ip, htons(fid->dst_port),
3058                         src_ip, htons(fid->src_port),
3059                         wildcard, oif);
3060         } else {
3061                 pcb = in_pcblookup_hash(pi,
3062                         src_ip, htons(fid->src_port),
3063                         dst_ip, htons(fid->dst_port),
3064                         wildcard, NULL);
3065         }
3066         if (pcb == NULL || pcb->inp_socket == NULL)
3067                 return 0;
3068
3069         if (opcode == O_UID) {
3070 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3071                 return !socheckuid(pcb->inp_socket, uid);
3072 #undef socheckuid
3073         } else  {
3074                 return groupmember(uid, pcb->inp_socket->so_cred);
3075         }
3076 }
3077
/*
 * Match '*ip' against the address (or subnet, with IPFW_IFIP_NET) of
 * the interface named in 'cmd'.  The interface's first AF_INET address
 * and mask are looked up lazily on first use and cached in the
 * instruction itself (flagged by IPFW_IFIP_VALID).
 *
 * Returns non-zero on a match.
 */
static int
ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
{

	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
		struct ifaddr_container *ifac;
		struct ifnet *ifp;

		ifp = ifunit_netisr(cmd->ifname);
		if (ifp == NULL)
			return (0);

		/* Cache the interface's first AF_INET address. */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;

			cmd->mask.s_addr = INADDR_ANY;
			if (cmd->o.arg1 & IPFW_IFIP_NET) {
				/* Subnet match: use the interface netmask. */
				cmd->mask = ((struct sockaddr_in *)
				    ia->ifa_netmask)->sin_addr;
			}
			/* Host match (or empty netmask): full 32 bits. */
			if (cmd->mask.s_addr == INADDR_ANY)
				cmd->mask.s_addr = INADDR_BROADCAST;

			cmd->addr =
			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
			cmd->addr.s_addr &= cmd->mask.s_addr;

			cmd->o.arg1 |= IPFW_IFIP_VALID;
			break;
		}
		/* No usable AF_INET address found on the interface. */
		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
			return (0);
	}
	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
}
3118
/*
 * Apply state 'x's address/port translation to the packet in 'm',
 * rewriting either the source (IPFW_STATE_F_XLATSRC) or destination
 * side, and incrementally fixing up the IP and TCP/UDP checksums.
 * If the hardware will checksum the payload (CSUM_UDP/CSUM_TCP/
 * CSUM_TSO), only the pseudo-header checksum is (re)computed instead.
 *
 * The pre-translation address/port are returned through 'old_addr'
 * and 'old_port' when non-NULL (old_port is 0 if no port translation
 * is configured).
 */
static void
ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
    struct in_addr *old_addr, uint16_t *old_port)
{
	struct ip *ip = mtod(m, struct ip *);
	struct in_addr *addr;
	uint16_t *port, *csum, dlen = 0;
	uint8_t udp = 0;
	boolean_t pseudo = FALSE;

	/* Pick the address/port/checksum fields to rewrite. */
	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
		addr = &ip->ip_src;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_sport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_sport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
		}
	} else {
		addr = &ip->ip_dst;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_dport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_dport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
		}
	}
	/* Save the pre-translation values for the caller. */
	if (old_addr != NULL)
		*old_addr = *addr;
	if (old_port != NULL) {
		if (x->xlat_port != 0)
			*old_port = *port;
		else
			*old_port = 0;
	}

	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
		/* Hardware checksums: only fix the pseudo-header below. */
		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
			dlen = ip->ip_len - (ip->ip_hl << 2);
		pseudo = TRUE;
	}

	if (!pseudo) {
		const uint16_t *oaddr, *naddr;

		oaddr = (const uint16_t *)&addr->s_addr;
		naddr = (const uint16_t *)&x->xlat_addr;

		/* Incrementally patch both 16-bit halves of the address. */
		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
	}
	addr->s_addr = x->xlat_addr;

	if (x->xlat_port != 0) {
		if (!pseudo) {
			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
			    udp);
		}
		*port = x->xlat_port;
	}

	if (pseudo) {
		/* Recompute the pseudo-header checksum from scratch. */
		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(dlen + ip->ip_p));
	}
}
3201
/*
 * Netmsg handler: re-inject a translated packet into ip_input() or
 * ip_output() on the cpu owning the xlat's sibling rule, with the
 * "continue" rule/xlat recorded in the per-cpu context so filtering
 * resumes at the right rule.  Drops the in-flight references on the
 * xlat state and its rule when done.
 */
static void
ipfw_ip_xlate_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ipfw_xlat *x = nm->arg1;
	struct ip_fw *rule = x->xlat_rule;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	KASSERT(ctx->ipfw_cont_xlat == NULL,
	    ("pending ipfw continue xlat"));
	ctx->ipfw_cont_rule = rule;
	ctx->ipfw_cont_xlat = x;

	/* nm->arg2 selects the direction: 0 = input, else output. */
	if (nm->arg2 == 0)
		ip_input(m);
	else
		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);

	/* May not be cleared, if ipfw was unload/disabled. */
	ctx->ipfw_cont_rule = NULL;
	ctx->ipfw_cont_xlat = NULL;

	/*
	 * This state is no longer used; decrement its xlat_crefs,
	 * so this state can be deleted.
	 */
	x->xlat_crefs--;
	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 *
	 * NOTE:
	 * Decrement cross_refs in the last step of this function,
	 * so that the module could be unloaded safely.
	 */
	rule->cross_refs--;
}
3248
/*
 * Redispatch the mbuf 'm' to 'cpuid' -- the cpu owning the paired xlat
 * state (x->xlat_pair) -- so that firewall processing continues there
 * in ipfw_ip_xlate_dispatch().
 *
 * 'flags' is a mask of IPFW_XLATE_* bits:
 *   IPFW_XLATE_OUTPUT  - packet is on the output path (ip_output),
 *                        otherwise the input path (ip_input) is used.
 *   IPFW_XLATE_INSERT  - the slave xlat state must be installed on
 *                        the target cpu.
 *   IPFW_XLATE_FORWARD - the packet matched in the forward direction.
 */
static void
ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
    uint32_t flags)
{
        struct netmsg_genpkt *nm;

        KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
            x->xlat_pcpu, cpuid));

        /*
         * Bump cross_refs to prevent this rule and its siblings
         * from being deleted, while this mbuf is inflight.  The
         * cross_refs of the sibling rule on the target cpu will
         * be decremented, once this mbuf is going to be filtered
         * on the target cpu.
         */
        x->xlat_rule->cross_refs++;
        /*
         * Bump xlat_crefs to prevent this state and its paired
         * state from being deleted, while this mbuf is inflight.
         * The xlat_crefs of the paired state on the target cpu
         * will be decremented, once this mbuf is going to be
         * filtered on the target cpu.
         */
        x->xlat_crefs++;

        /* Translate IPFW_XLATE_* flags into per-mbuf fw_flags. */
        m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
        if (flags & IPFW_XLATE_INSERT)
                m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
        if (flags & IPFW_XLATE_FORWARD)
                m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;

        if ((flags & IPFW_XLATE_OUTPUT) == 0) {
                struct ip *ip = mtod(m, struct ip *);

                /*
                 * NOTE:
                 * ip_input() expects ip_len/ip_off are in network
                 * byte order.
                 */
                ip->ip_len = htons(ip->ip_len);
                ip->ip_off = htons(ip->ip_off);
        }

        /*
         * Use the mbuf-embedded netmsg to carry the packet and the
         * paired xlat state to the target cpu's netisr.
         */
        nm = &m->m_hdr.mh_genmsg;
        netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
            ipfw_ip_xlate_dispatch);
        nm->m = m;
        nm->arg1 = x->xlat_pair;
        nm->arg2 = 0;
        if (flags & IPFW_XLATE_OUTPUT)
                nm->arg2 = 1;
        netisr_sendmsg(&nm->base, cpuid);
}
3303
/*
 * Extract the fields used for rule matching from the packet into
 * 'local' and args->f_id: protocol, source/destination addresses,
 * fragment offset, total length, and -- for TCP/UDP/ICMP when this is
 * the first fragment -- the L4 ports/flags, pulling up the mbuf as
 * needed to reach the L4 header.
 *
 * For non-IP packets (hlen == 0) f_id.proto is set to 0 to mark the
 * flow id invalid.  For layer-2 packets (args->eh != NULL) ip_off and
 * ip_len are still in network byte order on the wire and are converted
 * here; layer-3 packets arrive with them already in host byte order.
 *
 * Returns the (possibly replaced) mbuf, or NULL if m_pullup() failed;
 * *ip0 is always updated to point into the returned mbuf (or NULL).
 */
static struct mbuf *
ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
    struct ip_fw_local *local, struct ip **ip0)
{
        struct ip *ip = mtod(m, struct ip *);
        struct tcphdr *tcp;
        struct udphdr *udp;

        /*
         * Collect parameters into local variables for faster matching.
         */
        if (hlen == 0) {        /* do not grab addresses for non-ip pkts */
                local->proto = args->f_id.proto = 0;    /* mark f_id invalid */
                goto done;
        }

        local->proto = args->f_id.proto = ip->ip_p;
        local->src_ip = ip->ip_src;
        local->dst_ip = ip->ip_dst;
        if (args->eh != NULL) { /* layer 2 packets are as on the wire */
                local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
                local->ip_len = ntohs(ip->ip_len);
        } else {
                local->offset = ip->ip_off & IP_OFFMASK;
                local->ip_len = ip->ip_len;
        }

/*
 * Make sure the first 'len' bytes are contiguous; may replace 'm' and
 * 'ip', and jumps to 'done' (with ip == NULL) if the pullup fails.
 */
#define PULLUP_TO(len)                                  \
do {                                                    \
        if (m->m_len < (len)) {                         \
                args->m = m = m_pullup(m, (len));       \
                if (m == NULL) {                        \
                        ip = NULL;                      \
                        goto done;                      \
                }                                       \
                ip = mtod(m, struct ip *);              \
        }                                               \
} while (0)

        /* L4 info is only present in the first fragment (offset 0). */
        if (local->offset == 0) {
                switch (local->proto) {
                case IPPROTO_TCP:
                        PULLUP_TO(hlen + sizeof(struct tcphdr));
                        local->tcp = tcp = L3HDR(struct tcphdr, ip);
                        local->dst_port = tcp->th_dport;
                        local->src_port = tcp->th_sport;
                        args->f_id.flags = tcp->th_flags;
                        break;

                case IPPROTO_UDP:
                        PULLUP_TO(hlen + sizeof(struct udphdr));
                        udp = L3HDR(struct udphdr, ip);
                        local->dst_port = udp->uh_dport;
                        local->src_port = udp->uh_sport;
                        break;

                case IPPROTO_ICMP:
                        PULLUP_TO(hlen + 4);    /* type, code and checksum. */
                        args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
                        break;

                default:
                        break;
                }
        }

#undef PULLUP_TO

        /* Flow id and matching fields are kept in host byte order. */
        args->f_id.src_ip = ntohl(local->src_ip.s_addr);
        args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
        args->f_id.src_port = local->src_port = ntohs(local->src_port);
        args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
done:
        *ip0 = ip;
        return (m);
}
3380
3381 static struct mbuf *
3382 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3383     struct ip_fw_local *local, struct ip **ip0)
3384 {
3385         struct ip *ip = mtod(m, struct ip *);
3386
3387         ip->ip_len = htons(ip->ip_len);
3388         ip->ip_off = htons(ip->ip_off);
3389
3390         m->m_flags &= ~M_HASH;
3391         ip_hashfn(&m, 0);
3392         args->m = m;
3393         if (m == NULL) {
3394                 *ip0 = NULL;
3395                 return (NULL);
3396         }
3397         KASSERT(m->m_flags & M_HASH, ("no hash"));
3398
3399         /* 'm' might be changed by ip_hashfn(). */
3400         ip = mtod(m, struct ip *);
3401         ip->ip_len = ntohs(ip->ip_len);
3402         ip->ip_off = ntohs(ip->ip_off);
3403
3404         return (ipfw_setup_local(m, hlen, args, local, ip0));
3405 }
3406
3407 /*
3408  * The main check routine for the firewall.
3409  *
3410  * All arguments are in args so we can modify them and return them
3411  * back to the caller.
3412  *
3413  * Parameters:
3414  *
3415  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
3416  *              Starts with the IP header.
3417  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
3418  *      args->oif       Outgoing interface, or NULL if packet is incoming.
3419  *              The incoming interface is in the mbuf. (in)
3420  *
3421  *      args->rule      Pointer to the last matching rule (in/out)
3422  *      args->f_id      Addresses grabbed from the packet (out)
3423  *
3424  * Return value:
3425  *
3426  *      If the packet was denied/rejected and has been dropped, *m is equal
3427  *      to NULL upon return.
3428  *
3429  *      IP_FW_DENY      the packet must be dropped.
3430  *      IP_FW_PASS      The packet is to be accepted and routed normally.
3431  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
3432  *      IP_FW_TEE       Tee the packet to port (args->cookie)
3433  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
3434  *      IP_FW_CONTINUE  Continue processing on another cpu.
3435  */
3436 static int
3437 ipfw_chk(struct ip_fw_args *args)
3438 {
3439         /*
3440          * Local variables hold state during the processing of a packet.
3441          *
3442          * IMPORTANT NOTE: to speed up the processing of rules, there
3443          * are some assumption on the values of the variables, which
3444          * are documented here. Should you change them, please check
3445          * the implementation of the various instructions to make sure
3446          * that they still work.
3447          *
3448          * args->eh     The MAC header. It is non-null for a layer2
3449          *      packet, it is NULL for a layer-3 packet.
3450          *
3451          * m | args->m  Pointer to the mbuf, as received from the caller.
3452          *      It may change if ipfw_chk() does an m_pullup, or if it
3453          *      consumes the packet because it calls send_reject().
3454          *      XXX This has to change, so that ipfw_chk() never modifies
3455          *      or consumes the buffer.
3456          * ip   is simply an alias of the value of m, and it is kept
3457          *      in sync with it (the packet is  supposed to start with
3458          *      the ip header).
3459          */
3460         struct mbuf *m = args->m;
3461         struct ip *ip = mtod(m, struct ip *);
3462
3463         /*
3464          * oif | args->oif      If NULL, ipfw_chk has been called on the
3465          *      inbound path (ether_input, ip_input).
3466          *      If non-NULL, ipfw_chk has been called on the outbound path
3467          *      (ether_output, ip_output).
3468          */
3469         struct ifnet *oif = args->oif;
3470
3471         struct ip_fw *f = NULL;         /* matching rule */
3472         int retval = IP_FW_PASS;
3473         struct m_tag *mtag;
3474         struct divert_info *divinfo;
3475         struct ipfw_state *s;
3476
3477         /*
3478          * hlen The length of the IPv4 header.
3479          *      hlen >0 means we have an IPv4 packet.
3480          */
3481         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
3482
3483         struct ip_fw_local lc;
3484
3485         /*
3486          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3487          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
3488          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3489          */
3490         int dyn_dir = MATCH_UNKNOWN;
3491         struct ip_fw *dyn_f = NULL;
3492         int cpuid = mycpuid;
3493         struct ipfw_context *ctx;
3494
3495         ASSERT_NETISR_NCPUS(cpuid);
3496         ctx = ipfw_ctx[cpuid];
3497
3498         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3499                 return IP_FW_PASS;      /* accept */
3500
3501         if (args->eh == NULL ||         /* layer 3 packet */
3502             (m->m_pkthdr.len >= sizeof(struct ip) &&
3503              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3504                 hlen = ip->ip_hl << 2;
3505
3506         memset(&lc, 0, sizeof(lc));
3507
3508         m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3509         if (m == NULL)
3510                 goto pullup_failed;
3511
3512         if (args->rule) {
3513                 /*
3514                  * Packet has already been tagged. Look for the next rule
3515                  * to restart processing.
3516                  *
3517                  * If fw_one_pass != 0 then just accept it.
3518                  * XXX should not happen here, but optimized out in
3519                  * the caller.
3520                  */
3521                 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3522                         return IP_FW_PASS;
3523                 args->flags &= ~IP_FWARG_F_CONT;
3524
3525                 /* This rule is being/has been flushed */
3526                 if (ipfw_flushing)
3527                         return IP_FW_DENY;
3528
3529                 KASSERT(args->rule->cpuid == cpuid,
3530                         ("rule used on cpu%d", cpuid));
3531
3532                 /* This rule was deleted */
3533                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3534                         return IP_FW_DENY;
3535
3536                 if (args->xlat != NULL) {
3537                         struct ipfw_xlat *x = args->xlat;
3538
3539                         /* This xlat is being deleted. */
3540                         if (x->xlat_invalid)
3541                                 return IP_FW_DENY;
3542
3543                         f = args->rule;
3544
3545                         dyn_f = f;
3546                         dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3547                             MATCH_FORWARD : MATCH_REVERSE;
3548
3549                         if (args->flags & IP_FWARG_F_XLATINS) {
3550                                 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3551                                     ("not slave %u state", x->xlat_type));
3552                                 s = ipfw_state_link(ctx, &x->xlat_st);
3553                                 if (s != NULL) {
3554                                         ctx->ipfw_xlate_conflicts++;
3555                                         if (IPFW_STATE_ISDEAD(s)) {
3556                                                 ipfw_state_remove(ctx, s);
3557                                                 s = ipfw_state_link(ctx,
3558                                                     &x->xlat_st);
3559                                         }
3560                                         if (s != NULL) {
3561                                                 if (bootverbose) {
3562                                                         kprintf("ipfw: "
3563                                                         "slave %u state "
3564                                                         "conflicts %u state\n",
3565                                                         x->xlat_type,
3566                                                         s->st_type);
3567                                                 }
3568                                                 ipfw_xlat_invalidate(x);
3569                                                 return IP_FW_DENY;
3570                                         }
3571                                         ctx->ipfw_xlate_cresolved++;
3572                                 }
3573                         } else {
3574                                 ipfw_state_update(&args->f_id, dyn_dir,
3575                                     lc.tcp, &x->xlat_st);
3576                         }
3577                 } else {
3578                         /* TODO: setup dyn_f, dyn_dir */
3579
3580                         f = args->rule->next_rule;
3581                         if (f == NULL)
3582                                 f = lookup_next_rule(args->rule);
3583                 }
3584         } else {
3585                 /*
3586                  * Find the starting rule. It can be either the first
3587                  * one, or the one after divert_rule if asked so.
3588                  */
3589                 int skipto;
3590
3591                 KKASSERT((args->flags &
3592                     (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3593                 KKASSERT(args->xlat == NULL);
3594
3595                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3596                 if (mtag != NULL) {
3597                         divinfo = m_tag_data(mtag);
3598                         skipto = divinfo->skipto;
3599                 } else {
3600                         skipto = 0;
3601                 }
3602
3603                 f = ctx->ipfw_layer3_chain;
3604                 if (args->eh == NULL && skipto != 0) {
3605                         /* No skipto during rule flushing */
3606                         if (ipfw_flushing)
3607                                 return IP_FW_DENY;
3608
3609                         if (skipto >= IPFW_DEFAULT_RULE)
3610                                 return IP_FW_DENY; /* invalid */
3611
3612                         while (f && f->rulenum <= skipto)
3613                                 f = f->next;
3614                         if (f == NULL)  /* drop packet */
3615                                 return IP_FW_DENY;
3616                 } else if (ipfw_flushing) {
3617                         /* Rules are being flushed; skip to default rule */
3618                         f = ctx->ipfw_default_rule;
3619                 }
3620         }
3621         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3622                 m_tag_delete(m, mtag);
3623
3624         /*
3625          * Now scan the rules, and parse microinstructions for each rule.
3626          */
3627         for (; f; f = f->next) {
3628                 int l, cmdlen;
3629                 ipfw_insn *cmd;
3630                 int skip_or; /* skip rest of OR block */
3631
3632 again:
3633                 if (ctx->ipfw_set_disable & (1 << f->set)) {
3634                         args->xlat = NULL;
3635                         continue;
3636                 }
3637
3638                 if (args->xlat != NULL) {
3639                         args->xlat = NULL;
3640                         l = f->cmd_len - f->act_ofs;
3641                         cmd = ACTION_PTR(f);
3642                 } else {
3643                         l = f->cmd_len;
3644                         cmd = f->cmd;
3645                 }
3646
3647                 skip_or = 0;
3648                 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3649                         int match;
3650
3651                         /*
3652                          * check_body is a jump target used when we find a
3653                          * CHECK_STATE, and need to jump to the body of
3654                          * the target rule.
3655                          */
3656 check_body:
3657                         cmdlen = F_LEN(cmd);
3658                         /*
3659                          * An OR block (insn_1 || .. || insn_n) has the
3660                          * F_OR bit set in all but the last instruction.
3661                          * The first match will set "skip_or", and cause
3662                          * the following instructions to be skipped until
3663                          * past the one with the F_OR bit clear.
3664                          */
3665                         if (skip_or) {          /* skip this instruction */
3666                                 if ((cmd->len & F_OR) == 0)
3667                                         skip_or = 0;    /* next one is good */
3668                                 continue;
3669                         }
3670                         match = 0; /* set to 1 if we succeed */
3671
3672                         switch (cmd->opcode) {
3673                         /*
3674                          * The first set of opcodes compares the packet's
3675                          * fields with some pattern, setting 'match' if a
3676                          * match is found. At the end of the loop there is
3677                          * logic to deal with F_NOT and F_OR flags associated
3678                          * with the opcode.
3679                          */
3680                         case O_NOP:
3681                                 match = 1;
3682                                 break;
3683
3684                         case O_FORWARD_MAC:
3685                                 kprintf("ipfw: opcode %d unimplemented\n",
3686                                         cmd->opcode);
3687                                 break;
3688
3689                         case O_GID:
3690                         case O_UID:
3691                                 /*
3692                                  * We only check offset == 0 && proto != 0,
3693                                  * as this ensures that we have an IPv4
3694                                  * packet with the ports info.
3695                                  */
3696                                 if (lc.offset!=0)
3697                                         break;
3698
3699                                 match = ipfw_match_uid(&args->f_id, oif,
3700                                         cmd->opcode,
3701                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3702                                 break;
3703
3704                         case O_RECV:
3705                                 match = iface_match(m->m_pkthdr.rcvif,
3706                                     (ipfw_insn_if *)cmd);
3707                                 break;
3708
3709                         case O_XMIT:
3710                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3711                                 break;
3712
3713                         case O_VIA:
3714                                 match = iface_match(oif ? oif :
3715                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3716                                 break;
3717
3718                         case O_MACADDR2:
3719                                 if (args->eh != NULL) { /* have MAC header */
3720                                         uint32_t *want = (uint32_t *)
3721                                                 ((ipfw_insn_mac *)cmd)->addr;
3722                                         uint32_t *mask = (uint32_t *)
3723                                                 ((ipfw_insn_mac *)cmd)->mask;
3724                                         uint32_t *hdr = (uint32_t *)args->eh;
3725
3726                                         match =
3727                                         (want[0] == (hdr[0] & mask[0]) &&
3728                                          want[1] == (hdr[1] & mask[1]) &&
3729                                          want[2] == (hdr[2] & mask[2]));
3730                                 }
3731                                 break;
3732
3733                         case O_MAC_TYPE:
3734                                 if (args->eh != NULL) {
3735                                         uint16_t t =
3736                                             ntohs(args->eh->ether_type);
3737                                         uint16_t *p =
3738                                             ((ipfw_insn_u16 *)cmd)->ports;
3739                                         int i;
3740
3741                                         /* Special vlan handling */
3742                                         if (m->m_flags & M_VLANTAG)
3743                                                 t = ETHERTYPE_VLAN;
3744
3745                                         for (i = cmdlen - 1; !match && i > 0;
3746                                              i--, p += 2) {
3747                                                 match =
3748                                                 (t >= p[0] && t <= p[1]);
3749                                         }
3750                                 }
3751                                 break;
3752
3753                         case O_FRAG:
3754                                 match = (hlen > 0 && lc.offset != 0);
3755                                 break;
3756
3757                         case O_IPFRAG:
3758                                 if (hlen > 0) {
3759                                         uint16_t off;
3760
3761                                         if (args->eh != NULL)
3762                                                 off = ntohs(ip->ip_off);
3763                                         else
3764                                                 off = ip->ip_off;
3765                                         if (off & (IP_MF | IP_OFFMASK))
3766                                                 match = 1;
3767                                 }
3768                                 break;
3769
3770                         case O_IN:      /* "out" is "not in" */
3771                                 match = (oif == NULL);
3772                                 break;
3773
3774                         case O_LAYER2:
3775                                 match = (args->eh != NULL);
3776                                 break;
3777
3778                         case O_PROTO:
3779                                 /*
3780                                  * We do not allow an arg of 0 so the
3781                                  * check of "proto" only suffices.
3782                                  */
3783                                 match = (lc.proto == cmd->arg1);
3784                                 break;
3785
3786                         case O_IP_SRC:
3787                                 match = (hlen > 0 &&
3788                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3789                                     lc.src_ip.s_addr);
3790                                 break;
3791
3792                         case O_IP_SRC_MASK:
3793                                 match = (hlen > 0 &&
3794                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3795                                      (lc.src_ip.s_addr &
3796                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3797                                 break;
3798
3799                         case O_IP_SRC_ME:
3800                                 if (hlen > 0) {
3801                                         struct ifnet *tif;
3802
3803                                         tif = INADDR_TO_IFP(&lc.src_ip);
3804                                         match = (tif != NULL);
3805                                 }
3806                                 break;
3807
3808                         case O_IP_SRC_TABLE:
3809                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3810                                     &lc.src_ip);
3811                                 break;
3812
3813                         case O_IP_SRC_IFIP:
3814                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3815                                     &lc.src_ip);
3816                                 break;
3817
3818                         case O_IP_DST_SET:
3819                         case O_IP_SRC_SET:
3820                                 if (hlen > 0) {
3821                                         uint32_t *d = (uint32_t *)(cmd + 1);
3822                                         uint32_t addr =
3823                                             cmd->opcode == O_IP_DST_SET ?
3824                                                 args->f_id.dst_ip :
3825                                                 args->f_id.src_ip;
3826
3827                                         if (addr < d[0])
3828                                                 break;
3829                                         addr -= d[0]; /* subtract base */
3830                                         match =
3831                                         (addr < cmd->arg1) &&
3832                                          (d[1 + (addr >> 5)] &
3833                                           (1 << (addr & 0x1f)));
3834                                 }
3835                                 break;
3836
3837                         case O_IP_DST:
3838                                 match = (hlen > 0 &&
3839                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3840                                     lc.dst_ip.s_addr);
3841                                 break;
3842
3843                         case O_IP_DST_MASK:
3844                                 match = (hlen > 0) &&
3845                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3846                                      (lc.dst_ip.s_addr &
3847                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3848                                 break;
3849
3850                         case O_IP_DST_ME:
3851                                 if (hlen > 0) {
3852                                         struct ifnet *tif;
3853
3854                                         tif = INADDR_TO_IFP(&lc.dst_ip);
3855                                         match = (tif != NULL);
3856                                 }
3857                                 break;
3858
3859                         case O_IP_DST_TABLE:
3860                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3861                                     &lc.dst_ip);
3862                                 break;
3863
3864                         case O_IP_DST_IFIP:
3865                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3866                                     &lc.dst_ip);
3867                                 break;
3868
3869                         case O_IP_SRCPORT:
3870                         case O_IP_DSTPORT:
3871                                 /*
3872                                  * offset == 0 && proto != 0 is enough
3873                                  * to guarantee that we have an IPv4
3874                                  * packet with port info.
3875                                  */
3876                                 if ((lc.proto==IPPROTO_UDP ||
3877                                      lc.proto==IPPROTO_TCP)
3878                                     && lc.offset == 0) {
3879                                         uint16_t x =
3880                                             (cmd->opcode == O_IP_SRCPORT) ?
3881                                                 lc.src_port : lc.dst_port;
3882                                         uint16_t *p =
3883                                             ((ipfw_insn_u16 *)cmd)->ports;
3884                                         int i;
3885
3886                                         for (i = cmdlen - 1; !match && i > 0;
3887                                              i--, p += 2) {
3888                                                 match =
3889                                                 (x >= p[0] && x <= p[1]);
3890                                         }
3891                                 }
3892                                 break;
3893
3894                         case O_ICMPCODE:
3895                                 match = (lc.offset == 0 &&
3896                                     lc.proto==IPPROTO_ICMP &&
3897                                     icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3898                                 break;
3899
3900                         case O_ICMPTYPE:
3901                                 match = (lc.offset == 0 &&
3902                                     lc.proto==IPPROTO_ICMP &&
3903                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3904                                 break;
3905
3906                         case O_IPOPT:
3907                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3908                                 break;
3909
3910                         case O_IPVER:
3911                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3912                                 break;
3913
3914                         case O_IPTTL:
3915                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3916                                 break;
3917
3918                         case O_IPID:
3919                                 match = (hlen > 0 &&
3920                                     cmd->arg1 == ntohs(ip->ip_id));
3921                                 break;
3922
3923                         case O_IPLEN:
3924                                 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3925                                 break;
3926
3927                         case O_IPPRECEDENCE:
3928                                 match = (hlen > 0 &&
3929                                     (cmd->arg1 == (ip->ip_tos & 0xe0)));
3930                                 break;
3931
3932                         case O_IPTOS:
3933                                 match = (hlen > 0 &&
3934                                     flags_match(cmd, ip->ip_tos));
3935                                 break;
3936
3937                         case O_TCPFLAGS:
3938                                 match = (lc.proto == IPPROTO_TCP &&
3939                                     lc.offset == 0 &&
3940                                     flags_match(cmd,
3941                                         L3HDR(struct tcphdr,ip)->th_flags));
3942                                 break;
3943
3944                         case O_TCPOPTS:
3945                                 match = (lc.proto == IPPROTO_TCP &&
3946                                     lc.offset == 0 && tcpopts_match(ip, cmd));
3947                                 break;
3948
3949                         case O_TCPSEQ:
3950                                 match = (lc.proto == IPPROTO_TCP &&
3951                                     lc.offset == 0 &&
3952                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3953                                         L3HDR(struct tcphdr,ip)->th_seq);
3954                                 break;
3955
3956                         case O_TCPACK:
3957                                 match = (lc.proto == IPPROTO_TCP &&
3958                                     lc.offset == 0 &&
3959                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3960                                         L3HDR(struct tcphdr,ip)->th_ack);
3961                                 break;
3962
3963                         case O_TCPWIN:
3964                                 match = (lc.proto == IPPROTO_TCP &&
3965                                     lc.offset == 0 &&
3966                                     cmd->arg1 ==
3967                                         L3HDR(struct tcphdr,ip)->th_win);
3968                                 break;
3969
3970                         case O_ESTAB:
3971                                 /* reject packets which have SYN only */
3972                                 /* XXX should i also check for TH_ACK ? */
3973                                 match = (lc.proto == IPPROTO_TCP &&
3974                                     lc.offset == 0 &&
3975                                     (L3HDR(struct tcphdr,ip)->th_flags &
3976                                      (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3977                                 break;
3978
3979                         case O_LOG:
3980                                 if (fw_verbose) {
3981                                         ipfw_log(ctx, f, hlen, args->eh, m,
3982                                             oif);
3983                                 }
3984                                 match = 1;
3985                                 break;
3986
3987                         case O_PROB:
3988                                 match = (krandom() <
3989                                         ((ipfw_insn_u32 *)cmd)->d[0]);
3990                                 break;
3991
3992                         /*
3993                          * The second set of opcodes represents 'actions',
3994                          * i.e. the terminal part of a rule once the packet
3995                          * matches all previous patterns.
3996                          * Typically there is only one action for each rule,
3997                          * and the opcode is stored at the end of the rule
3998                          * (but there are exceptions -- see below).
3999                          *
4000                          * In general, here we set retval and terminate the
4001                          * outer loop (would be a 'break 3' in some language,
4002                          * but we need to do a 'goto done').
4003                          *
4004                          * Exceptions:
4005                          * O_COUNT and O_SKIPTO actions:
4006                          *   instead of terminating, we jump to the next rule
4007                          *   ('goto next_rule', equivalent to a 'break 2'),
4008                          *   or to the SKIPTO target ('goto again' after
4009                          *   having set f, cmd and l), respectively.
4010                          *
4011                          * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
4012                          *   are not real 'actions', and are stored right
4013                          *   before the 'action' part of the rule.
4014                          *   These opcodes try to install an entry in the
4015                          *   state tables; if successful, we continue with
4016                          *   the next opcode (match=1; break;), otherwise
4017                          *   the packet must be dropped ('goto done' after
4018                          *   setting retval).  If static rules are changed
4019                          *   during the state installation, the packet will
4020                          *   be dropped and rule's stats will not beupdated
4021                          *   ('return IP_FW_DENY').
4022                          *
4023                          * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4024                          *   cause a lookup of the state table, and a jump
4025                          *   to the 'action' part of the parent rule
4026                          *   ('goto check_body') if an entry is found, or
4027                          *   (CHECK_STATE only) a jump to the next rule if
4028                          *   the entry is not found ('goto next_rule').
4029                          *   The result of the lookup is cached to make
4030                          *   further instances of these opcodes are
4031                          *   effectively NOPs.  If static rules are changed
4032                          *   during the state looking up, the packet will
4033                          *   be dropped and rule's stats will not be updated
4034                          *   ('return IP_FW_DENY').
4035                          */
4036                         case O_REDIRECT:
4037                                 if (f->cross_rules == NULL) {
4038                                         /*
4039                                          * This rule was not completely setup;
4040                                          * move on to the next rule.
4041                                          */
4042                                         goto next_rule;
4043                                 }
4044                                 /*
4045                                  * Apply redirect only on input path and
4046                                  * only to non-fragment TCP segments or
4047                                  * UDP datagrams.
4048                                  *
4049                                  * Does _not_ work with layer2 filtering.
4050                                  */
4051                                 if (oif != NULL || args->eh != NULL ||
4052                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4053                                     (lc.proto != IPPROTO_TCP &&
4054                                      lc.proto != IPPROTO_UDP))
4055                                         break;
4056                                 /* FALL THROUGH */
4057                         case O_LIMIT:
4058                         case O_KEEP_STATE:
4059                                 if (hlen == 0)
4060                                         break;
4061                                 s = ipfw_state_install(ctx, f,
4062                                     (ipfw_insn_limit *)cmd, args, lc.tcp);
4063                                 if (s == NULL) {
4064                                         retval = IP_FW_DENY;
4065                                         goto done; /* error/limit violation */
4066                                 }
4067                                 s->st_pcnt++;
4068                                 s->st_bcnt += lc.ip_len;
4069
4070                                 if (s->st_type == O_REDIRECT) {
4071                                         struct in_addr oaddr;
4072                                         uint16_t oport;
4073                                         struct ipfw_xlat *slave_x, *x;
4074                                         struct ipfw_state *dup;
4075
4076                                         x = (struct ipfw_xlat *)s;
4077                                         ipfw_xlate(x, m, &oaddr, &oport);
4078                                         m = ipfw_rehashm(m, hlen, args, &lc,
4079                                             &ip);
4080                                         if (m == NULL) {
4081                                                 ipfw_state_del(ctx, s);
4082                                                 goto pullup_failed;
4083                                         }
4084
4085                                         cpuid = netisr_hashcpu(
4086                                             m->m_pkthdr.hash);
4087
4088                                         slave_x = (struct ipfw_xlat *)
4089                                             ipfw_state_alloc(ctx, &args->f_id,
4090                                             O_REDIRECT, f->cross_rules[cpuid],
4091                                             lc.tcp);
4092                                         if (slave_x == NULL) {
4093                                                 ipfw_state_del(ctx, s);
4094                                                 retval = IP_FW_DENY;
4095                                                 goto done;
4096                                         }
4097                                         slave_x->xlat_addr = oaddr.s_addr;
4098                                         slave_x->xlat_port = oport;
4099                                         slave_x->xlat_dir = MATCH_REVERSE;
4100                                         slave_x->xlat_flags |=
4101                                             IPFW_STATE_F_XLATSRC |
4102                                             IPFW_STATE_F_XLATSLAVE;
4103
4104                                         slave_x->xlat_pair = x;
4105                                         slave_x->xlat_pcpu = mycpuid;
4106                                         x->xlat_pair = slave_x;
4107                                         x->xlat_pcpu = cpuid;
4108
4109                                         ctx->ipfw_xlated++;
4110                                         if (cpuid != mycpuid) {
4111                                                 ctx->ipfw_xlate_split++;
4112                                                 ipfw_xlate_redispatch(
4113                                                     m, cpuid, x,
4114                                                     IPFW_XLATE_INSERT |
4115                                                     IPFW_XLATE_FORWARD);
4116                                                 args->m = NULL;
4117                                                 return (IP_FW_REDISPATCH);
4118                                         }
4119
4120                                         dup = ipfw_state_link(ctx,
4121                                             &slave_x->xlat_st);
4122                                         if (dup != NULL) {
4123                                                 ctx->ipfw_xlate_conflicts++;
4124                                                 if (IPFW_STATE_ISDEAD(dup)) {
4125                                                         ipfw_state_remove(ctx,
4126                                                             dup);
4127                                                         dup = ipfw_state_link(
4128                                                         ctx, &slave_x->xlat_st);
4129                                                 }
4130                                                 if (dup != NULL) {
4131                                                         if (bootverbose) {
4132                                                             kprintf("ipfw: "
4133                                                             "slave %u state "
4134                                                             "conflicts "
4135                                                             "%u state\n",
4136                                                             x->xlat_type,
4137                                                             s->st_type);
4138                                                         }
4139                                                         ipfw_state_del(ctx, s);
4140                                                         return (IP_FW_DENY);
4141                                                 }
4142                                                 ctx->ipfw_xlate_cresolved++;
4143                                         }
4144                                 }
4145                                 match = 1;
4146                                 break;
4147
4148                         case O_PROBE_STATE:
4149                         case O_CHECK_STATE:
4150                                 /*
4151                                  * States are checked at the first keep-state 
4152                                  * check-state occurrence, with the result
4153                                  * being stored in dyn_dir.  The compiler
4154                                  * introduces a PROBE_STATE instruction for
4155                                  * us when we have a KEEP_STATE/LIMIT/RDR
4156                                  * (because PROBE_STATE needs to be run first).
4157                                  */
4158                                 s = NULL;
4159                                 if (dyn_dir == MATCH_UNKNOWN) {
4160                                         s = ipfw_state_lookup(ctx,
4161                                             &args->f_id, &dyn_dir, lc.tcp);
4162                                 }
4163                                 if (s == NULL ||
4164                                     (s->st_type == O_REDIRECT &&
4165                                      (args->eh != NULL ||
4166                                       (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4167                                       (lc.proto != IPPROTO_TCP &&
4168                                        lc.proto != IPPROTO_UDP)))) {
4169                                         /*
4170                                          * State not found. If CHECK_STATE,
4171                                          * skip to next rule, if PROBE_STATE
4172                                          * just ignore and continue with next
4173                                          * opcode.
4174                                          */
4175                                         if (cmd->opcode == O_CHECK_STATE)
4176                                                 goto next_rule;
4177                                         match = 1;
4178                                         break;
4179                                 }
4180
4181                                 s->st_pcnt++;
4182                                 s->st_bcnt += lc.ip_len;
4183
4184                                 if (s->st_type == O_REDIRECT) {
4185                                         struct ipfw_xlat *x =
4186                                             (struct ipfw_xlat *)s;
4187
4188                                         if (oif != NULL &&
4189                                             x->xlat_ifp == NULL) {
4190                                                 KASSERT(x->xlat_flags &
4191                                                     IPFW_STATE_F_XLATSLAVE,
4192                                                     ("master rdr state "
4193                                                      "missing ifp"));
4194                                                 x->xlat_ifp = oif;
4195                                         } else if (
4196                                             (oif != NULL && x->xlat_ifp!=oif) ||
4197                                             (oif == NULL &&
4198                                              x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4199                                                 retval = IP_FW_DENY;
4200                                                 goto done;
4201                                         }
4202                                         if (x->xlat_dir != dyn_dir)
4203                                                 goto skip_xlate;
4204
4205                                         ipfw_xlate(x, m, NULL, NULL);
4206                                         m = ipfw_rehashm(m, hlen, args, &lc,
4207                                             &ip);
4208                                         if (m == NULL)
4209                                                 goto pullup_failed;
4210
4211                                         cpuid = netisr_hashcpu(
4212                                             m->m_pkthdr.hash);
4213                                         if (cpuid != mycpuid) {
4214                                                 uint32_t xlate = 0;
4215
4216                                                 if (oif != NULL) {
4217                                                         xlate |=
4218                                                             IPFW_XLATE_OUTPUT;
4219                                                 }
4220                                                 if (dyn_dir == MATCH_FORWARD) {
4221                                                         xlate |=
4222                                                             IPFW_XLATE_FORWARD;
4223                                                 }
4224                                                 ipfw_xlate_redispatch(m, cpuid,
4225                                                     x, xlate);
4226                                                 args->m = NULL;
4227                                                 return (IP_FW_REDISPATCH);
4228                                         }
4229
4230                                         KKASSERT(x->xlat_pcpu == mycpuid);
4231                                         ipfw_state_update(&args->f_id, dyn_dir,
4232                                             lc.tcp, &x->xlat_pair->xlat_st);
4233                                 }
4234 skip_xlate:
4235                                 /*
4236                                  * Found a rule from a state; jump to the
4237                                  * 'action' part of the rule.
4238                                  */
4239                                 f = s->st_rule;
4240                                 KKASSERT(f->cpuid == mycpuid);
4241
4242                                 cmd = ACTION_PTR(f);
4243                                 l = f->cmd_len - f->act_ofs;
4244                                 dyn_f = f;
4245                                 goto check_body;
4246
4247                         case O_ACCEPT:
4248                                 retval = IP_FW_PASS;    /* accept */
4249                                 goto done;
4250
4251                         case O_DEFRAG:
4252                                 if (f->cross_rules == NULL) {
4253                                         /*
4254                                          * This rule was not completely setup;
4255                                          * move on to the next rule.
4256                                          */
4257                                         goto next_rule;
4258                                 }
4259
4260                                 /*
4261                                  * Don't defrag for l2 packets, output packets
4262                                  * or non-fragments.
4263                                  */
4264                                 if (oif != NULL || args->eh != NULL ||
4265                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4266                                         goto next_rule;
4267
4268                                 ctx->ipfw_frags++;
4269                                 m = ip_reass(m);
4270                                 args->m = m;
4271                                 if (m == NULL) {
4272                                         retval = IP_FW_PASS;
4273                                         goto done;
4274                                 }
4275                                 ctx->ipfw_defraged++;
4276                                 KASSERT((m->m_flags & M_HASH) == 0,
4277                                     ("hash not cleared"));
4278
4279                                 /* Update statistics */
4280                                 f->pcnt++;
4281                                 f->bcnt += lc.ip_len;
4282                                 f->timestamp = time_second;
4283
4284                                 ip = mtod(m, struct ip *);
4285                                 hlen = ip->ip_hl << 2;
4286                                 ip->ip_len += hlen;
4287
4288                                 ip->ip_len = htons(ip->ip_len);
4289                                 ip->ip_off = htons(ip->ip_off);
4290
4291                                 ip_hashfn(&m, 0);
4292                                 args->m = m;
4293                                 if (m == NULL)
4294                                         goto pullup_failed;
4295
4296                                 KASSERT(m->m_flags & M_HASH, ("no hash"));
4297                                 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4298                                 if (cpuid != mycpuid) {
4299                                         /*
4300                                          * NOTE:
4301                                          * ip_len/ip_off are in network byte
4302                                          * order.
4303                                          */
4304                                         ctx->ipfw_defrag_remote++;
4305                                         ipfw_defrag_redispatch(m, cpuid, f);
4306                                         args->m = NULL;
4307                                         return (IP_FW_REDISPATCH);
4308                                 }
4309
4310                                 /* 'm' might be changed by ip_hashfn(). */
4311                                 ip = mtod(m, struct ip *);
4312                                 ip->ip_len = ntohs(ip->ip_len);
4313                                 ip->ip_off = ntohs(ip->ip_off);
4314
4315                                 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4316                                 if (m == NULL)
4317                                         goto pullup_failed;
4318
4319                                 /* Move on. */
4320                                 goto next_rule;
4321
4322                         case O_PIPE:
4323                         case O_QUEUE:
4324                                 args->rule = f; /* report matching rule */
4325                                 args->cookie = cmd->arg1;
4326                                 retval = IP_FW_DUMMYNET;
4327                                 goto done;
4328
4329                         case O_DIVERT:
4330                         case O_TEE:
4331                                 if (args->eh) /* not on layer 2 */
4332                                         break;
4333
4334                                 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4335                                     sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4336                                 if (mtag == NULL) {
4337                                         retval = IP_FW_DENY;
4338                                         goto done;
4339                                 }
4340                                 divinfo = m_tag_data(mtag);
4341
4342                                 divinfo->skipto = f->rulenum;
4343                                 divinfo->port = cmd->arg1;
4344                                 divinfo->tee = (cmd->opcode == O_TEE);
4345                                 m_tag_prepend(m, mtag);
4346
4347                                 args->cookie = cmd->arg1;
4348                                 retval = (cmd->opcode == O_DIVERT) ?
4349                                          IP_FW_DIVERT : IP_FW_TEE;
4350                                 goto done;
4351
4352                         case O_COUNT:
4353                         case O_SKIPTO:
4354                                 f->pcnt++;      /* update stats */
4355                                 f->bcnt += lc.ip_len;
4356                                 f->timestamp = time_second;
4357                                 if (cmd->opcode == O_COUNT)
4358                                         goto next_rule;
4359                                 /* handle skipto */
4360                                 if (f->next_rule == NULL)
4361                                         lookup_next_rule(f);
4362                                 f = f->next_rule;
4363                                 goto again;
4364
4365                         case O_REJECT:
4366                                 /*
4367                                  * Drop the packet and send a reject notice
4368                                  * if the packet is not ICMP (or is an ICMP
4369                                  * query), and it is not multicast/broadcast.
4370                                  */
4371                                 if (hlen > 0 &&
4372                                     (lc.proto != IPPROTO_ICMP ||
4373                                      is_icmp_query(ip)) &&
4374                                     !(m->m_flags & (M_BCAST|M_MCAST)) &&
4375                                     !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4376                                         send_reject(args, cmd->arg1,
4377                                             lc.offset, lc.ip_len);
4378                                         retval = IP_FW_DENY;
4379                                         goto done;
4380                                 }
4381                                 /* FALLTHROUGH */
4382                         case O_DENY:
4383                                 retval = IP_FW_DENY;
4384                                 goto done;
4385
4386                         case O_FORWARD_IP:
4387                                 if (args->eh)   /* not valid on layer2 pkts */
4388                                         break;
4389                                 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4390                                         struct sockaddr_in *sin;
4391
4392                                         mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4393                                             sizeof(*sin), M_INTWAIT | M_NULLOK);
4394                                         if (mtag == NULL) {
4395                                                 retval = IP_FW_DENY;
4396                                                 goto done;
4397                                         }
4398                                         sin = m_tag_data(mtag);
4399
4400                                         /* Structure copy */
4401                                         *sin = ((ipfw_insn_sa *)cmd)->sa;
4402
4403                                         m_tag_prepend(m, mtag);
4404                                         m->m_pkthdr.fw_flags |=
4405                                                 IPFORWARD_MBUF_TAGGED;
4406                                         m->m_pkthdr.fw_flags &=
4407                                                 ~BRIDGE_MBUF_TAGGED;
4408                                 }
4409                                 retval = IP_FW_PASS;
4410                                 goto done;
4411
4412                         default:
4413                                 panic("-- unknown opcode %d", cmd->opcode);
4414                         } /* end of switch() on opcodes */
4415
4416                         if (cmd->len & F_NOT)
4417                                 match = !match;
4418
4419                         if (match) {
4420                                 if (cmd->len & F_OR)
4421                                         skip_or = 1;
4422                         } else {
4423                                 if (!(cmd->len & F_OR)) /* not an OR block, */
4424                                         break;          /* try next rule    */
4425                         }
4426
4427                 }       /* end of inner for, scan opcodes */
4428
4429 next_rule:;             /* try next rule                */
4430
4431         }               /* end of outer for, scan rules */
4432         kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4433         return IP_FW_DENY;
4434
4435 done:
4436         /* Update statistics */
4437         f->pcnt++;
4438         f->bcnt += lc.ip_len;
4439         f->timestamp = time_second;
4440         return retval;
4441
4442 pullup_failed:
4443         if (fw_verbose)
4444                 kprintf("pullup failed\n");
4445         return IP_FW_DENY;
4446 }
4447
4448 static struct mbuf *
4449 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4450 {
4451         struct m_tag *mtag;
4452         struct dn_pkt *pkt;
4453         ipfw_insn *cmd;
4454         const struct ipfw_flow_id *id;
4455         struct dn_flow_id *fid;
4456
4457         M_ASSERTPKTHDR(m);
4458
4459         mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4460             M_INTWAIT | M_NULLOK);
4461         if (mtag == NULL) {
4462                 m_freem(m);
4463                 return (NULL);
4464         }
4465         m_tag_prepend(m, mtag);
4466
4467         pkt = m_tag_data(mtag);
4468         bzero(pkt, sizeof(*pkt));
4469
4470         cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4471         if (cmd->opcode == O_LOG)
4472                 cmd += F_LEN(cmd);
4473         KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4474                 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4475
4476         pkt->dn_m = m;
4477         pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4478         pkt->ifp = fwa->oif;
4479         pkt->pipe_nr = pipe_nr;
4480
4481         pkt->cpuid = mycpuid;
4482         pkt->msgport = netisr_curport();
4483
4484         id = &fwa->f_id;
4485         fid = &pkt->id;
4486         fid->fid_dst_ip = id->dst_ip;
4487         fid->fid_src_ip = id->src_ip;
4488         fid->fid_dst_port = id->dst_port;
4489         fid->fid_src_port = id->src_port;
4490         fid->fid_proto = id->proto;
4491         fid->fid_flags = id->flags;
4492
4493         ipfw_ref_rule(fwa->rule);
4494         pkt->dn_priv = fwa->rule;
4495         pkt->dn_unref_priv = ipfw_unref_rule;
4496
4497         if (cmd->opcode == O_PIPE)
4498                 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4499
4500         m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4501         return (m);
4502 }
4503
4504 /*
4505  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4506  * These will be reconstructed on the fly as packets are matched.
4507  */
4508 static void
4509 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4510 {
4511         struct ip_fw *rule;
4512
4513         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4514                 rule->next_rule = NULL;
4515 }
4516
4517 static void
4518 ipfw_inc_static_count(struct ip_fw *rule)
4519 {
4520         /* Static rule's counts are updated only on CPU0 */
4521         KKASSERT(mycpuid == 0);
4522
4523         static_count++;
4524         static_ioc_len += IOC_RULESIZE(rule);
4525 }
4526
4527 static void
4528 ipfw_dec_static_count(struct ip_fw *rule)
4529 {
4530         int l = IOC_RULESIZE(rule);
4531
4532         /* Static rule's counts are updated only on CPU0 */
4533         KKASSERT(mycpuid == 0);
4534
4535         KASSERT(static_count > 0, ("invalid static count %u", static_count));
4536         static_count--;
4537
4538         KASSERT(static_ioc_len >= l,
4539                 ("invalid static len %u", static_ioc_len));
4540         static_ioc_len -= l;
4541 }
4542
4543 static void
4544 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4545 {
4546         if (fwmsg->sibling != NULL) {
4547                 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4548                 fwmsg->sibling->sibling = rule;
4549         }
4550         fwmsg->sibling = rule;
4551 }
4552
4553 static struct ip_fw *
4554 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4555 {
4556         struct ip_fw *rule;
4557
4558         rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4559
4560         rule->act_ofs = ioc_rule->act_ofs;
4561         rule->cmd_len = ioc_rule->cmd_len;
4562         rule->rulenum = ioc_rule->rulenum;
4563         rule->set = ioc_rule->set;
4564         rule->usr_flags = ioc_rule->usr_flags;
4565
4566         bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4567
4568         rule->refcnt = 1;
4569         rule->cpuid = mycpuid;
4570         rule->rule_flags = rule_flags;
4571
4572         return rule;
4573 }
4574
/*
 * Per-CPU stage of rule installation: duplicate the ioc rule onto this
 * CPU's chain at the position pre-computed on CPU0, link it to its
 * sibling copy on the previous CPU, then forward the message to the
 * next CPU.  The CPU0 copy is handed back to the caller via ms_resultp.
 */
static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

	/*
	 * Insert rule into the pre-determined position
	 */
	if (fwmsg->prev_rule != NULL) {
		struct ip_fw *prev, *next;

		prev = fwmsg->prev_rule;
		KKASSERT(prev->cpuid == mycpuid);

		next = fwmsg->next_rule;
		KKASSERT(next->cpuid == mycpuid);

		rule->next = next;
		prev->next = rule;

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		fwmsg->prev_rule = prev->sibling;
		fwmsg->next_rule = next->sibling;
	} else {
		/* No predecessor: insert at the head of the chain. */
		KKASSERT(fwmsg->next_rule == NULL);
		rule->next = ctx->ipfw_layer3_chain;
		ctx->ipfw_layer3_chain = rule;
	}

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, rule);

	ipfw_flush_rule_ptrs(ctx);

	if (mycpuid == 0) {
		/* Statistics only need to be updated once */
		ipfw_inc_static_count(rule);

		/* Return the rule on CPU0 */
		nmsg->lmsg.u.ms_resultp = rule;
	}

	/*
	 * ms_resultp was set to the CPU0 rule above and is preserved as
	 * the message is forwarded, so every CPU's copy records the same
	 * track rule id (the CPU0 rule's address).
	 */
	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

	if (fwmsg->cross_rules != NULL) {
		/* Save rules for later use. */
		fwmsg->cross_rules[mycpuid] = rule;
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4636
4637 static void
4638 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4639 {
4640         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4641         struct ip_fw *rule = fwmsg->sibling;
4642         int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4643
4644         ASSERT_NETISR_NCPUS(mycpuid);
4645         KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4646             ("not crossref rule"));
4647
4648         rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4649         memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4650
4651         fwmsg->sibling = rule->sibling;
4652         netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4653 }
4654
/*
 * Add a new rule to the list.  Copy the rule into a malloc'ed area,
 * then possibly create a rule number and add the rule to the list.
 * Update the rule_number in the input struct so the caller knows
 * it as well.
 */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_ipfw fwmsg;
	struct ip_fw *f, *prev, *rule;

	ASSERT_NETISR0;

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add rule number incremental step.
	 */
	if (ioc_rule->rulenum == 0) {
		int step = autoinc_step;

		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
			 step <= IPFW_AUTOINC_STEP_MAX);

		/*
		 * Locate the highest numbered rule before default
		 */
		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			ioc_rule->rulenum = f->rulenum;
		}
		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
			ioc_rule->rulenum += step;
	}
	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
		ioc_rule->rulenum != 0,
		("invalid rule num %d", ioc_rule->rulenum));

	/*
	 * Now find the right place for the new rule in the sorted list.
	 */
	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
	     prev = f, f = f->next) {
		if (f->rulenum > ioc_rule->rulenum) {
			/* Found the location */
			break;
		}
	}
	KASSERT(f != NULL, ("no default rule?!"));

	/*
	 * Duplicate the rule onto each CPU.
	 * The rule duplicated on CPU0 will be returned.
	 */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_add_rule_dispatch);
	fwmsg.ioc_rule = ioc_rule;
	fwmsg.prev_rule = prev;
	fwmsg.next_rule = prev == NULL ? NULL : f;
	fwmsg.rule_flags = rule_flags;
	if (rule_flags & IPFW_RULE_F_CROSSREF) {
		/*
		 * Scratch array: each CPU's dispatch records its copy of
		 * the rule here so the crossref pass below can hand every
		 * copy the full per-CPU pointer set.
		 */
		fwmsg.cross_rules = kmalloc(
		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
		    M_WAITOK | M_ZERO);
	}

	netisr_domsg_global(&fwmsg.base);
	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

	rule = fwmsg.base.lmsg.u.ms_resultp;
	KKASSERT(rule != NULL && rule->cpuid == mycpuid);

	if (fwmsg.cross_rules != NULL) {
		/* Second pass: distribute the per-CPU rule pointers. */
		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
		fwmsg.sibling = rule;
		netisr_domsg_global(&fwmsg.base);
		KKASSERT(fwmsg.sibling == NULL);

		kfree(fwmsg.cross_rules, M_TEMP);

#ifdef KLD_MODULE
		/*
		 * NOTE(review): pins the module while a crossref rule
		 * exists — presumably released when crossref rules are
		 * reaped; verify against the unload path.
		 */
		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	}

	DPRINTF("++ installed rule %d, static count now %d\n",
		rule->rulenum, static_count);
}
4747
/*
 * Free storage associated with a static rule (including derived
 * states/tracks).
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
 */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
		 struct ip_fw *prev, struct ip_fw *rule)
{
	struct ip_fw *n;

	/* Unlink 'rule' from this CPU's chain. */
	n = rule->next;
	if (prev == NULL)
		ctx->ipfw_layer3_chain = n;
	else
		prev->next = n;

	/* Mark the rule as invalid */
	rule->rule_flags |= IPFW_RULE_F_INVALID;
	rule->next_rule = NULL;
	rule->sibling = NULL;
#ifdef foo
	/* Don't reset cpuid here; keep various assertion working */
	rule->cpuid = -1;
#endif

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_dec_static_count(rule);

	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
		/* Try to free this rule */
		ipfw_free_rule(rule);
	} else {
		/*
		 * Crossref rules are not freed here; CPU0 queues them on
		 * the global crossref free list instead — presumably
		 * because other rules' cross_rules arrays may still
		 * reference them (deferred reclamation).
		 */
		/* TODO: check staging area. */
		if (mycpuid == 0) {
			rule->next = ipfw_gd.ipfw_crossref_free;
			ipfw_gd.ipfw_crossref_free = rule;
		}
	}

	/* Return the next rule */
	return n;
}
4795
/*
 * Per-CPU stage of a flush: wipe this CPU's states and tracks, then
 * delete rules from the head of the chain — stopping before the
 * default rule unless 'kill_default' (ms_result) is set — and forward
 * the message to the next CPU.
 */
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
	int kill_default = nmsg->lmsg.u.ms_result;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * Flush states.
	 */
	ipfw_state_flush(ctx, NULL);
	KASSERT(ctx->ipfw_state_cnt == 0,
	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
	ctx->ipfw_state_loosecnt = 0;
	ctx->ipfw_state_lastexp = 0;

	/*
	 * Flush tracks.
	 */
	ipfw_track_flush(ctx, NULL);
	ctx->ipfw_track_lastexp = 0;
	if (ctx->ipfw_trkcnt_spare != NULL) {
		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
		ctx->ipfw_trkcnt_spare = NULL;
	}

	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
		ipfw_delete_rule(ctx, NULL, rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4832
/*
 * Deletes all rules from a chain (including the default rule
 * if the second argument is set).
 * Runs on netisr0.
 */
static void
ipfw_flush(int kill_default)
{
	struct netmsg_base nmsg;
#ifdef INVARIANTS
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int state_cnt;
#endif

	ASSERT_NETISR0;

	/*
	 * If 'kill_default' then caller has done the necessary
	 * msgport syncing; unnecessary to do it again.
	 */
	if (!kill_default) {
		/*
		 * Let ipfw_chk() know the rules are going to
		 * be flushed, so it could jump directly to
		 * the default rule.
		 */
		ipfw_flushing = 1;
		/* XXX use priority sync */
		netmsg_service_sync();
	}

	/*
	 * Press the 'flush' button
	 */
	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_flush_dispatch);
	nmsg.lmsg.u.ms_result = kill_default;
	netisr_domsg_global(&nmsg);

	/* Per-CPU flush done; reset the global bookkeeping too. */
	ipfw_gd.ipfw_state_loosecnt = 0;
	ipfw_gd.ipfw_state_globexp = 0;
	ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
	/* Sanity: no states/tracks remain; rule accounting is consistent. */
	state_cnt = ipfw_state_cntcoll();
	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

	if (kill_default) {
		KASSERT(static_count == 0,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == 0,
			("%u bytes of static rules remain", static_ioc_len));
	} else {
		KASSERT(static_count == 1,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
			("%u bytes of static rules remain, should be %lu",
			 static_ioc_len,
			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
	}
#endif

	/* Flush is done */
	ipfw_flushing = 0;
}
4900
/*
 * Per-CPU stage of rule deletion: remove every rule carrying the
 * target rule number on this CPU (several rules may share a number),
 * advance the message's start/prev pointers to their siblings on the
 * next CPU, then forward the message.
 */
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule, *prev;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = dmsg->start_rule;
	KKASSERT(rule->cpuid == mycpuid);
	dmsg->start_rule = rule->sibling;

	prev = dmsg->prev_rule;
	if (prev != NULL) {
		KKASSERT(prev->cpuid == mycpuid);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		dmsg->prev_rule = prev->sibling;
	}

	/*
	 * flush pointers outside the loop, then delete all matching
	 * rules.  'prev' remains the same throughout the cycle.
	 */
	ipfw_flush_rule_ptrs(ctx);
	while (rule && rule->rulenum == dmsg->rulenum) {
		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
			/* Flush states generated by this rule. */
			ipfw_state_flush(ctx, rule);
		}
		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
			/* Flush tracks generated by this rule. */
			ipfw_track_flush(ctx, rule);
		}
		rule = ipfw_delete_rule(ctx, prev, rule);
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4944
4945 static int
4946 ipfw_alt_delete_rule(uint16_t rulenum)
4947 {
4948         struct ip_fw *prev, *rule;
4949         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4950         struct netmsg_del dmsg;
4951
4952         ASSERT_NETISR0;
4953
4954         /*
4955          * Locate first rule to delete
4956          */
4957         for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4958              rule && rule->rulenum < rulenum;
4959              prev = rule, rule = rule->next)
4960                 ; /* EMPTY */
4961         if (rule->rulenum != rulenum)
4962                 return EINVAL;
4963
4964         /*
4965          * Get rid of the rule duplications on all CPUs
4966          */
4967         bzero(&dmsg, sizeof(dmsg));
4968         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4969             ipfw_alt_delete_rule_dispatch);
4970         dmsg.prev_rule = prev;
4971         dmsg.start_rule = rule;
4972         dmsg.rulenum = rulenum;
4973
4974         netisr_domsg_global(&dmsg.base);
4975         KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4976         return 0;
4977 }
4978
/*
 * Per-CPU stage of ruleset deletion: walk this CPU's chain and delete
 * every rule belonging to 'from_set', flushing any states/tracks the
 * rule generated, then forward the message to the next CPU.
 */
static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *prev, *rule;
#ifdef INVARIANTS
	int del = 0;
#endif

	ASSERT_NETISR_NCPUS(mycpuid);

	ipfw_flush_rule_ptrs(ctx);

	prev = NULL;
	rule = ctx->ipfw_layer3_chain;
	while (rule != NULL) {
		if (rule->set == dmsg->from_set) {
			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
				/* Flush states generated by this rule. */
				ipfw_state_flush(ctx, rule);
			}
			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
				/* Flush tracks generated by this rule. */
				ipfw_track_flush(ctx, rule);
			}
			/* ipfw_delete_rule() returns the next rule. */
			rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
			del = 1;
#endif
		} else {
			prev = rule;
			rule = rule->next;
		}
	}
	/* Caller verified the set exists on CPU0; chains are mirrored. */
	KASSERT(del, ("no match set?!"));

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
5018
5019 static int
5020 ipfw_alt_delete_ruleset(uint8_t set)
5021 {
5022         struct netmsg_del dmsg;
5023         int del;
5024         struct ip_fw *rule;
5025         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5026
5027         ASSERT_NETISR0;
5028
5029         /*
5030          * Check whether the 'set' exists.  If it exists,
5031          * then check whether any rules within the set will
5032          * try to create states.
5033          */
5034         del = 0;
5035         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5036                 if (rule->set == set)
5037                         del = 1;
5038         }
5039         if (!del)
5040                 return 0; /* XXX EINVAL? */
5041
5042         /*
5043          * Delete this set
5044          */
5045         bzero(&dmsg, sizeof(dmsg));
5046         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5047             ipfw_alt_delete_ruleset_dispatch);
5048         dmsg.from_set = set;
5049         netisr_domsg_global(&dmsg.base);
5050
5051         return 0;
5052 }
5053
5054 static void
5055 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5056 {
5057         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5058         struct ip_fw *rule;
5059
5060         ASSERT_NETISR_NCPUS(mycpuid);
5061
5062         rule = dmsg->start_rule;
5063         KKASSERT(rule->cpuid == mycpuid);
5064
5065         /*
5066          * Move to the position on the next CPU
5067          * before the msg is forwarded.
5068          */
5069         dmsg->start_rule = rule->sibling;
5070
5071         while (rule && rule->rulenum <= dmsg->rulenum) {
5072                 if (rule->rulenum == dmsg->rulenum)
5073                         rule->set = dmsg->to_set;
5074                 rule = rule->next;
5075         }
5076         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5077 }
5078
5079 static int
5080 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5081 {
5082         struct netmsg_del dmsg;
5083         struct netmsg_base *nmsg;
5084         struct ip_fw *rule;
5085         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5086
5087         ASSERT_NETISR0;
5088
5089         /*
5090          * Locate first rule to move
5091          */
5092         for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5093              rule = rule->next) {
5094                 if (rule->rulenum == rulenum && rule->set != set)
5095                         break;
5096         }
5097         if (rule == NULL || rule->rulenum > rulenum)
5098                 return 0; /* XXX error? */
5099
5100         bzero(&dmsg, sizeof(dmsg));
5101         nmsg = &dmsg.base;
5102         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5103             ipfw_alt_move_rule_dispatch);
5104         dmsg.start_rule = rule;
5105         dmsg.rulenum = rulenum;
5106         dmsg.to_set = set;
5107
5108         netisr_domsg_global(nmsg);
5109         KKASSERT(dmsg.start_rule == NULL);
5110         return 0;
5111 }
5112
5113 static void
5114 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5115 {
5116         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5117         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5118         struct ip_fw *rule;
5119
5120         ASSERT_NETISR_NCPUS(mycpuid);
5121
5122         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5123                 if (rule->set == dmsg->from_set)
5124                         rule->set = dmsg->to_set;
5125         }
5126         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5127 }
5128
5129 static int
5130 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5131 {
5132         struct netmsg_del dmsg;
5133         struct netmsg_base *nmsg;
5134
5135         ASSERT_NETISR0;
5136
5137         bzero(&dmsg, sizeof(dmsg));
5138         nmsg = &dmsg.base;
5139         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5140             ipfw_alt_move_ruleset_dispatch);
5141         dmsg.from_set = from_set;
5142         dmsg.to_set = to_set;
5143
5144         netisr_domsg_global(nmsg);
5145         return 0;
5146 }
5147
5148 static void
5149 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5150 {
5151         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5152         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5153         struct ip_fw *rule;
5154
5155         ASSERT_NETISR_NCPUS(mycpuid);
5156
5157         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5158                 if (rule->set == dmsg->from_set)
5159                         rule->set = dmsg->to_set;
5160                 else if (rule->set == dmsg->to_set)
5161                         rule->set = dmsg->from_set;
5162         }
5163         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5164 }
5165
5166 static int
5167 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5168 {
5169         struct netmsg_del dmsg;
5170         struct netmsg_base *nmsg;
5171
5172         ASSERT_NETISR0;
5173
5174         bzero(&dmsg, sizeof(dmsg));
5175         nmsg = &dmsg.base;
5176         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5177             ipfw_alt_swap_ruleset_dispatch);
5178         dmsg.from_set = set1;
5179         dmsg.to_set = set2;
5180
5181         netisr_domsg_global(nmsg);
5182         return 0;
5183 }
5184
5185 /*
5186  * Remove all rules with given number, and also do set manipulation.
5187  *
5188  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5189  * the next 8 bits are the new set, the top 8 bits are the command:
5190  *
5191  *      0       delete rules with given number
5192  *      1       delete rules with given set number
5193  *      2       move rules with given number to new set
5194  *      3       move rules with given set number to new set
5195  *      4       swap sets with given numbers
5196  */
5197 static int
5198 ipfw_ctl_alter(uint32_t arg)
5199 {
5200         uint16_t rulenum;
5201         uint8_t cmd, new_set;
5202         int error = 0;
5203
5204         ASSERT_NETISR0;
5205
5206         rulenum = arg & 0xffff;
5207         cmd = (arg >> 24) & 0xff;
5208         new_set = (arg >> 16) & 0xff;
5209
5210         if (cmd > 4)
5211                 return EINVAL;
5212         if (new_set >= IPFW_DEFAULT_SET)
5213                 return EINVAL;
5214         if (cmd == 0 || cmd == 2) {
5215                 if (rulenum == IPFW_DEFAULT_RULE)
5216                         return EINVAL;
5217         } else {
5218                 if (rulenum >= IPFW_DEFAULT_SET)
5219                         return EINVAL;
5220         }
5221
5222         switch (cmd) {
5223         case 0: /* delete rules with given number */
5224                 error = ipfw_alt_delete_rule(rulenum);
5225                 break;
5226
5227         case 1: /* delete all rules with given set number */
5228                 error = ipfw_alt_delete_ruleset(rulenum);
5229                 break;
5230
5231         case 2: /* move rules with given number to new set */
5232                 error = ipfw_alt_move_rule(rulenum, new_set);
5233                 break;
5234
5235         case 3: /* move rules with given set number to new set */
5236                 error = ipfw_alt_move_ruleset(rulenum, new_set);
5237                 break;
5238
5239         case 4: /* swap two sets */
5240                 error = ipfw_alt_swap_ruleset(rulenum, new_set);
5241                 break;
5242         }
5243         return error;
5244 }
5245
5246 /*
5247  * Clear counters for a specific rule.
5248  */
5249 static void
5250 clear_counters(struct ip_fw *rule, int log_only)
5251 {
5252         ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5253
5254         if (log_only == 0) {
5255                 rule->bcnt = rule->pcnt = 0;
5256                 rule->timestamp = 0;
5257         }
5258         if (l->o.opcode == O_LOG)
5259                 l->log_left = l->max_log;
5260 }
5261
/*
 * Per-CPU stage of counter zeroing: clear counters (or only logging
 * counts when 'log_only') for all rules, or for every rule matching
 * 'rulenum', then forward the message to the next CPU.
 */
static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	if (zmsg->rulenum == 0) {
		/* rulenum 0 means: clear the entire chain. */
		KKASSERT(zmsg->start_rule == NULL);

		ctx->ipfw_norule_counter = 0;
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
			clear_counters(rule, zmsg->log_only);
	} else {
		struct ip_fw *start = zmsg->start_rule;

		KKASSERT(start->cpuid == mycpuid);
		KKASSERT(start->rulenum == zmsg->rulenum);

		/*
		 * We can have multiple rules with the same number, so we
		 * need to clear them all.
		 */
		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
		     rule = rule->next)
			clear_counters(rule, zmsg->log_only);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		zmsg->start_rule = start->sibling;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
5299
/*
 * Reset some or all counters on firewall rules.
 * @arg rulenum is zero to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 * Runs on netisr0; returns 0 or EINVAL when no rule matches.
 */
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
	struct netmsg_zent zmsg;
	struct netmsg_base *nmsg;
	const char *msg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	bzero(&zmsg, sizeof(zmsg));
	nmsg = &zmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_zero_entry_dispatch);
	zmsg.log_only = log_only;

	if (rulenum == 0) {
		/* No %d in these messages; the extra log() arg is unused. */
		msg = log_only ? "ipfw: All logging counts reset.\n"
			       : "ipfw: Accounting cleared.\n";
	} else {
		struct ip_fw *rule;

		/*
		 * Locate the first rule with 'rulenum'
		 */
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
			if (rule->rulenum == rulenum)
				break;
		}
		if (rule == NULL) /* we did not find any matching rules */
			return (EINVAL);
		zmsg.start_rule = rule;
		zmsg.rulenum = rulenum;

		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
			       : "ipfw: Entry %d cleared.\n";
	}
	netisr_domsg_global(nmsg);
	KKASSERT(zmsg.start_rule == NULL);

	if (fw_verbose)
		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
	return (0);
}
5350
5351 /*
5352  * Check validity of the structure before insert.
5353  * Fortunately rules are simple, so this mostly need to check rule sizes.
5354  */
5355 static int
5356 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5357 {
5358         int l, cmdlen = 0;
5359         int have_action = 0;
5360         ipfw_insn *cmd;
5361
5362         *rule_flags = 0;
5363
5364         /* Check for valid size */
5365         if (size < sizeof(*rule)) {
5366                 kprintf("ipfw: rule too short\n");
5367                 return EINVAL;
5368         }
5369         l = IOC_RULESIZE(rule);
5370         if (l != size) {
5371                 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5372                 return EINVAL;
5373         }
5374
5375         /* Check rule number */
5376         if (rule->rulenum == IPFW_DEFAULT_RULE) {
5377                 kprintf("ipfw: invalid rule number\n");
5378                 return EINVAL;
5379         }
5380
5381         /*
5382          * Now go for the individual checks. Very simple ones, basically only
5383          * instruction sizes.
5384          */
5385         for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5386              l -= cmdlen, cmd += cmdlen) {
5387                 cmdlen = F_LEN(cmd);
5388                 if (cmdlen > l) {
5389                         kprintf("ipfw: opcode %d size truncated\n",
5390                                 cmd->opcode);
5391                         return EINVAL;
5392                 }
5393
5394                 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5395
5396                 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5397                     IPFW_ISXLAT(cmd->opcode)) {
5398                         /* This rule will generate states. */
5399                         *rule_flags |= IPFW_RULE_F_GENSTATE;
5400                         if (cmd->opcode == O_LIMIT)
5401                                 *rule_flags |= IPFW_RULE_F_GENTRACK;
5402                 }
5403                 if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5404                         *rule_flags |= IPFW_RULE_F_CROSSREF;
5405                 if (cmd->opcode == O_IP_SRC_IFIP ||
5406                     cmd->opcode == O_IP_DST_IFIP) {
5407                         *rule_flags |= IPFW_RULE_F_DYNIFADDR;
5408                         cmd->arg1 &= IPFW_IFIP_SETTINGS;
5409                 }
5410
5411                 switch (cmd->opcode) {
5412                 case O_NOP:
5413                 case O_PROBE_STATE:
5414                 case O_KEEP_STATE:
5415                 case O_PROTO:
5416                 case O_IP_SRC_ME:
5417                 case O_IP_DST_ME:
5418                 case O_LAYER2:
5419                 case O_IN:
5420                 case O_FRAG:
5421                 case O_IPFRAG:
5422                 case O_IPOPT:
5423                 case O_IPLEN:
5424                 case O_IPID:
5425                 case O_IPTOS:
5426                 case O_IPPRECEDENCE:
5427                 case O_IPTTL:
5428                 case O_IPVER:
5429                 case O_TCPWIN:
5430                 case O_TCPFLAGS:
5431                 case O_TCPOPTS:
5432                 case O_ESTAB:
5433                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5434                                 goto bad_size;
5435                         break;
5436
5437                 case O_IP_SRC_TABLE:
5438                 case O_IP_DST_TABLE:
5439                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5440                                 goto bad_size;
5441                         if (cmd->arg1 >= ipfw_table_max) {
5442                                 kprintf("ipfw: invalid table id %u, max %d\n",
5443                                     cmd->arg1, ipfw_table_max);
5444                                 return EINVAL;
5445                         }
5446                         break;
5447
5448                 case O_IP_SRC_IFIP:
5449                 case O_IP_DST_IFIP:
5450                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5451                                 goto bad_size;
5452                         break;
5453
5454                 case O_ICMPCODE:
5455                 case O_ICMPTYPE:
5456                         if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5457                                 goto bad_size;
5458                         break;
5459
5460                 case O_UID:
5461                 case O_GID:
5462                 case O_IP_SRC:
5463                 case O_IP_DST:
5464                 case O_TCPSEQ:
5465                 case O_TCPACK:
5466                 case O_PROB:
5467                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5468                                 goto bad_size;
5469                         break;
5470
5471                 case O_LIMIT:
5472                         if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5473                                 goto bad_size;
5474                         break;
5475                 case O_REDIRECT:
5476                         if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5477                                 goto bad_size;
5478                         break;
5479
5480                 case O_LOG:
5481                         if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5482                                 goto bad_size;
5483
5484                         ((ipfw_insn_log *)cmd)->log_left =
5485                             ((ipfw_insn_log *)cmd)->max_log;
5486
5487                         break;
5488
5489                 case O_IP_SRC_MASK:
5490                 case O_IP_DST_MASK:
5491                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5492                                 goto bad_size;
5493                         if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5494                                 kprintf("ipfw: opcode %d, useless rule\n",
5495                                         cmd->opcode);
5496                                 return EINVAL;
5497                         }
5498                         break;
5499
5500                 case O_IP_SRC_SET:
5501                 case O_IP_DST_SET:
5502                         if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5503                                 kprintf("ipfw: invalid set size %d\n",
5504                                         cmd->arg1);
5505                                 return EINVAL;
5506                         }
5507                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5508                             (cmd->arg1+31)/32 )
5509                                 goto bad_size;
5510                         break;
5511
5512                 case O_MACADDR2:
5513                         if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5514                                 goto bad_size;
5515                         break;
5516
5517                 case O_MAC_TYPE:
5518                 case O_IP_SRCPORT:
5519                 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5520                         if (cmdlen < 2 || cmdlen > 31)
5521                                 goto bad_size;
5522                         break;
5523
5524                 case O_RECV:
5525                 case O_XMIT:
5526                 case O_VIA:
5527                         if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5528                                 goto bad_size;
5529                         break;
5530
5531                 case O_PIPE:
5532                 case O_QUEUE:
5533                         if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5534                                 goto bad_size;
5535                         goto check_action;
5536
5537                 case O_FORWARD_IP:
5538                         if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5539                                 goto bad_size;
5540                         } else {
5541                                 in_addr_t fwd_addr;
5542
5543                                 fwd_addr = ((ipfw_insn_sa *)cmd)->
5544                                            sa.sin_addr.s_addr;
5545                                 if (IN_MULTICAST(ntohl(fwd_addr))) {
5546                                         kprintf("ipfw: try forwarding to "
5547                                                 "multicast address\n");
5548                                         return EINVAL;
5549                                 }
5550                         }
5551                         goto check_action;
5552
5553                 case O_FORWARD_MAC: /* XXX not implemented yet */
5554                 case O_CHECK_STATE:
5555                 case O_COUNT:
5556                 case O_ACCEPT:
5557                 case O_DENY:
5558                 case O_REJECT:
5559                 case O_SKIPTO:
5560                 case O_DIVERT:
5561                 case O_TEE:
5562                 case O_DEFRAG:
5563                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5564                                 goto bad_size;
5565 check_action:
5566                         if (have_action) {
5567                                 kprintf("ipfw: opcode %d, multiple actions"
5568                                         " not allowed\n",
5569                                         cmd->opcode);
5570                                 return EINVAL;
5571                         }
5572                         have_action = 1;
5573                         if (l != cmdlen) {
5574                                 kprintf("ipfw: opcode %d, action must be"
5575                                         " last opcode\n",
5576                                         cmd->opcode);
5577                                 return EINVAL;
5578                         }
5579                         break;
5580                 default:
5581                         kprintf("ipfw: opcode %d, unknown opcode\n",
5582                                 cmd->opcode);
5583                         return EINVAL;
5584                 }
5585         }
5586         if (have_action == 0) {
5587                 kprintf("ipfw: missing action\n");
5588                 return EINVAL;
5589         }
5590         return 0;
5591
5592 bad_size:
5593         kprintf("ipfw: opcode %d size %d wrong\n",
5594                 cmd->opcode, cmdlen);
5595         return EINVAL;
5596 }
5597
5598 static int
5599 ipfw_ctl_add_rule(struct sockopt *sopt)
5600 {
5601         struct ipfw_ioc_rule *ioc_rule;
5602         size_t size;
5603         uint32_t rule_flags;
5604         int error;
5605
5606         ASSERT_NETISR0;
5607         
5608         size = sopt->sopt_valsize;
5609         if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5610             size < sizeof(*ioc_rule)) {
5611                 return EINVAL;
5612         }
5613         if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5614                 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5615                                           IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5616         }
5617         ioc_rule = sopt->sopt_val;
5618
5619         error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5620         if (error)
5621                 return error;
5622
5623         ipfw_add_rule(ioc_rule, rule_flags);
5624
5625         if (sopt->sopt_dir == SOPT_GET)
5626                 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5627         return 0;
5628 }
5629
/*
 * Convert one static rule (and the statistics of all of its per-cpu
 * duplications) into the ioctl format consumed by userland.  The rule
 * passed in must be the cpu0 copy; its siblings on the other netisr
 * cpus are visited read-only to aggregate the counters.
 *
 * Returns a pointer just past the copied rule, i.e. where the next
 * rule should be written in the output buffer.
 */
static void *
ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
    struct ipfw_ioc_rule *ioc_rule)
{
	const struct ip_fw *sibling;
#ifdef INVARIANTS
	int i;
#endif

	ASSERT_NETISR0;
	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));

	/* Static description of the rule. */
	ioc_rule->act_ofs = rule->act_ofs;
	ioc_rule->cmd_len = rule->cmd_len;
	ioc_rule->rulenum = rule->rulenum;
	ioc_rule->set = rule->set;
	ioc_rule->usr_flags = rule->usr_flags;

	/* Global snapshot values repeated in every copied rule. */
	ioc_rule->set_disable = ctx->ipfw_set_disable;
	ioc_rule->static_count = static_count;
	ioc_rule->static_len = static_ioc_len;

	/*
	 * Visit (read-only) all of the rule's duplications to get
	 * the necessary statistics
	 */
#ifdef INVARIANTS
	i = 0;
#endif
	ioc_rule->pcnt = 0;
	ioc_rule->bcnt = 0;
	ioc_rule->timestamp = 0;
	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
		/* Sum packet/byte counters; keep the latest timestamp. */
		ioc_rule->pcnt += sibling->pcnt;
		ioc_rule->bcnt += sibling->bcnt;
		if (sibling->timestamp > ioc_rule->timestamp)
			ioc_rule->timestamp = sibling->timestamp;
#ifdef INVARIANTS
		++i;
#endif
	}
	KASSERT(i == netisr_ncpus,
	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));

	/* cmd_len is in 32-bit words, hence the *4. */
	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);

	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
}
5678
5679 static boolean_t
5680 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5681 {
5682         struct ipfw_ioc_flowid *ioc_id;
5683
5684         if (trk->tc_expire == 0) {
5685                 /* Not a scanned one. */
5686                 return (FALSE);
5687         }
5688
5689         ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5690             0 : trk->tc_expire - time_uptime;
5691         ioc_state->pcnt = 0;
5692         ioc_state->bcnt = 0;
5693
5694         ioc_state->dyn_type = O_LIMIT_PARENT;
5695         ioc_state->count = trk->tc_count;
5696
5697         ioc_state->rulenum = trk->tc_rulenum;
5698
5699         ioc_id = &ioc_state->id;
5700         ioc_id->type = ETHERTYPE_IP;
5701         ioc_id->u.ip.proto = trk->tc_proto;
5702         ioc_id->u.ip.src_ip = trk->tc_saddr;
5703         ioc_id->u.ip.dst_ip = trk->tc_daddr;
5704         ioc_id->u.ip.src_port = trk->tc_sport;
5705         ioc_id->u.ip.dst_port = trk->tc_dport;
5706
5707         return (TRUE);
5708 }
5709
5710 static boolean_t
5711 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5712 {
5713         struct ipfw_ioc_flowid *ioc_id;
5714
5715         if (IPFW_STATE_SCANSKIP(s))
5716                 return (FALSE);
5717
5718         ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5719             0 : s->st_expire - time_uptime;
5720         ioc_state->pcnt = s->st_pcnt;
5721         ioc_state->bcnt = s->st_bcnt;
5722
5723         ioc_state->dyn_type = s->st_type;
5724         ioc_state->count = 0;
5725
5726         ioc_state->rulenum = s->st_rule->rulenum;
5727
5728         ioc_id = &ioc_state->id;
5729         ioc_id->type = ETHERTYPE_IP;
5730         ioc_id->u.ip.proto = s->st_proto;
5731         ipfw_key_4tuple(&s->st_key,
5732             &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5733             &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5734
5735         if (IPFW_ISXLAT(s->st_type)) {
5736                 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5737
5738                 if (x->xlat_port == 0)
5739                         ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5740                 else
5741                         ioc_state->xlat_port = ntohs(x->xlat_port);
5742                 ioc_state->xlat_addr = ntohl(x->xlat_addr);
5743
5744                 ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5745                 ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5746         }
5747
5748         return (TRUE);
5749 }
5750
/*
 * Per-cpu stage of the state copyout.  Copies this cpu's states into
 * the shared output buffer, refreshes the expire times of the global
 * track counters, and - on the last netisr cpu - copies the tracks
 * themselves (presented as states).  The message is forwarded from
 * cpu to cpu and replied as soon as the buffer is full or all cpus
 * have run.
 */
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	const struct ipfw_state *s;
	const struct ipfw_track *t;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(nm->state_cnt < nm->state_cntmax,
	    ("invalid state count %d, max %d",
	     nm->state_cnt, nm->state_cntmax));

	/* Copy this cpu's states until done or out of buffer space. */
	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
		if (ipfw_state_copy(s, nm->ioc_state)) {
			nm->ioc_state++;
			nm->state_cnt++;
			if (nm->state_cnt == nm->state_cntmax)
				goto done;
		}
	}

	/*
	 * Prepare tracks in the global track tree for userland.
	 */
	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
		struct ipfw_trkcnt *trk;

		if (t->t_count == NULL) /* anchor */
			continue;
		trk = t->t_trkcnt;

		/*
		 * Only one netisr can run this function at
		 * any time, and only this function accesses
		 * trkcnt's tc_expire, so this is safe w/o
		 * ipfw_gd.ipfw_trkcnt_token.
		 */
		if (trk->tc_expire > t->t_expire)
			continue;
		trk->tc_expire = t->t_expire;
	}

	/*
	 * Copy tracks in the global track tree to userland in
	 * the last netisr.
	 */
	if (mycpuid == netisr_ncpus - 1) {
		struct ipfw_trkcnt *trk;

		KASSERT(nm->state_cnt < nm->state_cntmax,
		    ("invalid state count %d, max %d",
		     nm->state_cnt, nm->state_cntmax));

		IPFW_TRKCNT_TOKGET;
		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
			if (ipfw_track_copy(trk, nm->ioc_state)) {
				nm->ioc_state++;
				nm->state_cnt++;
				if (nm->state_cnt == nm->state_cntmax) {
					/* Release token before reply. */
					IPFW_TRKCNT_TOKREL;
					goto done;
				}
			}
		}
		IPFW_TRKCNT_TOKREL;
	}
done:
	if (nm->state_cnt == nm->state_cntmax) {
		/* No more space; done. */
		netisr_replymsg(&nm->base, 0);
	} else {
		netisr_forwardmsg(&nm->base, mycpuid + 1);
	}
}
5826
/*
 * Copy the complete ruleset to userland: all static rules first (the
 * last one numbered IPFW_DEFAULT_RULE), followed by a possibly empty
 * list of states (tracks are presented as states for compat).  If the
 * caller's buffer is too small, it is zeroed and 0 is returned without
 * copying anything.
 */
static int
ipfw_ctl_get_rules(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;
	void *bp;
	size_t size;
	int state_cnt;

	ASSERT_NETISR0;

	/*
	 * pass up a copy of the current rules. Static rules
	 * come first (the last of which has number IPFW_DEFAULT_RULE),
	 * followed by a possibly empty list of states.
	 */

	size = static_ioc_len;	/* size of static rules */

	/*
	 * Size of the states.
	 * XXX take tracks as state for userland compat.
	 */
	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
	size += state_cnt * sizeof(struct ipfw_ioc_state);

	if (sopt->sopt_valsize < size) {
		/* short length, no need to return incomplete rules */
		/* XXX: if superuser, no need to zero buffer */
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return 0;
	}
	bp = sopt->sopt_val;

	/* Copy the static rules; bp advances past each copied rule. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		bp = ipfw_copy_rule(ctx, rule, bp);

	if (state_cnt) {
		struct netmsg_cpstate nm;
#ifdef INVARIANTS
		size_t old_size = size;
#endif

		/* Collect states/tracks from every netisr cpu. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
		nm.ioc_state = bp;
		nm.state_cntmax = state_cnt;
		nm.state_cnt = 0;
		netisr_domsg_global(&nm.base);

		/*
		 * The # of states may be shrinked after the snapshot
		 * of the state count was taken.  To give user a correct
		 * state count, nm->state_cnt is used to recalculate
		 * the actual size.
		 */
		size = static_ioc_len +
		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
		KKASSERT(size <= old_size);
	}

	sopt->sopt_valsize = size;
	return 0;
}
5892
5893 static void
5894 ipfw_set_disable_dispatch(netmsg_t nmsg)
5895 {
5896         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5897
5898         ASSERT_NETISR_NCPUS(mycpuid);
5899
5900         ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5901         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5902 }
5903
5904 static void
5905 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5906 {
5907         struct netmsg_base nmsg;
5908         uint32_t set_disable;
5909
5910         ASSERT_NETISR0;
5911
5912         /* IPFW_DEFAULT_SET is always enabled */
5913         enable |= (1 << IPFW_DEFAULT_SET);
5914         set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5915
5916         bzero(&nmsg, sizeof(nmsg));
5917         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5918             ipfw_set_disable_dispatch);
5919         nmsg.lmsg.u.ms_result32 = set_disable;
5920
5921         netisr_domsg_global(&nmsg);
5922 }
5923
5924 static void
5925 ipfw_table_create_dispatch(netmsg_t nm)
5926 {
5927         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5928         int tblid = nm->lmsg.u.ms_result;
5929
5930         ASSERT_NETISR_NCPUS(mycpuid);
5931
5932         if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5933             rn_cpumaskhead(mycpuid), 32))
5934                 panic("ipfw: create table%d failed", tblid);
5935
5936         netisr_forwardmsg(&nm->base, mycpuid + 1);
5937 }
5938
5939 static int
5940 ipfw_table_create(struct sockopt *sopt)
5941 {
5942         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5943         struct ipfw_ioc_table *tbl;
5944         struct netmsg_base nm;
5945
5946         ASSERT_NETISR0;
5947
5948         if (sopt->sopt_valsize != sizeof(*tbl))
5949                 return (EINVAL);
5950
5951         tbl = sopt->sopt_val;
5952         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5953                 return (EINVAL);
5954
5955         if (ctx->ipfw_tables[tbl->tableid] != NULL)
5956                 return (EEXIST);
5957
5958         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5959             ipfw_table_create_dispatch);
5960         nm.lmsg.u.ms_result = tbl->tableid;
5961         netisr_domsg_global(&nm);
5962
5963         return (0);
5964 }
5965
5966 static void
5967 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5968 {
5969         struct radix_node *ret;
5970
5971         ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5972         if (ret != rn)
5973                 panic("deleted other table entry");
5974         kfree(ret, M_IPFW);
5975 }
5976
/*
 * Walktree callback: delete every visited entry.
 */
static int
ipfw_table_killent(struct radix_node *rn, void *xrnh)
{
	ipfw_table_killrn(xrnh, rn);
	return (0);
}
5984
5985 static void
5986 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5987     int destroy)
5988 {
5989         struct radix_node_head *rnh;
5990
5991         ASSERT_NETISR_NCPUS(mycpuid);
5992
5993         rnh = ctx->ipfw_tables[tableid];
5994         rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5995         if (destroy) {
5996                 Free(rnh);
5997                 ctx->ipfw_tables[tableid] = NULL;
5998         }
5999 }
6000
6001 static void
6002 ipfw_table_flush_dispatch(netmsg_t nmsg)
6003 {
6004         struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
6005         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6006
6007         ASSERT_NETISR_NCPUS(mycpuid);
6008
6009         ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
6010         netisr_forwardmsg(&nm->base, mycpuid + 1);
6011 }
6012
6013 static void
6014 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
6015 {
6016         int i;
6017
6018         ASSERT_NETISR_NCPUS(mycpuid);
6019
6020         for (i = 0; i < ipfw_table_max; ++i) {
6021                 if (ctx->ipfw_tables[i] != NULL)
6022                         ipfw_table_flush_oncpu(ctx, i, destroy);
6023         }
6024 }
6025
6026 static void
6027 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6028 {
6029         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6030
6031         ASSERT_NETISR_NCPUS(mycpuid);
6032
6033         ipfw_table_flushall_oncpu(ctx, 0);
6034         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6035 }
6036
6037 static int
6038 ipfw_table_flush(struct sockopt *sopt)
6039 {
6040         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6041         struct ipfw_ioc_table *tbl;
6042         struct netmsg_tblflush nm;
6043
6044         ASSERT_NETISR0;
6045
6046         if (sopt->sopt_valsize != sizeof(*tbl))
6047                 return (EINVAL);
6048
6049         tbl = sopt->sopt_val;
6050         if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6051                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6052                     MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6053                 netisr_domsg_global(&nm.base);
6054                 return (0);
6055         }
6056
6057         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6058                 return (EINVAL);
6059
6060         if (ctx->ipfw_tables[tbl->tableid] == NULL)
6061                 return (ENOENT);
6062
6063         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6064             ipfw_table_flush_dispatch);
6065         nm.tableid = tbl->tableid;
6066         nm.destroy = 0;
6067         if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6068                 nm.destroy = 1;
6069         netisr_domsg_global(&nm.base);
6070
6071         return (0);
6072 }
6073
6074 static int
6075 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6076 {
6077         int *cnt = xcnt;
6078
6079         (*cnt)++;
6080         return (0);
6081 }
6082
/*
 * Walktree callback: convert one table entry (and its per-cpu
 * siblings) into the ioctl format, aggregating the per-cpu use
 * counters and keeping the most recent last-use timestamp.
 */
static int
ipfw_table_cpent(struct radix_node *rn, void *xcp)
{
	struct ipfw_table_cp *cp = xcp;
	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
	struct ipfw_ioc_tblent *ioc_te;
#ifdef INVARIANTS
	int cnt;
#endif

	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
	    cp->te_idx, cp->te_cnt));
	ioc_te = &cp->te[cp->te_idx];

	/* rn_mask, when present, starts with its own length byte. */
	if (te->te_nodes->rn_mask != NULL) {
		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
		    *te->te_nodes->rn_mask);
	} else {
		ioc_te->netmask.sin_len = 0;
	}
	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));

	/* Start with this cpu's counters, then fold in the siblings. */
	ioc_te->use = te->te_use;
	ioc_te->last_used = te->te_lastuse;
#ifdef INVARIANTS
	cnt = 1;
#endif

	while ((te = te->te_sibling) != NULL) {
#ifdef INVARIANTS
		++cnt;
#endif
		ioc_te->use += te->te_use;
		if (te->te_lastuse > ioc_te->last_used)
			ioc_te->last_used = te->te_lastuse;
	}
	KASSERT(cnt == netisr_ncpus,
	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));

	cp->te_idx++;

	return (0);
}
6126
/*
 * Handle table retrieval sockopts.  A negative tableid requests the
 * list of created table ids; otherwise the entries of the specified
 * table are copied out.  Returns E2BIG (and zeroes the buffer) when
 * the caller's buffer is too small for the result.
 */
static int
ipfw_table_get(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_ioc_table *tbl;
	struct ipfw_ioc_tblcont *cont;
	struct ipfw_table_cp cp;
	int cnt = 0, sz;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize < sizeof(*tbl))
		return (EINVAL);

	tbl = sopt->sopt_val;
	if (tbl->tableid < 0) {
		struct ipfw_ioc_tbllist *list;
		int i;

		/*
		 * List available table ids.
		 */
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL)
				++cnt;
		}

		/* Output size depends on the number of created tables. */
		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
		if (sopt->sopt_valsize < sz) {
			bzero(sopt->sopt_val, sopt->sopt_valsize);
			return (E2BIG);
		}
		list = sopt->sopt_val;
		list->tablecnt = cnt;

		cnt = 0;
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL) {
				KASSERT(cnt < list->tablecnt,
				    ("invalid idx %d, cnt %d",
				     cnt, list->tablecnt));
				list->tables[cnt++] = i;
			}
		}
		sopt->sopt_valsize = sz;
		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* First pass: count the entries to size the output. */
	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);

	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
	if (sopt->sopt_valsize < sz) {
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return (E2BIG);
	}
	cont = sopt->sopt_val;
	cont->entcnt = cnt;

	/* Second pass: copy the entries out. */
	cp.te = cont->ent;
	cp.te_idx = 0;
	cp.te_cnt = cnt;
	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);

	sopt->sopt_valsize = sz;
	return (0);
}
6199
6200 static void
6201 ipfw_table_add_dispatch(netmsg_t nmsg)
6202 {
6203         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6204         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6205         struct radix_node_head *rnh;
6206         struct ipfw_tblent *te;
6207
6208         ASSERT_NETISR_NCPUS(mycpuid);
6209
6210         rnh = ctx->ipfw_tables[nm->tableid];
6211
6212         te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6213         te->te_nodes->rn_key = (char *)&te->te_key;
6214         memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6215
6216         if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6217             te->te_nodes) == NULL) {
6218                 if (mycpuid == 0) {
6219                         kfree(te, M_IPFW);
6220                         netisr_replymsg(&nm->base, EEXIST);
6221                         return;
6222                 }
6223                 panic("rnh_addaddr failed");
6224         }
6225
6226         /* Link siblings. */
6227         if (nm->sibling != NULL)
6228                 nm->sibling->te_sibling = te;
6229         nm->sibling = te;
6230
6231         netisr_forwardmsg(&nm->base, mycpuid + 1);
6232 }
6233
6234 static void
6235 ipfw_table_del_dispatch(netmsg_t nmsg)
6236 {
6237         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6238         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6239         struct radix_node_head *rnh;
6240         struct radix_node *rn;
6241
6242         ASSERT_NETISR_NCPUS(mycpuid);
6243
6244         rnh = ctx->ipfw_tables[nm->tableid];
6245         rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6246         if (rn == NULL) {
6247                 if (mycpuid == 0) {
6248                         netisr_replymsg(&nm->base, ESRCH);
6249                         return;
6250                 }
6251                 panic("rnh_deladdr failed");
6252         }
6253         kfree(rn, M_IPFW);
6254
6255         netisr_forwardmsg(&nm->base, mycpuid + 1);
6256 }
6257
/*
 * Handler for the IP_FW_TBL_ADD and IP_FW_TBL_DEL sockopts: add one
 * address/netmask entry to, or delete one entry from, the specified
 * lookup table.
 *
 * Validation runs on netisr0; the actual radix-tree update is then
 * performed on every netisr cpu through a forwarded netmsg, keeping
 * the per-cpu replicas of the table in sync.
 *
 * Returns 0 on success, EINVAL on a malformed request, ENOENT if the
 * table has not been created, or the error replied by the dispatch
 * handlers.
 */
static int
ipfw_table_alt(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_ioc_tblcont *tbl;
	struct ipfw_ioc_tblent *te;
	struct sockaddr_in key0;
	struct sockaddr *netmask = NULL, *key;
	struct netmsg_tblent nm;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;

	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
		return (EINVAL);
	if (tbl->entcnt != 1)
		return (EINVAL);

	if (ctx->ipfw_tables[tbl->tableid] == NULL)
		return (ENOENT);
	te = &tbl->ent[0];

	if (te->key.sin_family != AF_INET ||
	    te->key.sin_port != 0 ||
	    te->key.sin_len != sizeof(struct sockaddr_in))
		return (EINVAL);
	key = (struct sockaddr *)&te->key;

	if (te->netmask.sin_len != 0) {
		if (te->netmask.sin_port != 0 ||
		    te->netmask.sin_len > sizeof(struct sockaddr_in))
			return (EINVAL);
		netmask = (struct sockaddr *)&te->netmask;
		/* Canonicalize: apply the netmask to the key up front. */
		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
		key = (struct sockaddr *)&key0;
	}

	if (sopt->sopt_name == IP_FW_TBL_ADD) {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_add_dispatch);
	} else {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_del_dispatch);
	}
	nm.key = key;
	nm.netmask = netmask;
	nm.tableid = tbl->tableid;
	nm.sibling = NULL;
	/* Run the add/delete on all netisr cpus, in cpu order. */
	return (netisr_domsg_global(&nm.base));
}
6311
6312 static int
6313 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6314 {
6315         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6316
6317         te->te_use = 0;
6318         te->te_lastuse = 0;
6319         return (0);
6320 }
6321
/*
 * Per-cpu handler for zeroing one table's statistics: walk this
 * cpu's replica of the table (id in nmsg->lmsg.u.ms_result), clear
 * every entry's counters, then forward the message to the next
 * netisr cpu.
 */
static void
ipfw_table_zero_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
6335
6336 static void
6337 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6338 {
6339         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6340         int i;
6341
6342         ASSERT_NETISR_NCPUS(mycpuid);
6343
6344         for (i = 0; i < ipfw_table_max; ++i) {
6345                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6346
6347                 if (rnh != NULL)
6348                         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6349         }
6350         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6351 }
6352
/*
 * Handler for the IP_FW_TBL_ZERO sockopt: clear the use counters of
 * one table (tableid >= 0) or of every table (tableid < 0) on all
 * netisr cpus.
 *
 * Returns 0 on success, EINVAL on a malformed request or bad table
 * id, ENOENT if the table has not been created.
 */
static int
ipfw_table_zero(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_base nm;
	struct ipfw_ioc_table *tbl;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;

	if (tbl->tableid < 0) {
		/* Negative table id means "zero all tables". */
		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
		    ipfw_table_zeroall_dispatch);
		netisr_domsg_global(&nm);
		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
		return (ENOENT);
	}

	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_table_zero_dispatch);
	nm.lmsg.u.ms_result = tbl->tableid;
	netisr_domsg_global(&nm);

	return (0);
}
6384
6385 static int
6386 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6387 {
6388         struct netmsg_tblexp *nm = xnm;
6389         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6390
6391         if (te->te_expired) {
6392                 ipfw_table_killrn(nm->rnh, rn);
6393                 nm->expcnt++;
6394         }
6395         return (0);
6396 }
6397
/*
 * Per-cpu handler for expiring entries of one table: walk this
 * cpu's replica and delete every entry previously marked expired by
 * ipfw_table_markexp(), then forward the message to the next netisr
 * cpu.
 */
static void
ipfw_table_expire_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];
	nm->rnh = rnh;
	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);

	/*
	 * Every cpu up to and including this one must have deleted
	 * exactly nm->cnt marked entries.
	 */
	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm->cnt * (mycpuid + 1), nm->expcnt));

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6417
/*
 * Per-cpu handler for expiring entries of all tables: walk every
 * existing table replica on this cpu and delete the entries marked
 * expired by ipfw_table_markexp(), then forward the message to the
 * next netisr cpu.
 */
static void
ipfw_table_expireall_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int i;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (i = 0; i < ipfw_table_max; ++i) {
		struct radix_node_head *rnh = ctx->ipfw_tables[i];

		if (rnh == NULL)
			continue;
		nm->rnh = rnh;
		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
	}

	/*
	 * Every cpu up to and including this one must have deleted
	 * exactly nm->cnt marked entries in total.
	 */
	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm->cnt * (mycpuid + 1), nm->expcnt));

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6442
6443 static int
6444 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6445 {
6446         struct netmsg_tblexp *nm = xnm;
6447         struct ipfw_tblent *te;
6448         time_t lastuse;
6449
6450         te = (struct ipfw_tblent *)rn;
6451         lastuse = te->te_lastuse;
6452
6453         while ((te = te->te_sibling) != NULL) {
6454                 if (te->te_lastuse > lastuse)
6455                         lastuse = te->te_lastuse;
6456         }
6457         if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6458                 /* Not expired */
6459                 return (0);
6460         }
6461
6462         te = (struct ipfw_tblent *)rn;
6463         te->te_expired = 1;
6464         while ((te = te->te_sibling) != NULL)
6465                 te->te_expired = 1;
6466         nm->cnt++;
6467
6468         return (0);
6469 }
6470
/*
 * Handler for the IP_FW_TBL_EXPIRE sockopt: delete every table entry
 * that has not been used within tbl->expire seconds, from one table
 * (tableid >= 0) or from all tables (tableid < 0).
 *
 * Expired entries are first marked on netisr0 by ipfw_table_markexp()
 * and then deleted on every netisr cpu by a forwarded netmsg; the
 * number of expired addresses is reported back in tbl->expcnt.
 *
 * Returns 0 on success, EINVAL on a malformed request or bad table
 * id, ENOENT if the table has not been created.
 */
static int
ipfw_table_expire(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_tblexp nm;
	struct ipfw_ioc_tblexp *tbl;
	struct radix_node_head *rnh;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;
	tbl->expcnt = 0;

	nm.expcnt = 0;
	nm.cnt = 0;
	nm.expire = tbl->expire;

	if (tbl->tableid < 0) {
		int i;

		/* Mark expired entries in every table (netisr0 walk). */
		for (i = 0; i < ipfw_table_max; ++i) {
			rnh = ctx->ipfw_tables[i];
			if (rnh == NULL)
				continue;
			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
		}
		if (nm.cnt == 0) {
			/* No addresses can be expired. */
			return (0);
		}
		tbl->expcnt = nm.cnt;

		/* Delete the marked entries on all netisr cpus. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
		nm.tableid = -1;
		netisr_domsg_global(&nm.base);
		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
		    ("not all expired addresses (%d) were deleted (%d)",
		     nm.cnt * netisr_ncpus, nm.expcnt));

		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* Mark expired entries in the single table (netisr0 walk). */
	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
	if (nm.cnt == 0) {
		/* No addresses can be expired. */
		return (0);
	}
	tbl->expcnt = nm.cnt;

	/* Delete the marked entries on all netisr cpus. */
	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_table_expire_dispatch);
	nm.tableid = tbl->tableid;
	netisr_domsg_global(&nm.base);
	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm.cnt * netisr_ncpus, nm.expcnt));
	return (0);
}
6537
/*
 * Free one cpu-local sibling of a cross-referenced rule on its owner
 * cpu.  The rule must already be both cross-referenced and
 * invalidated.
 */
static void
ipfw_crossref_free_dispatch(netmsg_t nmsg)
{
	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;

	KKASSERT((rule->rule_flags &
	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
	ipfw_free_rule(rule);

	netisr_replymsg(&nmsg->base, 0);
}
6550
/*
 * Walk the global list of cross-referenced rules awaiting free and
 * release every rule whose per-cpu siblings are no longer referenced
 * by any in-flight mbuf.  Rules still in flight stay on the list and
 * another reap is scheduled one second later.
 */
static void
ipfw_crossref_reap(void)
{
	struct ip_fw *rule, *prev = NULL;

	ASSERT_NETISR0;

	rule = ipfw_gd.ipfw_crossref_free;
	while (rule != NULL) {
		uint64_t inflight = 0;
		int i;

		/* Sum in-flight references over all per-cpu siblings. */
		for (i = 0; i < netisr_ncpus; ++i)
			inflight += rule->cross_rules[i]->cross_refs;
		if (inflight == 0) {
			struct ip_fw *f = rule;

			/*
			 * Unlink.
			 */
			rule = rule->next;
			if (prev != NULL)
				prev->next = rule;
			else
				ipfw_gd.ipfw_crossref_free = rule;

			/*
			 * Free.  Each sibling (cpu1..N) is freed on its
			 * owner cpu; the cpu0 copy is released last.
			 */
			for (i = 1; i < netisr_ncpus; ++i) {
				struct netmsg_base nm;

				netmsg_init(&nm, NULL, &curthread->td_msgport,
				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
				nm.lmsg.u.ms_resultp = f->cross_rules[i];
				netisr_domsg(&nm, i);
			}
			KKASSERT((f->rule_flags &
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
			ipfw_unref_rule(f);
		} else {
			prev = rule;
			rule = rule->next;
		}
	}

	if (ipfw_gd.ipfw_crossref_free != NULL) {
		/* Some rules are still in flight; retry in one second. */
		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
		    ipfw_crossref_timeo, NULL);
	}
}
6603
6604 /*
6605  * {set|get}sockopt parser.
6606  */
6607 static int
6608 ipfw_ctl(struct sockopt *sopt)
6609 {
6610         int error, rulenum;
6611         uint32_t *masks;
6612         size_t size;
6613
6614         ASSERT_NETISR0;
6615
6616         error = 0;
6617
6618         switch (sopt->sopt_name) {
6619         case IP_FW_GET:
6620                 error = ipfw_ctl_get_rules(sopt);
6621                 break;
6622
6623         case IP_FW_FLUSH:
6624                 ipfw_flush(0 /* keep default rule */);
6625                 break;
6626
6627         case IP_FW_ADD:
6628                 error = ipfw_ctl_add_rule(sopt);
6629                 break;
6630
6631         case IP_FW_DEL:
6632                 /*
6633                  * IP_FW_DEL is used for deleting single rules or sets,
6634                  * and (ab)used to atomically manipulate sets.
6635                  * Argument size is used to distinguish between the two:
6636                  *    sizeof(uint32_t)
6637                  *      delete single rule or set of rules,
6638                  *      or reassign rules (or sets) to a different set.
6639                  *    2 * sizeof(uint32_t)
6640                  *      atomic disable/enable sets.
6641                  *      first uint32_t contains sets to be disabled,
6642                  *      second uint32_t contains sets to be enabled.
6643                  */
6644                 masks = sopt->sopt_val;
6645                 size = sopt->sopt_valsize;
6646                 if (size == sizeof(*masks)) {
6647                         /*
6648                          * Delete or reassign static rule
6649                          */
6650                         error = ipfw_ctl_alter(masks[0]);
6651                 } else if (size == (2 * sizeof(*masks))) {
6652                         /*
6653                          * Set enable/disable
6654                          */
6655                         ipfw_ctl_set_disable(masks[0], masks[1]);
6656                 } else {
6657                         error = EINVAL;
6658                 }
6659                 break;
6660
6661         case IP_FW_ZERO:
6662         case IP_FW_RESETLOG: /* argument is an int, the rule number */
6663                 rulenum = 0;
6664
6665                 if (sopt->sopt_val != 0) {
6666                     error = soopt_to_kbuf(sopt, &rulenum,
6667                             sizeof(int), sizeof(int));
6668                     if (error)
6669                         break;
6670                 }
6671                 error = ipfw_ctl_zero_entry(rulenum,
6672                         sopt->sopt_name == IP_FW_RESETLOG);
6673                 break;
6674
6675         case IP_FW_TBL_CREATE:
6676                 error = ipfw_table_create(sopt);
6677                 break;
6678
6679         case IP_FW_TBL_ADD:
6680         case IP_FW_TBL_DEL:
6681                 error = ipfw_table_alt(sopt);
6682                 break;
6683
6684         case IP_FW_TBL_FLUSH:
6685         case IP_FW_TBL_DESTROY:
6686                 error = ipfw_table_flush(sopt);
6687                 break;
6688
6689         case IP_FW_TBL_GET:
6690                 error = ipfw_table_get(sopt);
6691                 break;
6692
6693         case IP_FW_TBL_ZERO:
6694                 error = ipfw_table_zero(sopt);
6695                 break;
6696
6697         case IP_FW_TBL_EXPIRE:
6698                 error = ipfw_table_expire(sopt);
6699                 break;
6700
6701         default:
6702                 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6703                 error = EINVAL;
6704         }
6705
6706         ipfw_crossref_reap();
6707         return error;
6708 }
6709
/*
 * Finish the keepalive scan on this cpu: clear the in-progress flag
 * and schedule the next periodic keepalive callout.
 */
static void
ipfw_keepalive_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
	    ipfw_keepalive, NULL);
}
6720
/*
 * The current keepalive round hit one of its per-round limits;
 * requeue the "more" netmsg to this cpu so the scan resumes in a
 * later netisr cycle instead of monopolizing the current one.
 */
static void
ipfw_keepalive_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("keepalive more did not finish"));
	netisr_sendmsg_oncpu(nm);
}
6732
/*
 * Scan the per-cpu state list starting at the anchor: remove dead
 * states along the way and send TCP keepalive segments for
 * established (BOTH_SYN) TCP states that are close to expiring.
 *
 * To bound the work done in one netisr cycle the scan pauses, via
 * ipfw_keepalive_more(), after ipfw_state_scan_max scanned states,
 * ipfw_state_expire_max removed states, or ipfw_keepalive_max
 * keepalives sent; the anchor keeps the list position between
 * rounds.  When the end of the list is reached the anchor is removed
 * and the round is finished with ipfw_keepalive_done().
 */
static void
ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0, kept = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		uint32_t ack_rev, ack_fwd;
		struct ipfw_flow_id id;
		uint8_t send_dir;

		if (scanned++ >= ipfw_state_scan_max) {
			ipfw_keepalive_more(ctx);
			return;
		}

		/* Advance the anchor past the state just examined. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		/*
		 * NOTE:
		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
		 * on slave xlat.
		 */
		if (s->st_type == O_ANCHOR)
			continue;

		if (IPFW_STATE_ISDEAD(s)) {
			ipfw_state_remove(ctx, s);
			if (++expired >= ipfw_state_expire_max) {
				ipfw_keepalive_more(ctx);
				return;
			}
			continue;
		}

		/*
		 * Keep alive processing
		 */

		if (s->st_proto != IPPROTO_TCP)
			continue;
		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
			continue;
		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
		    s->st_expire))
			continue;	/* too early */

		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
		    &id.dst_ip, &id.dst_port);
		ack_rev = s->st_ack_rev;
		ack_fwd = s->st_ack_fwd;

#define SEND_FWD	0x1
#define SEND_REV	0x2

		/* For xlat states only probe the direction being matched. */
		if (IPFW_ISXLAT(s->st_type)) {
			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;

			if (x->xlat_dir == MATCH_FORWARD)
				send_dir = SEND_FWD;
			else
				send_dir = SEND_REV;
		} else {
			send_dir = SEND_FWD | SEND_REV;
		}

		if (send_dir & SEND_REV)
			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
		if (send_dir & SEND_FWD)
			send_pkt(&id, ack_fwd - 1, ack_rev, 0);

#undef SEND_FWD
#undef SEND_REV

		if (++kept >= ipfw_keepalive_max) {
			ipfw_keepalive_more(ctx);
			return;
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_done(ctx);
}
6819
/*
 * Continue a keepalive scan previously paused by
 * ipfw_keepalive_more(): resume at the anchor left in the state
 * list, or finish immediately when keepalives were disabled or no
 * states remain.
 */
static void
ipfw_keepalive_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_keepalive_anch;
	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_keepalive_done(ctx);
		return;
	}
	ipfw_keepalive_loop(ctx, anchor);
}
6841
6842 /*
6843  * This procedure is only used to handle keepalives. It is invoked
6844  * every dyn_keepalive_period
6845  */
6846 static void
6847 ipfw_keepalive_dispatch(netmsg_t nm)
6848 {
6849         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6850         struct ipfw_state *anchor;
6851
6852         ASSERT_NETISR_NCPUS(mycpuid);
6853         KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6854             ("keepalive is in progress"));
6855         ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6856
6857         /* Reply ASAP */
6858         crit_enter();
6859         netisr_replymsg(&nm->base, 0);
6860         crit_exit();
6861
6862         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6863                 ipfw_keepalive_done(ctx);
6864                 return;
6865         }
6866
6867         anchor = &ctx->ipfw_keepalive_anch;
6868         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6869         ipfw_keepalive_loop(ctx, anchor);
6870 }
6871
6872 /*
6873  * This procedure is only used to handle keepalives. It is invoked
6874  * every dyn_keepalive_period
6875  */
6876 static void
6877 ipfw_keepalive(void *dummy __unused)
6878 {
6879         struct netmsg_base *msg;
6880
6881         KKASSERT(mycpuid < netisr_ncpus);
6882         msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6883
6884         crit_enter();
6885         if (msg->lmsg.ms_flags & MSGF_DONE)
6886                 netisr_sendmsg_oncpu(msg);
6887         crit_exit();
6888 }
6889
/*
 * Target-cpu half of ipfw_defrag_redispatch(): feed the redispatched
 * mbuf back into ip_input() with the continue-rule recorded in this
 * cpu's context, then drop the cross reference that kept the rule
 * alive while the mbuf was in flight.
 */
static void
ipfw_ip_input_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ip_fw *rule = nm->arg1;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	ctx->ipfw_cont_rule = rule;
	ip_input(m);

	/* May not be cleared, if ipfw was unload/disabled. */
	ctx->ipfw_cont_rule = NULL;

	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 */
	rule->cross_refs--;
}
6918
/*
 * Redispatch an mbuf to the given target cpu so that rule processing
 * continues there with that cpu's sibling of "rule" (used after
 * defragmentation changed the packet's dispatch cpu).
 */
static void
ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
{
	struct netmsg_genpkt *nm;

	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));

	/*
	 * NOTE:
	 * Bump cross_refs to prevent this rule and its siblings
	 * from being deleted, while this mbuf is inflight.  The
	 * cross_refs of the sibling rule on the target cpu will
	 * be decremented, once this mbuf is going to be filtered
	 * on the target cpu.
	 */
	rule->cross_refs++;
	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;

	nm = &m->m_hdr.mh_genmsg;
	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
	    ipfw_ip_input_dispatch);
	nm->m = m;
	nm->arg1 = rule->cross_rules[cpuid];
	netisr_sendmsg(&nm->base, cpuid);
}
6944
/*
 * Initialize the ipfw_chk() argument structure for an mbuf entering
 * the firewall, recovering any state attached to the mbuf by
 * dummynet (mbuf tag) or by a cross-cpu continuation (per-cpu
 * context set up in ipfw_ip_input_dispatch()).
 */
static void
ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
{

	args->flags = 0;
	args->rule = NULL;
	args->xlat = NULL;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		struct m_tag *mtag;

		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args->rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
		struct ipfw_context *ctx = ipfw_ctx[mycpuid];

		/* Pick up the continue rule stashed in the context. */
		KKASSERT(ctx->ipfw_cont_rule != NULL);
		args->rule = ctx->ipfw_cont_rule;
		ctx->ipfw_cont_rule = NULL;

		if (ctx->ipfw_cont_xlat != NULL) {
			args->xlat = ctx->ipfw_cont_xlat;
			ctx->ipfw_cont_xlat = NULL;
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
				args->flags |= IP_FWARG_F_XLATINS;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
			}
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
				args->flags |= IP_FWARG_F_XLATFWD;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
			}
		}
		KKASSERT((m->m_pkthdr.fw_flags &
		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);

		args->flags |= IP_FWARG_F_CONT;
		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
	}

	args->eh = NULL;
	args->oif = oif;
	args->m = m;
}
6994
/*
 * pfil(9) input hook: run the inbound mbuf through ipfw_chk() and
 * carry out the verdict (pass, deny, dummynet, divert/tee).
 *
 * On return *m0 is the surviving mbuf, or NULL when the packet was
 * consumed (dropped, queued to dummynet/divert, or redispatched to
 * another cpu).  Returns 0 or EACCES for denied packets.
 */
static int
ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	int tee = 0, error = 0, ret;

	ipfw_init_args(&args, m, NULL);

	ret = ipfw_chk(&args);
	m = args.m;
	if (m == NULL) {
		/* Redispatched mbufs are not an error. */
		if (ret != IP_FW_REDISPATCH)
			error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		/* Send packet to the appropriate pipe */
		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		/*
		 * Must clear bridge tag when changing
		 */
		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 1);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}
7053
/*
 * pfil(9) output hook: run the outbound mbuf through ipfw_chk() and
 * carry out the verdict (pass, deny, dummynet, divert/tee).
 *
 * On return *m0 is the surviving mbuf, or NULL when the packet was
 * consumed (dropped, queued to dummynet/divert, or redispatched to
 * another cpu).  Returns 0 or EACCES for denied packets.
 */
static int
ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	int tee = 0, error = 0, ret;

	ipfw_init_args(&args, m, ifp);

	ret = ipfw_chk(&args);
	m = args.m;
	if (m == NULL) {
		/* Redispatched mbufs are not an error. */
		if (ret != IP_FW_REDISPATCH)
			error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 0);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}
7107
7108 static void
7109 ipfw_hook(void)
7110 {
7111         struct pfil_head *pfh;
7112
7113         ASSERT_NETISR0;
7114
7115         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7116         if (pfh == NULL)
7117                 return;
7118
7119         pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7120         pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7121 }
7122
7123 static void
7124 ipfw_dehook(void)
7125 {
7126         struct pfil_head *pfh;
7127
7128         ASSERT_NETISR0;
7129
7130         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7131         if (pfh == NULL)
7132                 return;
7133
7134         pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7135         pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7136 }
7137
7138 static int
7139 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7140 {
7141         int dyn_cnt;
7142
7143         dyn_cnt = ipfw_state_cntcoll();
7144         dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7145
7146         return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7147 }
7148
7149 static int
7150 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7151 {
7152         int state_cnt;
7153
7154         state_cnt = ipfw_state_cntcoll();
7155         return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7156 }
7157
7158 static int
7159 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7160 {
7161         int state_max, error;
7162
7163         state_max = ipfw_state_max;
7164         error = sysctl_handle_int(oidp, &state_max, 0, req);
7165         if (error || req->newptr == NULL)
7166                 return (error);
7167
7168         if (state_max < 1)
7169                 return (EINVAL);
7170
7171         ipfw_state_max_set(state_max);
7172         return (0);
7173 }
7174
7175 static int
7176 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7177 {
7178         int dyn_max, error;
7179
7180         dyn_max = ipfw_state_max + ipfw_track_max;
7181
7182         error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7183         if (error || req->newptr == NULL)
7184                 return (error);
7185
7186         if (dyn_max < 2)
7187                 return (EINVAL);
7188
7189         ipfw_state_max_set(dyn_max / 2);
7190         ipfw_track_max = dyn_max / 2;
7191         return (0);
7192 }
7193
7194 static void
7195 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7196 {
7197         int enable = nmsg->lmsg.u.ms_result;
7198
7199         ASSERT_NETISR0;
7200
7201         if (fw_enable == enable)
7202                 goto reply;
7203
7204         fw_enable = enable;
7205         if (fw_enable)
7206                 ipfw_hook();
7207         else
7208                 ipfw_dehook();
7209 reply:
7210         netisr_replymsg(&nmsg->base, 0);
7211 }
7212
7213 static int
7214 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7215 {
7216         struct netmsg_base nmsg;
7217         int enable, error;
7218
7219         enable = fw_enable;
7220         error = sysctl_handle_int(oidp, &enable, 0, req);
7221         if (error || req->newptr == NULL)
7222                 return error;
7223
7224         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7225             ipfw_sysctl_enable_dispatch);
7226         nmsg.lmsg.u.ms_result = enable;
7227
7228         return netisr_domsg(&nmsg, 0);
7229 }
7230
/*
 * Sysctl handler: clamp the rule auto-numbering step to the
 * [IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX] range.
 */
static int
ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
{
	return sysctl_int_range(oidp, arg1, arg2, req,
	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
}
7237
/*
 * Sysctl handler shared by the per-round scan/expire/keepalive
 * limits; accepts any value in [1, INT_MAX].
 */
static int
ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
{

	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
}
7244
7245 static int
7246 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7247 {
7248         u_long stat = 0;
7249         int cpu, error;
7250
7251         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7252                 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7253
7254         error = sysctl_handle_long(oidp, &stat, 0, req);
7255         if (error || req->newptr == NULL)
7256                 return (error);
7257
7258         /* Zero out this stat. */
7259         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7260                 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7261         return (0);
7262 }
7263
/*
 * Per-cpu context creation; forwarded from cpu0 through all netisr
 * cpus in turn.  Allocates and initializes this cpu's ipfw_context
 * (state/track trees and lists, expire/keepalive/xlat-reap callouts
 * and netmsgs) and installs this cpu's copy of the default rule at
 * the head of the rule chain.
 */
static void
ipfw_ctx_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx;
	struct ip_fw *def_rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/* The context embeds a variable-sized table array at its tail. */
	ctx = kmalloc(__offsetof(struct ipfw_context,
	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);

	RB_INIT(&ctx->ipfw_state_tree);
	TAILQ_INIT(&ctx->ipfw_state_list);

	RB_INIT(&ctx->ipfw_track_tree);
	TAILQ_INIT(&ctx->ipfw_track_list);

	/* State expiration callout and dropable dispatch messages. */
	callout_init_mp(&ctx->ipfw_stateto_ch);
	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);

	/* Track expiration callout and dispatch messages. */
	callout_init_mp(&ctx->ipfw_trackto_ch);
	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);

	/* Keepalive callout and dispatch messages. */
	callout_init_mp(&ctx->ipfw_keepalive_ch);
	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);

	/* xlat reap callout, dispatch message and reap list. */
	callout_init_mp(&ctx->ipfw_xlatreap_ch);
	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
	TAILQ_INIT(&ctx->ipfw_xlatreap);

	ipfw_ctx[mycpuid] = ctx;

	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);

	def_rule->act_ofs = 0;
	def_rule->rulenum = IPFW_DEFAULT_RULE;
	def_rule->cmd_len = 1;
	def_rule->set = IPFW_DEFAULT_SET;

	def_rule->cmd[0].len = 1;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	def_rule->cmd[0].opcode = O_ACCEPT;
#else
	/* Default policy is chosen by tunable unless compiled in. */
	if (filters_default_to_accept)
		def_rule->cmd[0].opcode = O_ACCEPT;
	else
		def_rule->cmd[0].opcode = O_DENY;
#endif

	def_rule->refcnt = 1;
	def_rule->cpuid = mycpuid;

	/* Install the default rule */
	ctx->ipfw_default_rule = def_rule;
	ctx->ipfw_layer3_chain = def_rule;

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, def_rule);

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_inc_static_count(def_rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
7342
7343 static void
7344 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7345 {
7346
7347         crit_enter();
7348         /* Reply ASAP */
7349         netisr_replymsg(&nmsg->base, 0);
7350         crit_exit();
7351         ipfw_crossref_reap();
7352 }
7353
7354 static void
7355 ipfw_crossref_timeo(void *dummy __unused)
7356 {
7357         struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7358
7359         KKASSERT(mycpuid == 0);
7360
7361         crit_enter();
7362         if (msg->lmsg.ms_flags & MSGF_DONE)
7363                 netisr_sendmsg_oncpu(msg);
7364         crit_exit();
7365 }
7366
7367 static void
7368 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7369 {
7370         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7371         struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7372         struct ip_fw *f;
7373
7374         ASSERT_NETISR_NCPUS(mycpuid);
7375
7376         for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7377                 int l, cmdlen;
7378                 ipfw_insn *cmd;
7379
7380                 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7381                         continue;
7382
7383                 for (l = f->cmd_len, cmd = f->cmd; l > 0;
7384                      l -= cmdlen, cmd += cmdlen) {
7385                         cmdlen = F_LEN(cmd);
7386                         if (cmd->opcode == O_IP_SRC_IFIP ||
7387                             cmd->opcode == O_IP_DST_IFIP) {
7388                                 if (strncmp(ifp->if_xname,
7389                                     ((ipfw_insn_ifip *)cmd)->ifname,
7390                                     IFNAMSIZ) == 0)
7391                                         cmd->arg1 &= ~IPFW_IFIP_VALID;
7392                         }
7393                 }
7394         }
7395         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7396 }
7397
7398 static void
7399 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7400     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7401 {
7402         struct netmsg_base nm;
7403
7404         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7405             ipfw_ifaddr_dispatch);
7406         nm.lmsg.u.ms_resultp = ifp;
7407         netisr_domsg_global(&nm);
7408 }
7409
/*
 * Firewall bring-up; runs in netisr0.  Initializes global structures,
 * creates the per-cpu contexts, installs the ipfw function pointers,
 * starts the periodic expire/keepalive callouts and, if enabled,
 * hooks the firewall into the packet path.
 */
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	/* Sanitize the tunable; table ids are 16 bits. */
	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
		ipfw_table_max = UINT16_MAX;

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	/* GC for freed crossref rules. */
	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);

	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	/* Create the per-cpu contexts on all netisr cpus. */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	/* Publish the ipfw entry points. */
	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	kprintf("ipfw2 initialized, default to %s, logging ",
		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
		O_ACCEPT ? "accept" : "deny");

#ifdef IPFIREWALL_VERBOSE
	fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
	if (fw_verbose == 0) {
		kprintf("disabled\n");
	} else if (verbose_limit == 0) {
		kprintf("unlimited\n");
	} else {
		kprintf("limited to %d packets/entry by default\n",
			verbose_limit);
	}

	ip_fw_loaded = 1;
	/* Start the periodic state/track expire and keepalive callouts. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
		    ipfw_state_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
		    ipfw_track_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
		    ipfw_keepalive, NULL, cpu);
	}

	if (fw_enable)
		ipfw_hook();

	/* Track interface address changes for O_IP_{SRC,DST}_IFIP. */
	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
	    NULL, EVENTHANDLER_PRI_ANY);
	if (ipfw_ifaddr_event == NULL)
		kprintf("ipfw: ifaddr_event register failed\n");

reply:
	netisr_replymsg(&nmsg->base, error);
}
7488
7489 static int
7490 ipfw_init(void)
7491 {
7492         struct netmsg_base smsg;
7493
7494         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7495             ipfw_init_dispatch);
7496         return netisr_domsg(&smsg, 0);
7497 }
7498
7499 #ifdef KLD_MODULE
7500
/*
 * Per-cpu context teardown; forwarded through all netisr cpus.
 * Stops this cpu's callouts, drops any pending dropable netmsgs and
 * flushes the tables.  The contexts themselves are freed later by
 * ipfw_fini_dispatch() once all cpus are done.
 */
static void
ipfw_ctx_fini_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Stop the periodic callouts first so nothing gets resent. */
	callout_cancel(&ctx->ipfw_stateto_ch);
	callout_cancel(&ctx->ipfw_trackto_ch);
	callout_cancel(&ctx->ipfw_keepalive_ch);
	callout_cancel(&ctx->ipfw_xlatreap_ch);

	/* Then drain any dropable messages still queued. */
	crit_enter();
	netisr_dropmsg(&ctx->ipfw_stateexp_more);
	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
	netisr_dropmsg(&ctx->ipfw_trackexp_more);
	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
	netisr_dropmsg(&ctx->ipfw_keepalive_more);
	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
	crit_exit();

	ipfw_table_flushall_oncpu(ctx, 1);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
7527
/*
 * Firewall teardown; runs in netisr0.  Fails with EBUSY while rules
 * are still referenced; otherwise unhooks the firewall, tears down
 * the per-cpu contexts, removes the entry points and frees all
 * remaining rules including the default rule.
 */
static void
ipfw_fini_dispatch(netmsg_t nmsg)
{
	struct netmsg_base nm;
	int error = 0, cpu;

	ASSERT_NETISR0;

	ipfw_crossref_reap();

	/* Still-referenced rules prevent unload. */
	if (ipfw_gd.ipfw_refcnt != 0) {
		error = EBUSY;
		goto reply;
	}

	ip_fw_loaded = 0;
	ipfw_dehook();

	/* Synchronize any inflight state/track expire IPIs. */
	lwkt_synchronize_ipiqs("ipfwfini");

	/* Per-cpu teardown on all netisr cpus. */
	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_fini_dispatch);
	netisr_domsg_global(&nm);

	/* Stop the crossref GC timer and drop its pending message. */
	callout_cancel(&ipfw_gd.ipfw_crossref_ch);
	crit_enter();
	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
	crit_exit();

	if (ipfw_ifaddr_event != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);

	ip_fw_chk_ptr = NULL;
	ip_fw_ctl_ptr = NULL;
	ip_fw_dn_io_ptr = NULL;
	ipfw_flush(1 /* kill default rule */);

	/* Free pre-cpu context */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		kfree(ipfw_ctx[cpu], M_IPFW);

	kprintf("IP firewall unloaded\n");
reply:
	netisr_replymsg(&nmsg->base, error);
}
7574
7575 static void
7576 ipfw_fflush_dispatch(netmsg_t nmsg)
7577 {
7578
7579         ipfw_flush(0 /* keep default rule */);
7580         ipfw_crossref_reap();
7581         netisr_replymsg(&nmsg->base, 0);
7582 }
7583
7584 static int
7585 ipfw_fini(void)
7586 {
7587         struct netmsg_base smsg;
7588         int i = 0;
7589
7590         for (;;) {
7591                 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7592                     ipfw_fflush_dispatch);
7593                 netisr_domsg(&smsg, 0);
7594
7595                 if (ipfw_gd.ipfw_refcnt == 0)
7596                         break;
7597                 kprintf("ipfw: flush pending %d\n", ++i);
7598                 tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7599         }
7600
7601         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7602             ipfw_fini_dispatch);
7603         return netisr_domsg(&smsg, 0);
7604 }
7605
7606 #endif  /* KLD_MODULE */
7607
7608 static int
7609 ipfw_modevent(module_t mod, int type, void *unused)
7610 {
7611         int err = 0;
7612
7613         switch (type) {
7614         case MOD_LOAD:
7615                 err = ipfw_init();
7616                 break;
7617
7618         case MOD_UNLOAD:
7619 #ifndef KLD_MODULE
7620                 kprintf("ipfw statically compiled, cannot unload\n");
7621                 err = EBUSY;
7622 #else
7623                 err = ipfw_fini();
7624 #endif
7625                 break;
7626         default:
7627                 break;
7628         }
7629         return err;
7630 }
7631
/* Module glue: register ipfw at the end of protocol initialization. */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);