nrelease - fix/improve livecd
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <net/netmsg2.h>
61
62 #include <netinet/in.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in_var.h>
65 #include <netinet/in_pcb.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_icmp.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_timer.h>
72 #include <netinet/tcp_var.h>
73 #include <netinet/tcpip.h>
74 #include <netinet/udp.h>
75 #include <netinet/udp_var.h>
76 #include <netinet/ip_divert.h>
77 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
78
79 #include <net/ipfw/ip_fw2.h>
80
#ifdef IPFIREWALL_DEBUG
/*
 * Debug printing, gated at runtime by the fw_debug sysctl
 * (net.inet.ip.fw.debug) and compiled out entirely unless
 * IPFIREWALL_DEBUG is defined.
 *
 * NOTE: ##__VA_ARGS__ (GNU extension, used throughout the kernel)
 * swallows the trailing comma, so DPRINTF("msg\n") with no variadic
 * arguments is legal; plain __VA_ARGS__ failed to compile that case.
 */
#define DPRINTF(fmt, ...) \
do { \
	if (fw_debug > 0) \
		kprintf(fmt, ##__VA_ARGS__); \
} while (0)
#else
#define DPRINTF(fmt, ...)	((void)0)
#endif
90
91 /*
92  * Description about per-CPU rule duplication:
93  *
94  * Module loading/unloading and all ioctl operations are serialized
95  * by netisr0, so we don't have any ordering or locking problems.
96  *
97  * Following graph shows how operation on per-CPU rule list is
98  * performed [2 CPU case]:
99  *
100  *   CPU0                 CPU1
101  *
102  * netisr0 <------------------------------------+
103  *  domsg                                       |
104  *    :                                         |
105  *    :(delete/add...)                          |
106  *    :                                         |
107  *    :         netmsg                          | netmsg
108  *  forwardmsg---------->netisr1                |
109  *                          :                   |
110  *                          :(delete/add...)    |
111  *                          :                   |
112  *                          :                   |
113  *                        replymsg--------------+
114  *
115  *
116  *
117  * Rule structure [2 CPU case]
118  *
119  *    CPU0               CPU1
120  *
121  * layer3_chain       layer3_chain
122  *     |                  |
123  *     V                  V
124  * +-------+ sibling  +-------+ sibling
125  * | rule1 |--------->| rule1 |--------->NULL
126  * +-------+          +-------+
127  *     |                  |
128  *     |next              |next
129  *     V                  V
130  * +-------+ sibling  +-------+ sibling
131  * | rule2 |--------->| rule2 |--------->NULL
132  * +-------+          +-------+
133  *
134  * ip_fw.sibling:
135  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
136  *    iterate layer3_chain in netisr0; the current rule's duplication
137  *    to the other CPUs could safely be read-only accessed through
138  *    ip_fw.sibling.
139  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
140  *    a) In netisr0 rule3 is determined to be inserted between rule1
141  *       and rule2.  To make this decision we need to iterate the
142  *       layer3_chain in netisr0.  The netmsg, which is used to insert
143  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
144  *       in netisr0 as next_rule.
145  *    b) After the insertion in netisr0 is done, we will move on to
146  *       netisr1.  But instead of relocating the rule3's position in
147  *       netisr1 by iterating the layer3_chain in netisr1, we set the
148  *       netmsg's prev_rule to rule1->sibling and next_rule to
149  *       rule2->sibling before the netmsg is forwarded to netisr1 from
150  *       netisr0.
151  */
152
153 /*
154  * Description of states and tracks.
155  *
156  * Both states and tracks are stored in per-cpu RB trees instead of
157  * per-cpu hash tables to avoid the worst case hash degeneration.
158  *
159  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
160  * measured in seconds and depending on the flags.
161  *
162  * When a packet is received, its address fields are first masked with
163  * the mask defined for the rule, then matched against the entries in
164  * the per-cpu state RB tree.  States are generated by 'keep-state'
165  * and 'limit' options.
166  *
167  * The max number of states is ipfw_state_max.  When we reach the
168  * maximum number of states we do not create anymore.  This is done to
169  * avoid consuming too much memory, but also too much time when
170  * searching on each packet.
171  *
172  * Each state holds a pointer to the parent ipfw rule of the current
173  * CPU so we know what action to perform.  States are removed when the
174  * parent rule is deleted.  XXX we should make them survive.
175  *
176  * There are some limitations with states -- we do not obey the
177  * 'randomized match', and we do not do multiple passes through the
178  * firewall.  XXX check the latter!!!
179  *
180  * States grow independently on each CPU, e.g. 2 CPU case:
181  *
182  *        CPU0                     CPU1
183  * ...................      ...................
184  * :  state RB tree  :      :  state RB tree  :
185  * :                 :      :                 :
186  * : state1   state2 :      :      state3     :
187  * :     |    |      :      :        |        :
188  * :.....|....|......:      :........|........:
189  *       |    |                      |
190  *       |    |                      |st_rule
191  *       |    |                      |
192  *       V    V                      V
193  *     +-------+                 +-------+
194  *     | rule1 |                 | rule1 |
195  *     +-------+                 +-------+
196  *
197  * Tracks are used to enforce limits on the number of sessions.  Tracks
198  * are generated by 'limit' option.
199  *
200  * The max number of tracks is ipfw_track_max.  When we reach the
201  * maximum number of tracks we do not create anymore.  This is done to
202  * avoid consuming too much memory.
203  *
204  * Tracks are organized into two layers, track counter RB tree is
205  * shared between CPUs, track RB tree is per-cpu.  States generated by
206  * 'limit' option are linked to the track in addition to the per-cpu
207  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
208  *
209  *             ..............................
210  *             :    track counter RB tree   :
211  *             :                            :
212  *             :        +-----------+       :
213  *             :        |  trkcnt1  |       :
214  *             :        |           |       :
215  *             :      +--->counter<----+    :
216  *             :      | |           |  |    :
217  *             :      | +-----------+  |    :
218  *             :......|................|....:
219  *                    |                |
220  *        CPU0        |                |         CPU1
221  * .................  |t_count         |  .................
222  * : track RB tree :  |                |  : track RB tree :
223  * :               :  |                |  :               :
224  * : +-->track1-------+                +--------track2    :
225  * : |     A       :                      :               :
226  * : |     |       :                      :               :
227  * :.|.....|.......:                      :...............:
228  *   |     +----------------+
229  *   | .................... |
230  *   | :   state RB tree  : |st_track
231  *   | :                  : |
232  *   +---state1    state2---+
233  *     :     |       |    :
234  *     :.....|.......|....:
235  *           |       |
236  *           |       |st_rule
237  *           V       V
238  *         +----------+
239  *         |   rule1  |
240  *         +----------+
241  */
242
/* Bounds and default for the rule-number auto-increment step. */
#define IPFW_AUTOINC_STEP_MIN	1
#define IPFW_AUTOINC_STEP_MAX	1000
#define IPFW_AUTOINC_STEP_DEF	100

/* Default maximum number of lookup tables. */
#define IPFW_TABLE_MAX_DEF	64

#define IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
#define IPFW_DEFAULT_SET	31	/* set number for the default rule */

/* Direction/result of a state lookup. */
#define MATCH_REVERSE		0
#define MATCH_FORWARD		1
#define MATCH_NONE		2
#define MATCH_UNKNOWN		3

/*
 * Wraparound-safe "a <= b": the subtraction is evaluated modulo the
 * integer width, so the comparison stays correct across counter wrap.
 */
#define TIME_LEQ(a, b)		((a) - (b) <= 0)

/*
 * TCP flags tracked in a state's st_state field: one direction's
 * flags live in the low byte, the other direction's are shifted
 * left by 8 (see the BOTH_* macros below).
 */
#define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
#define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
				 (IPFW_STATE_TCPFLAGS << 8))

/* Flag observed in both directions (low byte | high byte). */
#define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
#define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
#define BOTH_RST		(TH_RST | (TH_RST << 8))
/* TH_ACK here means FIN was ACKed. */
#define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))

/*
 * A TCP state is considered closed once either side sent RST, or
 * the FINs of both sides have been ACKed.
 */
#define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
				 (((s)->st_state & BOTH_RST) ||		\
				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))

/*
 * O_NOP doubles as the opcode marking anchor entries used by the
 * expire/keepalive scanners (see ipfw_stateexp_anch et al. below).
 */
#define O_ANCHOR		O_NOP

/* Translation (redirect) state classification. */
#define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
/* Translation state that was invalidated but not yet destroyed. */
#define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
				 ((struct ipfw_xlat *)(s))->xlat_invalid)

/* mbuf flags used by the translation path. */
#define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
#define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2

/* Translation operation flags (IPFW_XLATE_*). */
#define IPFW_XLATE_INSERT	0x0001
#define IPFW_XLATE_FORWARD	0x0002
#define IPFW_XLATE_OUTPUT	0x0004
/*
 * Netmsg carrying a rule-add operation as it is forwarded from
 * netisr0 through the other netisrs (see the rule duplication
 * description above).  prev_rule/next_rule are the insertion
 * neighbors on the current CPU; sibling links the freshly created
 * per-CPU copies of the rule together.
 */
struct netmsg_ipfw {
	struct netmsg_base	base;
	const struct ipfw_ioc_rule *ioc_rule;
	struct ip_fw		*next_rule;
	struct ip_fw		*prev_rule;
	struct ip_fw		*sibling;
	uint32_t		rule_flags;
	struct ip_fw		**cross_rules;
};

/* Netmsg for rule deletion and set move/swap operations. */
struct netmsg_del {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	struct ip_fw		*prev_rule;
	uint16_t		rulenum;
	uint8_t			from_set;
	uint8_t			to_set;
};

/* Netmsg for zeroing rule counters (or the log counter only). */
struct netmsg_zent {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	uint16_t		rulenum;
	uint16_t		log_only;
};

/* Netmsg collecting per-CPU states into an ioctl buffer (ioc_state). */
struct netmsg_cpstate {
	struct netmsg_base	base;
	struct ipfw_ioc_state	*ioc_state;
	int			state_cntmax;
	int			state_cnt;
};

/*
 * Netmsg replicating a table entry operation across CPUs; sibling
 * chains the per-CPU copies of the entry (cf. ipfw_tblent).
 */
struct netmsg_tblent {
	struct netmsg_base	base;
	struct sockaddr		*key;
	struct sockaddr		*netmask;
	struct ipfw_tblent	*sibling;
	int			tableid;
};

/* Netmsg flushing (and optionally destroying) a table on each CPU. */
struct netmsg_tblflush {
	struct netmsg_base	base;
	int			tableid;
	int			destroy;
};

/* Netmsg expiring table entries; cnt/expcnt accumulate results. */
struct netmsg_tblexp {
	struct netmsg_base	base;
	time_t			expire;
	int			tableid;
	int			cnt;
	int			expcnt;
	struct radix_node_head	*rnh;
};

/* Iteration cursor used while copying a table out to userland. */
struct ipfw_table_cp {
	struct ipfw_ioc_tblent	*te;
	int			te_idx;
	int			te_cnt;
};
347
/*
 * Local, per-packet copies of header fields shared by the
 * rule-matching code so the mbuf is parsed only once.
 */
struct ip_fw_local {
	/*
	 * offset	The offset of a fragment. offset != 0 means that
	 *	we have a fragment at this offset of an IPv4 packet.
	 *	offset == 0 means that (if this is an IPv4 packet)
	 *	this is the first or only fragment.
	 */
	u_short			offset;

	/*
	 * Local copies of addresses. They are only valid if we have
	 * an IP packet.
	 *
	 * proto	The protocol. Set to 0 for non-ip packets,
	 *	or to the protocol read from the packet otherwise.
	 *	proto != 0 means that we have an IPv4 packet.
	 *
	 * src_port, dst_port	port numbers, in HOST format. Only
	 *	valid for TCP and UDP packets.
	 *
	 * src_ip, dst_ip	ip addresses, in NETWORK format.
	 *	Only valid for IPv4 packets.
	 */
	uint8_t			proto;
	uint16_t		src_port;	/* NOTE: host format	*/
	uint16_t		dst_port;	/* NOTE: host format	*/
	struct in_addr		src_ip;		/* NOTE: network format */
	struct in_addr		dst_ip;		/* NOTE: network format */
	uint16_t		ip_len;		/* NOTE: host format	*/
	struct tcphdr		*tcp;
};

/* Address pair half of an ipfw_key; aliased as one 64-bit word. */
struct ipfw_addrs {
	uint32_t		addr1;	/* host byte order */
	uint32_t		addr2;	/* host byte order */
};

/* Port pair half of an ipfw_key; aliased as one 32-bit word. */
struct ipfw_ports {
	uint16_t		port1;	/* host byte order */
	uint16_t		port2;	/* host byte order */
};
389
/*
 * Canonical flow key for states and tracks.  The address and port
 * pairs alias plain integers (see ipfw_addrs/ipfw_ports) so they can
 * be compared/hashed as single words; 'swap' records which pairs
 * were swapped into canonical order (IPFW_KEY_SWAP_*).
 */
struct ipfw_key {
	union {
		struct ipfw_addrs addrs;
		uint64_t	value;
	} addr_u;
	union {
		struct ipfw_ports ports;
		uint32_t	value;
	} port_u;
	uint8_t			proto;
	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
	uint16_t		rsvd2;	/* padding, keeps struct at 16 bytes */
};

#define IPFW_KEY_SWAP_ADDRS	0x1
#define IPFW_KEY_SWAP_PORTS	0x2
#define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
407
/*
 * Global (cross-CPU) session counter for the 'limit' option.  Lives
 * in the shared track counter RB tree (see the track description
 * above) and is referenced by per-cpu ipfw_track entries through
 * t_trkcnt/t_count.
 */
struct ipfw_trkcnt {
	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
	struct ipfw_key		tc_key;
	uintptr_t		tc_ruleid;
	int			tc_refs;	/* referencing tracks */
	int			tc_count;	/* shared session counter */
	time_t			tc_expire;	/* userland get-only */
	uint16_t		tc_rulenum;	/* userland get-only */
} __cachealign;

/* Shorthand accessors into the embedded key. */
#define tc_addrs		tc_key.addr_u.value
#define tc_ports		tc_key.port_u.value
#define tc_proto		tc_key.proto
#define tc_saddr		tc_key.addr_u.addrs.addr1
#define tc_daddr		tc_key.addr_u.addrs.addr2
#define tc_sport		tc_key.port_u.ports.port1
#define tc_dport		tc_key.port_u.ports.port2

RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
427
struct ipfw_state;

/*
 * Per-cpu track, created by the 'limit' option.  Keeps the states it
 * limits on t_state_list and shares its session counter with the
 * other CPUs through t_trkcnt (see the track description above);
 * t_count points directly at that shared counter.
 */
struct ipfw_track {
	RB_ENTRY(ipfw_track)	t_rblink;
	struct ipfw_key		t_key;
	struct ip_fw		*t_rule;
	time_t			t_lastexp;
	LIST_HEAD(, ipfw_state) t_state_list;
	time_t			t_expire;
	volatile int		*t_count;	/* -> t_trkcnt's counter */
	struct ipfw_trkcnt	*t_trkcnt;
	TAILQ_ENTRY(ipfw_track) t_link;
};

/* Shorthand accessors into the embedded key. */
#define t_addrs			t_key.addr_u.value
#define t_ports			t_key.port_u.value
#define t_proto			t_key.proto
#define t_saddr			t_key.addr_u.addrs.addr1
#define t_daddr			t_key.addr_u.addrs.addr2
#define t_sport			t_key.port_u.ports.port1
#define t_dport			t_key.port_u.ports.port2

RB_HEAD(ipfw_track_tree, ipfw_track);
TAILQ_HEAD(ipfw_track_list, ipfw_track);
452
/*
 * Per-cpu dynamic state, created by the 'keep-state'/'limit'/redirect
 * options (st_type).  Keyed by the masked flow key in st_key; expired
 * and kept alive by the per-cpu scanners (see ipfw_context).
 */
struct ipfw_state {
	RB_ENTRY(ipfw_state)	st_rblink;
	struct ipfw_key		st_key;

	time_t			st_expire;	/* expire time */
	struct ip_fw		*st_rule;	/* parent rule on this cpu */

	uint64_t		st_pcnt;	/* packets */
	uint64_t		st_bcnt;	/* bytes */

	/*
	 * st_state:
	 * State of this rule, typically a combination of TCP flags.
	 *
	 * st_ack_fwd/st_ack_rev:
	 * Most recent ACKs in forward and reverse direction.  They
	 * are used to generate keepalives.
	 */
	uint32_t		st_state;
	uint32_t		st_ack_fwd;	/* host byte order */
	uint32_t		st_seq_fwd;	/* host byte order */
	uint32_t		st_ack_rev;	/* host byte order */
	uint32_t		st_seq_rev;	/* host byte order */

	uint16_t		st_flags;	/* IPFW_STATE_F_ */
	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
	struct ipfw_track	*st_track;	/* non-NULL for 'limit' */

	LIST_ENTRY(ipfw_state)	st_trklink;	/* on t_state_list */
	TAILQ_ENTRY(ipfw_state) st_link;
};

/* Shorthand accessors into the embedded key. */
#define st_addrs		st_key.addr_u.value
#define st_ports		st_key.port_u.value
#define st_proto		st_key.proto
#define st_swap			st_key.swap

/* st_flags; the first four track which seq/ack snapshots are valid. */
#define IPFW_STATE_F_ACKFWD	0x0001
#define IPFW_STATE_F_SEQFWD	0x0002
#define IPFW_STATE_F_ACKREV	0x0004
#define IPFW_STATE_F_SEQREV	0x0008
#define IPFW_STATE_F_XLATSRC	0x0010
#define IPFW_STATE_F_XLATSLAVE	0x0020
#define IPFW_STATE_F_LINKED	0x0040

/* Skip scan anchors and slave translation states during tree scans. */
#define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))

/* Expired or being deleted. */
#define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) || \
				 IPFW_XLAT_INVALID((s)))

TAILQ_HEAD(ipfw_state_list, ipfw_state);
RB_HEAD(ipfw_state_tree, ipfw_state);
507
/*
 * Address/port translation (O_REDIRECT) state.  Embeds an ipfw_state
 * as its first field so it can live in the state tree and be cast
 * back and forth; xlat_pair is the companion state for the opposite
 * direction, installed on cpu xlat_pcpu.
 */
struct ipfw_xlat {
	struct ipfw_state	xlat_st;	/* MUST be the first field */
	uint32_t		xlat_addr;	/* network byte order */
	uint16_t		xlat_port;	/* network byte order */
	uint16_t		xlat_dir;	/* MATCH_ */
	struct ifnet		*xlat_ifp;	/* matching ifnet */
	struct ipfw_xlat	*xlat_pair;	/* paired state */
	int			xlat_pcpu;	/* paired cpu */
	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
	volatile uint64_t	xlat_crefs;	/* cross references */
	struct netmsg_base	xlat_freenm;	/* for remote free */
};

/* Accessors into the embedded state. */
#define xlat_type		xlat_st.st_type
#define xlat_flags		xlat_st.st_flags
#define xlat_rule		xlat_st.st_rule
#define xlat_bcnt		xlat_st.st_bcnt
#define xlat_pcnt		xlat_st.st_pcnt

/*
 * Lookup table entry.  Each CPU has its own copy; the copies are
 * chained across CPUs through te_sibling.
 */
struct ipfw_tblent {
	struct radix_node	te_nodes[2];
	struct sockaddr_in	te_key;
	u_long			te_use;		/* use counter */
	time_t			te_lastuse;	/* last use timestamp */
	struct ipfw_tblent	*te_sibling;	/* same entry, next cpu */
	volatile int		te_expired;
};
535
/*
 * Per-cpu firewall context (ipfw_ctx[cpuid]).  Holds this CPU's copy
 * of the rule list, its state/track RB trees, and the per-cpu
 * expiration/keepalive machinery.  ipfw_tables[] is a flexible array
 * member sized at allocation time (presumably ipfw_table_max entries
 * — confirm at the allocation site).
 */
struct ipfw_context {
	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
	struct ip_fw		*ipfw_default_rule;	/* default rule */
	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/

	/*
	 * ipfw_set_disable contains one bit per set value (0..31).
	 * If the bit is set, all rules with the corresponding set
	 * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
	 * default rule and CANNOT be disabled.
	 */
	uint32_t		ipfw_set_disable;

	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */

	/*
	 * NOTE(review): continuation points for a resumed rule walk /
	 * translation — confirm exact semantics in the checker code.
	 */
	struct ip_fw		*ipfw_cont_rule;
	struct ipfw_xlat	*ipfw_cont_xlat;

	struct ipfw_state_tree	ipfw_state_tree;
	struct ipfw_state_list	ipfw_state_list;
	int			ipfw_state_loosecnt;
	int			ipfw_state_cnt;

	/* Scratch entry used to build lookup keys for the RB trees. */
	union {
		struct ipfw_state state;
		struct ipfw_track track;
		struct ipfw_trkcnt trkcnt;
	} ipfw_tmpkey;

	struct ipfw_track_tree	ipfw_track_tree;
	struct ipfw_track_list	ipfw_track_list;
	struct ipfw_trkcnt	*ipfw_trkcnt_spare;

	/* Periodic state expiration. */
	struct callout		ipfw_stateto_ch;
	time_t			ipfw_state_lastexp;
	struct netmsg_base	ipfw_stateexp_nm;
	struct netmsg_base	ipfw_stateexp_more;
	struct ipfw_state	ipfw_stateexp_anch;	/* scan anchor */

	/* Periodic track expiration. */
	struct callout		ipfw_trackto_ch;
	time_t			ipfw_track_lastexp;
	struct netmsg_base	ipfw_trackexp_nm;
	struct netmsg_base	ipfw_trackexp_more;
	struct ipfw_track	ipfw_trackexp_anch;	/* scan anchor */

	/* TCP keepalive generation. */
	struct callout		ipfw_keepalive_ch;
	struct netmsg_base	ipfw_keepalive_nm;
	struct netmsg_base	ipfw_keepalive_more;
	struct ipfw_state	ipfw_keepalive_anch;	/* scan anchor */

	/* Deferred reaping of translation states. */
	struct callout		ipfw_xlatreap_ch;
	struct netmsg_base	ipfw_xlatreap_nm;
	struct ipfw_state_list	ipfw_xlatreap;

	/*
	 * Statistics
	 */
	u_long			ipfw_sts_reap;
	u_long			ipfw_sts_reapfailed;
	u_long			ipfw_sts_overflow;
	u_long			ipfw_sts_nomem;
	u_long			ipfw_sts_tcprecycled;

	u_long			ipfw_tks_nomem;
	u_long			ipfw_tks_reap;
	u_long			ipfw_tks_reapfailed;
	u_long			ipfw_tks_overflow;
	u_long			ipfw_tks_cntnomem;

	u_long			ipfw_frags;
	u_long			ipfw_defraged;
	u_long			ipfw_defrag_remote;

	u_long			ipfw_xlated;
	u_long			ipfw_xlate_split;
	u_long			ipfw_xlate_conflicts;
	u_long			ipfw_xlate_cresolved;

	/* Last field */
	struct radix_node_head	*ipfw_tables[];
};

/* ipfw_flags */
#define IPFW_FLAG_KEEPALIVE	0x01
#define IPFW_FLAG_STATEEXP	0x02
#define IPFW_FLAG_TRACKEXP	0x04
#define IPFW_FLAG_STATEREAP	0x08
#define IPFW_FLAG_TRACKREAP	0x10

/* Aliases for the scratch lookup-key union. */
#define ipfw_state_tmpkey	ipfw_tmpkey.state
#define ipfw_track_tmpkey	ipfw_tmpkey.track
#define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
627
/*
 * Global (cross-CPU) firewall data.  The shared track counter tree
 * is protected by ipfw_trkcnt_token; the crossref fields are only
 * touched from netisr0.
 */
struct ipfw_global {
	int			ipfw_state_loosecnt;	/* cache aligned */
	time_t			ipfw_state_globexp __cachealign;

	struct lwkt_token	ipfw_trkcnt_token __cachealign;
	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
	int			ipfw_trkcnt_cnt;
	time_t			ipfw_track_globexp;

	/* Accessed in netisr0. */
	struct ip_fw		*ipfw_crossref_free __cachealign;
	struct callout		ipfw_crossref_ch;
	struct netmsg_base	ipfw_crossref_nm;

#ifdef KLD_MODULE
	/*
	 * Module can not be unloaded, if there are references to
	 * certain rules of ipfw(4), e.g. dummynet(4)
	 */
	int			ipfw_refcnt __cachealign;
#endif
} __cachealign;
650
static struct ipfw_context	*ipfw_ctx[MAXCPU];	/* per-cpu contexts */

MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");

/*
 * Following two global variables are accessed and updated only
 * in netisr0.
 */
static uint32_t static_count;	/* # of static rules */
static uint32_t static_ioc_len;	/* bytes of static rules */

/*
 * If 1, then ipfw static rules are being flushed,
 * ipfw_chk() will skip to the default rule.
 */
static int ipfw_flushing;

static int fw_verbose;		/* log rule matches */
static int verbose_limit;	/* upper limit on logged matches */

static int fw_debug;		/* gates DPRINTF() output */
static int autoinc_step = IPFW_AUTOINC_STEP_DEF;	/* rulenum step */

static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;	/* # of tables */

static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);

/* Loader tunable; the sysctl below is read-only once booted. */
TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
680
681 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
682 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
683     "Firewall statistics");
684
685 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
686     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
687 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
688     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
689     "Rule number autincrement step");
690 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
691     &fw_one_pass, 0,
692     "Only do a single pass through ipfw when using dummynet(4)");
693 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
694     &fw_debug, 0, "Enable printing of debug ip_fw statements");
695 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
696     &fw_verbose, 0, "Log matches to ipfw rules");
697 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
698     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
699 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
700     &ipfw_table_max, 0, "Max # of tables");
701
702 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
703 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
704 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
705 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
706 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
707 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
708
/*
 * Timeouts for various events in handing states.
 *
 * NOTE:
 * 1 == 0~1 second.
 * 2 == 1~2 second(s).
 *
 * We use 2 seconds for FIN lifetime, so that the states will not be
 * ripped prematurely.
 */
static uint32_t dyn_ack_lifetime = 300;
static uint32_t dyn_syn_lifetime = 20;
static uint32_t dyn_finwait_lifetime = 20;
static uint32_t dyn_fin_lifetime = 2;
static uint32_t dyn_rst_lifetime = 2;
static uint32_t dyn_udp_lifetime = 10;
static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of lifetime of a rule.
 */
static uint32_t dyn_keepalive_interval = 20;
static uint32_t dyn_keepalive_period = 5;
static uint32_t dyn_keepalive = 1;	/* do send keepalives */

static struct ipfw_global	ipfw_gd;
static int	ipfw_state_loosecnt_updthr;
static int	ipfw_state_max = 4096;	/* max # of states */
static int	ipfw_track_max = 4096;	/* max # of tracks */

/*
 * Batch limits for the periodic scanners; exported through the
 * sysctls declared below.
 */
static int	ipfw_state_headroom;	/* setup at module load time */
static int	ipfw_state_reap_min = 8;
static int	ipfw_state_expire_max = 32;
static int	ipfw_state_scan_max = 256;
static int	ipfw_keepalive_max = 8;
static int	ipfw_track_reap_max = 4;
static int	ipfw_track_expire_max = 16;
static int	ipfw_track_scan_max = 128;

static eventhandler_tag ipfw_ifaddr_event;	/* ifaddr event handler tag */
751
/* Compat: legacy dyn_* knobs, mapped onto the state/track machinery. */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
    "Number of states and tracks");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
    "Max number of states and tracks");

SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
    "Number of states");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max number of states");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
    &ipfw_state_headroom, 0, "headroom for state reap");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
    &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
    &ipfw_track_max, 0, "Max number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
    &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
796 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
797     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
798     "I", "# of states to expire for each expire iteration");
/* Work budgets for the track expire/reap passes (ipfw_sysctl_scancnt). */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");

/*
 * Statistics counters stored per struct ipfw_context, exported
 * through ipfw_sysctl_stat by offset into the context.
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to states shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of state deleted due to fast TCP port recycling");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reap due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failure");
853 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
854     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
855     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
856     "LU", "# of IP fragements defraged");
/* Defragmentation and address/port translation (xlat) statistics. */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts resolved on remote cpu");
881
/* Red-black tree comparators for states, track counters and tracks. */
static int		ipfw_state_cmp(struct ipfw_state *,
			    struct ipfw_state *);
static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
			    struct ipfw_trkcnt *);
static int		ipfw_track_cmp(struct ipfw_track *,
			    struct ipfw_track *);

RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

static int		ipfw_chk(struct ip_fw_args *);
static void		ipfw_track_expire_ipifunc(void *);
static void		ipfw_state_expire_ipifunc(void *);
static void		ipfw_keepalive(void *);
static int		ipfw_state_expire_start(struct ipfw_context *,
			    int, int);
static void		ipfw_crossref_timeo(void *);
static void		ipfw_state_remove(struct ipfw_context *,
			    struct ipfw_state *);
static void		ipfw_xlat_reap_timeo(void *);
static void		ipfw_defrag_redispatch(struct mbuf *, int,
			    struct ip_fw *);

/* LWKT token guarding the global trkcnt tree/counters in ipfw_gd. */
#define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT	\
	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
915
916 static void
917 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
918     const struct sockaddr *netmask)
919 {
920         const u_char *cp1 = (const u_char *)src;
921         u_char *cp2 = (u_char *)dst;
922         const u_char *cp3 = (const u_char *)netmask;
923         u_char *cplim = cp2 + *cp3;
924         u_char *cplim2 = cp2 + *cp1;
925
926         *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
927         cp3 += 2;
928         if (cplim > cplim2)
929                 cplim = cplim2;
930         while (cp2 < cplim)
931                 *cp2++ = *cp1++ & *cp3++;
932         if (cp2 < cplim2)
933                 bzero(cp2, cplim2 - cp2);
934 }
935
/*
 * Incrementally fix up an Internet checksum after a 16-bit word of
 * the covered data changes from 'old' to 'new' (RFC 1624 style).
 * 'udp' selects UDP semantics: a zero checksum means "disabled" and
 * must be left alone, and a computed zero is transmitted as 0xFFFF.
 */
static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
	uint32_t sum;

	if (udp && cksum == 0)
		return (0x0000);	/* UDP checksum disabled */

	sum = cksum + old - new;
	sum = (sum >> 16) + (sum & 65535);	/* fold the carry */
	sum &= 65535;

	if (udp && sum == 0)
		return (0xFFFF);	/* UDP may not emit a zero checksum */
	return (sum);
}
950
951 static __inline void
952 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
953     in_addr_t daddr, uint16_t dport, uint8_t proto)
954 {
955
956         key->proto = proto;
957         key->swap = 0;
958
959         if (saddr < daddr) {
960                 key->addr_u.addrs.addr1 = daddr;
961                 key->addr_u.addrs.addr2 = saddr;
962                 key->swap |= IPFW_KEY_SWAP_ADDRS;
963         } else {
964                 key->addr_u.addrs.addr1 = saddr;
965                 key->addr_u.addrs.addr2 = daddr;
966         }
967
968         if (sport < dport) {
969                 key->port_u.ports.port1 = dport;
970                 key->port_u.ports.port2 = sport;
971                 key->swap |= IPFW_KEY_SWAP_PORTS;
972         } else {
973                 key->port_u.ports.port1 = sport;
974                 key->port_u.ports.port2 = dport;
975         }
976
977         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
978                 key->swap |= IPFW_KEY_SWAP_PORTS;
979         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
980                 key->swap |= IPFW_KEY_SWAP_ADDRS;
981 }
982
983 static __inline void
984 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
985     in_addr_t *daddr, uint16_t *dport)
986 {
987
988         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
989                 *saddr = key->addr_u.addrs.addr2;
990                 *daddr = key->addr_u.addrs.addr1;
991         } else {
992                 *saddr = key->addr_u.addrs.addr1;
993                 *daddr = key->addr_u.addrs.addr2;
994         }
995
996         if (key->swap & IPFW_KEY_SWAP_PORTS) {
997                 *sport = key->port_u.ports.port2;
998                 *dport = key->port_u.ports.port1;
999         } else {
1000                 *sport = key->port_u.ports.port1;
1001                 *dport = key->port_u.ports.port2;
1002         }
1003 }
1004
1005 static int
1006 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1007 {
1008
1009         if (s1->st_proto > s2->st_proto)
1010                 return (1);
1011         if (s1->st_proto < s2->st_proto)
1012                 return (-1);
1013
1014         if (s1->st_addrs > s2->st_addrs)
1015                 return (1);
1016         if (s1->st_addrs < s2->st_addrs)
1017                 return (-1);
1018
1019         if (s1->st_ports > s2->st_ports)
1020                 return (1);
1021         if (s1->st_ports < s2->st_ports)
1022                 return (-1);
1023
1024         if (s1->st_swap == s2->st_swap ||
1025             (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1026                 return (0);
1027
1028         if (s1->st_swap > s2->st_swap)
1029                 return (1);
1030         else
1031                 return (-1);
1032 }
1033
1034 static int
1035 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1036 {
1037
1038         if (t1->tc_proto > t2->tc_proto)
1039                 return (1);
1040         if (t1->tc_proto < t2->tc_proto)
1041                 return (-1);
1042
1043         if (t1->tc_addrs > t2->tc_addrs)
1044                 return (1);
1045         if (t1->tc_addrs < t2->tc_addrs)
1046                 return (-1);
1047
1048         if (t1->tc_ports > t2->tc_ports)
1049                 return (1);
1050         if (t1->tc_ports < t2->tc_ports)
1051                 return (-1);
1052
1053         if (t1->tc_ruleid > t2->tc_ruleid)
1054                 return (1);
1055         if (t1->tc_ruleid < t2->tc_ruleid)
1056                 return (-1);
1057
1058         return (0);
1059 }
1060
1061 static int
1062 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1063 {
1064
1065         if (t1->t_proto > t2->t_proto)
1066                 return (1);
1067         if (t1->t_proto < t2->t_proto)
1068                 return (-1);
1069
1070         if (t1->t_addrs > t2->t_addrs)
1071                 return (1);
1072         if (t1->t_addrs < t2->t_addrs)
1073                 return (-1);
1074
1075         if (t1->t_ports > t2->t_ports)
1076                 return (1);
1077         if (t1->t_ports < t2->t_ports)
1078                 return (-1);
1079
1080         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1081                 return (1);
1082         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1083                 return (-1);
1084
1085         return (0);
1086 }
1087
1088 static __inline struct ipfw_state *
1089 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1090 {
1091         struct ipfw_state *dup;
1092
1093         KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1094             ("state %p was linked", s));
1095         dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1096         if (dup == NULL) {
1097                 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1098                 s->st_flags |= IPFW_STATE_F_LINKED;
1099         }
1100         return (dup);
1101 }
1102
/*
 * Remove a state from this cpu's RB tree and list and clear its
 * LINKED flag.  The state must currently be linked.
 */
static __inline void
ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
{

	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
	    ("state %p was not linked", s));
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	s->st_flags &= ~IPFW_STATE_F_LINKED;
}
1113
/*
 * Set the global state limit and derive the per-cpu loose-count
 * update threshold from it.
 */
static void
ipfw_state_max_set(int state_max)
{

	ipfw_state_max = state_max;
	/* Allow 5% states over-allocation. */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}
1122
1123 static __inline int
1124 ipfw_state_cntcoll(void)
1125 {
1126         int cpu, state_cnt = 0;
1127
1128         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1129                 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1130         return (state_cnt);
1131 }
1132
/*
 * Collect the per-cpu state counts, publish the sum as the global
 * loose count, and return it.
 */
static __inline int
ipfw_state_cntsync(void)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	ipfw_gd.ipfw_state_loosecnt = state_cnt;
	return (state_cnt);
}
1142
1143 static __inline int
1144 ipfw_free_rule(struct ip_fw *rule)
1145 {
1146         KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1147         KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1148         rule->refcnt--;
1149         if (rule->refcnt == 0) {
1150                 if (rule->cross_rules != NULL)
1151                         kfree(rule->cross_rules, M_IPFW);
1152                 kfree(rule, M_IPFW);
1153                 return 1;
1154         }
1155         return 0;
1156 }
1157
/*
 * Callback-style wrapper: drop one reference on the rule ('priv' is
 * a struct ip_fw *).  When built as a KLD, also release the
 * module-wide refcount taken by ipfw_ref_rule().
 */
static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	KASSERT(ipfw_gd.ipfw_refcnt > 0,
	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}
1168
/*
 * Take one reference on the rule; must run on the rule's owning cpu.
 * When built as a KLD, also bump the module-wide refcount so the
 * module cannot be unloaded while references are outstanding.
 */
static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}
1178
1179 /*
1180  * This macro maps an ip pointer into a layer3 header pointer of type T
1181  */
1182 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1183
1184 static __inline int
1185 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1186 {
1187         int type = L3HDR(struct icmp,ip)->icmp_type;
1188         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1189         int idx = type / 32;
1190
1191         if (idx >= idx_max)
1192                 return (0);
1193         return (cmd->d[idx] & (1 << (type % 32)));
1194 }
1195
1196 static __inline int
1197 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1198 {
1199         int code = L3HDR(struct icmp,ip)->icmp_code;
1200         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1201         int idx = code / 32;
1202
1203         if (idx >= idx_max)
1204                 return (0);
1205         return (cmd->d[idx] & (1 << (code % 32)));
1206 }
1207
/*
 * Bitmap of ICMP "query" types (those that solicit a reply): echo,
 * router solicitation, timestamp, information and mask requests.
 */
#define TT      ((1 << ICMP_ECHO) | \
                 (1 << ICMP_ROUTERSOLICIT) | \
                 (1 << ICMP_TSTAMP) | \
                 (1 << ICMP_IREQ) | \
                 (1 << ICMP_MASKREQ))

/* Return non-zero if the ICMP packet carries one of the query types. */
static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type < 32 && (TT & (1 << type)));
}

#undef TT
1223
1224 /*
1225  * The following checks use two arrays of 8 or 16 bits to store the
1226  * bits that we want set or clear, respectively. They are in the
1227  * low and high half of cmd->arg1 or cmd->d[0].
1228  *
1229  * We scan options and store the bits we find set. We succeed if
1230  *
1231  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1232  *
1233  * The code is sometimes optimized not to store additional variables.
1234  */
1235 static int
1236 flags_match(ipfw_insn *cmd, uint8_t bits)
1237 {
1238         u_char want_clear;
1239         bits = ~bits;
1240
1241         if (((cmd->arg1 & 0xff) & bits) != 0)
1242                 return 0; /* some bits we want set were clear */
1243
1244         want_clear = (cmd->arg1 >> 8) & 0xff;
1245         if ((want_clear & bits) != want_clear)
1246                 return 0; /* some bits we want clear were set */
1247         return 1;
1248 }
1249
1250 static int
1251 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1252 {
1253         int optlen, bits = 0;
1254         u_char *cp = (u_char *)(ip + 1);
1255         int x = (ip->ip_hl << 2) - sizeof(struct ip);
1256
1257         for (; x > 0; x -= optlen, cp += optlen) {
1258                 int opt = cp[IPOPT_OPTVAL];
1259
1260                 if (opt == IPOPT_EOL)
1261                         break;
1262
1263                 if (opt == IPOPT_NOP) {
1264                         optlen = 1;
1265                 } else {
1266                         optlen = cp[IPOPT_OLEN];
1267                         if (optlen <= 0 || optlen > x)
1268                                 return 0; /* invalid or truncated */
1269                 }
1270
1271                 switch (opt) {
1272                 case IPOPT_LSRR:
1273                         bits |= IP_FW_IPOPT_LSRR;
1274                         break;
1275
1276                 case IPOPT_SSRR:
1277                         bits |= IP_FW_IPOPT_SSRR;
1278                         break;
1279
1280                 case IPOPT_RR:
1281                         bits |= IP_FW_IPOPT_RR;
1282                         break;
1283
1284                 case IPOPT_TS:
1285                         bits |= IP_FW_IPOPT_TS;
1286                         break;
1287
1288                 default:
1289                         break;
1290                 }
1291         }
1292         return (flags_match(cmd, bits));
1293 }
1294
1295 static int
1296 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1297 {
1298         int optlen, bits = 0;
1299         struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1300         u_char *cp = (u_char *)(tcp + 1);
1301         int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1302
1303         for (; x > 0; x -= optlen, cp += optlen) {
1304                 int opt = cp[0];
1305
1306                 if (opt == TCPOPT_EOL)
1307                         break;
1308
1309                 if (opt == TCPOPT_NOP) {
1310                         optlen = 1;
1311                 } else {
1312                         optlen = cp[1];
1313                         if (optlen <= 0)
1314                                 break;
1315                 }
1316
1317                 switch (opt) {
1318                 case TCPOPT_MAXSEG:
1319                         bits |= IP_FW_TCPOPT_MSS;
1320                         break;
1321
1322                 case TCPOPT_WINDOW:
1323                         bits |= IP_FW_TCPOPT_WINDOW;
1324                         break;
1325
1326                 case TCPOPT_SACK_PERMITTED:
1327                 case TCPOPT_SACK:
1328                         bits |= IP_FW_TCPOPT_SACK;
1329                         break;
1330
1331                 case TCPOPT_TIMESTAMP:
1332                         bits |= IP_FW_TCPOPT_TS;
1333                         break;
1334
1335                 case TCPOPT_CC:
1336                 case TCPOPT_CCNEW:
1337                 case TCPOPT_CCECHO:
1338                         bits |= IP_FW_TCPOPT_CC;
1339                         break;
1340
1341                 default:
1342                         break;
1343                 }
1344         }
1345         return (flags_match(cmd, bits));
1346 }
1347
1348 static int
1349 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1350 {
1351         if (ifp == NULL)        /* no iface with this packet, match fails */
1352                 return 0;
1353
1354         /* Check by name or by IP address */
1355         if (cmd->name[0] != '\0') { /* match by name */
1356                 /* Check name */
1357                 if (cmd->p.glob) {
1358                         if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1359                                 return(1);
1360                 } else {
1361                         if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1362                                 return(1);
1363                 }
1364         } else {
1365                 struct ifaddr_container *ifac;
1366
1367                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1368                         struct ifaddr *ia = ifac->ifa;
1369
1370                         if (ia->ifa_addr == NULL)
1371                                 continue;
1372                         if (ia->ifa_addr->sa_family != AF_INET)
1373                                 continue;
1374                         if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1375                             (ia->ifa_addr))->sin_addr.s_addr)
1376                                 return(1);      /* match */
1377                 }
1378         }
1379         return(0);      /* no match, fail ... */
1380 }
1381
/*
 * Append-formatting helper: evaluates to the tail of 'buf' after
 * 'len' bytes plus the space remaining, for use with ksnprintf().
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;	/* non-zero: emit the "limit reached" notice */
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		/* Rate-limit "no rule" logging via the per-ctx counter. */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		/* Per-rule log budget; max_log == 0 means unlimited. */
		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1==ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
			{
				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
				int len;

				len = ksnprintf(SNPARGS(action2, 0),
				    "Forward to %s",
				    kinet_ntoa(sa->sa.sin_addr, abuf));
				if (sa->sa.sin_port) {
					ksnprintf(SNPARGS(action2, len), ":%d",
						  sa->sa.sin_port);
				}
			}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	/* Build the protocol/address portion of the log line. */
	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		ip_off = ntohs(ip->ip_off);
		ip_len = ntohs(ip->ip_len);
		offset = ip_off & IP_OFFMASK;

		/* Ports are only available in the first fragment. */
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		/* Annotate fragments with id, payload length and offset. */
		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS
1583
1584 static void
1585 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1586 {
1587         struct ip_fw *rule = slave_x->xlat_rule;
1588
1589         KKASSERT(rule->cpuid == mycpuid);
1590
1591         /* No more cross references; free this pair now. */
1592         kfree(x, M_IPFW);
1593         kfree(slave_x, M_IPFW);
1594
1595         /* See the comment in ipfw_ip_xlate_dispatch(). */
1596         rule->cross_refs--;
1597 }
1598
/*
 * Per-cpu netisr handler: scan this cpu's xlat reap queue and free
 * every xlat pair whose cross references have all dropped to zero.
 * Pairs that are still referenced stay on the queue and the reap
 * callout is rearmed to retry later.
 */
static void
ipfw_xlat_reap_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *s, *ns;

	ASSERT_NETISR_NCPUS(mycpuid);

	crit_enter();
	/* Reply ASAP. */
	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
	crit_exit();

	/* TODO: limit scanning depth */
	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		struct ipfw_xlat *slave_x = x->xlat_pair;
		uint64_t crefs;

		/* Both sides must be unreferenced before freeing. */
		crefs = slave_x->xlat_crefs + x->xlat_crefs;
		if (crefs == 0) {
			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
			ipfw_xlat_reap(x, slave_x);
		}
	}
	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		/* Some pairs are still referenced; retry in 2 ticks. */
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}
}
1629
1630 static void
1631 ipfw_xlat_reap_timeo(void *xnm)
1632 {
1633         struct netmsg_base *nm = xnm;
1634
1635         KKASSERT(mycpuid < netisr_ncpus);
1636
1637         crit_enter();
1638         if (nm->lmsg.ms_flags & MSGF_DONE)
1639                 netisr_sendmsg_oncpu(nm);
1640         crit_exit();
1641 }
1642
/*
 * Per-cpu netisr handler: runs on the cpu owning the slave xlat to
 * finish destroying an xlat pair whose master was deleted on another
 * cpu (see ipfw_state_del()).  The slave is unlinked here; the pair
 * is freed immediately if no cross references remain, otherwise it
 * is queued for deferred reaping.
 */
static void
ipfw_xlat_free_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
	struct ipfw_xlat *slave_x = x->xlat_pair;
	uint64_t crefs;

	ASSERT_NETISR_NCPUS(mycpuid);

	KKASSERT(slave_x != NULL);
	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);

	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
	    ("master xlat is still linked"));
	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
		ipfw_state_unlink(ctx, &slave_x->xlat_st);

	/* See the comment in ipfw_ip_xlate_dispatch(). */
	slave_x->xlat_crefs--;

	crefs = slave_x->xlat_crefs + x->xlat_crefs;
	if (crefs == 0) {
		/* Neither side is referenced; free the pair now. */
		ipfw_xlat_reap(x, slave_x);
		return;
	}

	/* Arm the reap callout if the reap queue was empty. */
	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
		    &ctx->ipfw_xlatreap_nm);
	}

	/*
	 * This pair is still referenced; defer its destruction.
	 * YYY reuse st_link.
	 */
	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
}
1681
1682 static __inline void
1683 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1684 {
1685
1686         x->xlat_invalid = 1;
1687         x->xlat_pair->xlat_invalid = 1;
1688 }
1689
/*
 * Delete the state `s' from this cpu's context `ctx' and free it.
 *
 * Plain states are freed immediately.  Xlat states are paired, and
 * the paired states may live on different cpus: if the slave is on
 * this cpu the pair is freed right away; otherwise the pair is
 * invalidated and a message is sent to the slave's owner cpu, where
 * ipfw_xlat_free_dispatch() completes the destruction.
 *
 * Slave xlats must not be passed here (see the KASSERT); use
 * ipfw_state_remove() for those.
 */
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_xlat *x, *slave_x;
	struct netmsg_base *nm;

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
	    ("delete slave xlat"));

	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));
	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;

	/*
	 * Unhook this state.
	 */
	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}
	ipfw_state_unlink(ctx, s);

	/*
	 * Free this state.  Xlat requires special processing,
	 * since xlat are paired state and they could be on
	 * different cpus.
	 */

	if (!IPFW_ISXLAT(s->st_type)) {
		/* Not xlat; free now. */
		kfree(s, M_IPFW);
		/* Done! */
		return;
	}
	x = (struct ipfw_xlat *)s;

	if (x->xlat_pair == NULL) {
		/* Not setup yet; free now. */
		kfree(x, M_IPFW);
		/* Done! */
		return;
	}
	slave_x = x->xlat_pair;
	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);

	if (x->xlat_pcpu == mycpuid) {
		/*
		 * Paired states are on the same cpu; delete this
		 * pair now.
		 */
		KKASSERT(x->xlat_crefs == 0);
		KKASSERT(slave_x->xlat_crefs == 0);
		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
			ipfw_state_unlink(ctx, &slave_x->xlat_st);
		kfree(x, M_IPFW);
		kfree(slave_x, M_IPFW);
		return;
	}

	/*
	 * Free the paired states on the cpu owning the slave xlat.
	 */

	/*
	 * Mark the state pair invalid; completely deleting them
	 * may take some time.
	 */
	ipfw_xlat_invalidate(x);

	nm = &x->xlat_freenm;
	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
	    ipfw_xlat_free_dispatch);
	nm->lmsg.u.ms_resultp = x;

	/* See the comment in ipfw_xlate_redispatch(). */
	x->xlat_rule->cross_refs++;
	x->xlat_crefs++;

	netisr_sendmsg(nm, x->xlat_pcpu);
}
1781
1782 static void
1783 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1784 {
1785
1786         if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1787                 KKASSERT(IPFW_ISXLAT(s->st_type));
1788                 ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1789                 ipfw_state_unlink(ctx, s);
1790                 return;
1791         }
1792         ipfw_state_del(ctx, s);
1793 }
1794
/*
 * Reap states to relieve a state shortage; `reap_max' is clamped up
 * to at least ipfw_state_reap_min.  Returns the number of states
 * deleted.
 *
 * If no expiry run is in progress, one is kick started in reap mode.
 * Otherwise the in-progress run's anchor is advanced here, deleting
 * dead and closed-TCP states along the way.
 */
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		/* Advance the anchor past `s'. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			/* Stop early once enough headroom is regained. */
			if ((expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
1853
1854 static void
1855 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1856 {
1857         struct ipfw_state *s, *sn;
1858
1859         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1860                 if (IPFW_STATE_SCANSKIP(s))
1861                         continue;
1862                 if (rule != NULL && s->st_rule != rule)
1863                         continue;
1864                 ipfw_state_del(ctx, s);
1865         }
1866 }
1867
1868 static void
1869 ipfw_state_expire_done(struct ipfw_context *ctx)
1870 {
1871
1872         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1873             ("stateexp is not in progress"));
1874         ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1875         callout_reset(&ctx->ipfw_stateto_ch, hz,
1876             ipfw_state_expire_ipifunc, NULL);
1877 }
1878
1879 static void
1880 ipfw_state_expire_more(struct ipfw_context *ctx)
1881 {
1882         struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1883
1884         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1885             ("stateexp is not in progress"));
1886         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1887             ("stateexp more did not finish"));
1888         netisr_sendmsg_oncpu(nm);
1889 }
1890
/*
 * One pass of the state expiry scan: advance `anchor' through the
 * state list, deleting dead states (and also closed TCP states when
 * reaping), until the end of the list is reached or one of the
 * limits is hit.  When a limit stops the pass, the next pass is
 * scheduled via ipfw_state_expire_more(); when the list end is
 * reached, the run is finished via ipfw_state_expire_done().
 * Returns the number of states deleted in this pass.
 */
static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan limit hit; continue in a later pass. */
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		/* Advance the anchor past `s'. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		if (IPFW_STATE_ISDEAD(s) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				/* Expire limit hit; continue later. */
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				/* Enough headroom regained; continue later. */
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}
1934
1935 static void
1936 ipfw_state_expire_more_dispatch(netmsg_t nm)
1937 {
1938         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1939         struct ipfw_state *anchor;
1940
1941         ASSERT_NETISR_NCPUS(mycpuid);
1942         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1943             ("statexp is not in progress"));
1944
1945         /* Reply ASAP */
1946         netisr_replymsg(&nm->base, 0);
1947
1948         anchor = &ctx->ipfw_stateexp_anch;
1949         if (ctx->ipfw_state_cnt == 0) {
1950                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1951                 ipfw_state_expire_done(ctx);
1952                 return;
1953         }
1954         ipfw_state_expire_loop(ctx, anchor,
1955             ipfw_state_scan_max, ipfw_state_expire_max);
1956 }
1957
/*
 * Begin a state expiry run on this cpu, scanning at most `scan_max'
 * and deleting at most `expire_max' states in the first pass;
 * additional passes may be scheduled by ipfw_state_expire_loop().
 * Returns the number of states deleted by the first pass.
 */
static int
ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_state *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
	    ("stateexp is in progress"));
	/* Set before any early return; ipfw_state_expire_done() asserts it. */
	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;

	if (ctx->ipfw_state_cnt == 0) {
		ipfw_state_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
	    ctx->ipfw_state_lastexp == time_uptime) {
		ipfw_state_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_state_lastexp = time_uptime;

	/* Insert the scan anchor and run the first pass. */
	anchor = &ctx->ipfw_stateexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
}
1986
1987 static void
1988 ipfw_state_expire_dispatch(netmsg_t nm)
1989 {
1990         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1991
1992         ASSERT_NETISR_NCPUS(mycpuid);
1993
1994         /* Reply ASAP */
1995         crit_enter();
1996         netisr_replymsg(&nm->base, 0);
1997         crit_exit();
1998
1999         if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2000                 /* Running; done. */
2001                 return;
2002         }
2003         ipfw_state_expire_start(ctx,
2004             ipfw_state_scan_max, ipfw_state_expire_max);
2005 }
2006
2007 static void
2008 ipfw_state_expire_ipifunc(void *dummy __unused)
2009 {
2010         struct netmsg_base *msg;
2011
2012         KKASSERT(mycpuid < netisr_ncpus);
2013         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2014
2015         crit_enter();
2016         if (msg->lmsg.ms_flags & MSGF_DONE)
2017                 netisr_sendmsg_oncpu(msg);
2018         crit_exit();
2019 }
2020
/*
 * Track TCP sequence/ack progress for the state `s' in direction
 * `dir'.  The highest seq/ack seen in each direction is recorded in
 * st_seq_fwd/st_ack_fwd and st_seq_rev/st_ack_rev; the
 * IPFW_STATE_F_SEQ*/IPFW_STATE_F_ACK* flags mark which of those
 * fields hold valid values.
 *
 * Returns TRUE if the segment is in sequence (or is a RST, which is
 * always accepted) and the state was updated; FALSE for an
 * out-of-sequence segment.
 */
static boolean_t
ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
{
	uint32_t seq = ntohl(tcp->th_seq);
	uint32_t ack = ntohl(tcp->th_ack);

	/* RST is accepted unconditionally. */
	if (tcp->th_flags & TH_RST)
		return (TRUE);

	if (dir == MATCH_FORWARD) {
		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
			/* First forward segment; start tracking. */
			s->st_flags |= IPFW_STATE_F_SEQFWD;
			s->st_seq_fwd = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
			s->st_seq_fwd = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
				/* First forward ACK; start tracking. */
				s->st_flags |= IPFW_STATE_F_ACKFWD;
				s->st_ack_fwd = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
				s->st_ack_fwd = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/* This forward ACK acknowledges the reverse FIN. */
			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
				s->st_state |= (TH_ACK << 8);
		}
	} else {
		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
			/* First reverse segment; start tracking. */
			s->st_flags |= IPFW_STATE_F_SEQREV;
			s->st_seq_rev = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
			s->st_seq_rev = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
				/* First reverse ACK; start tracking. */
				s->st_flags |= IPFW_STATE_F_ACKREV;
				s->st_ack_rev= ack;
			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
				s->st_ack_rev = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/* This reverse ACK acknowledges the forward FIN. */
			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
			    s->st_ack_rev == s->st_seq_fwd + 1)
				s->st_state |= TH_ACK;
		}
	}
	return (TRUE);
}
2083
/*
 * Refresh the state `s' after it matched a packet going in direction
 * `dir': merge the packet's TCP flags into the tracked connection
 * state (forward flags in the low byte, reverse flags shifted left
 * by 8) and recompute st_expire from the matching dyn_*_lifetime.
 * For TCP, an out-of-sequence segment (as judged by
 * ipfw_state_update_tcp()) does not refresh the state.
 */
static void
ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
    const struct tcphdr *tcp, struct ipfw_state *s)
{

	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;

		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
			return;

		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
		switch (s->st_state & IPFW_STATE_TCPSTATES) {
		case TH_SYN:				/* opening */
			s->st_expire = time_uptime + dyn_syn_lifetime;
			break;

		case BOTH_SYN:			/* move to established */
		case BOTH_SYN | TH_FIN:		/* one side tries to close */
		case BOTH_SYN | (TH_FIN << 8):
			s->st_expire = time_uptime + dyn_ack_lifetime;
			break;

		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
				/* And both FINs were ACKed. */
				s->st_expire = time_uptime + dyn_fin_lifetime;
			} else {
				s->st_expire = time_uptime +
				    dyn_finwait_lifetime;
			}
			break;

		default:
#if 0
			/*
			 * reset or some invalid combination, but can also
			 * occur if we use keep-state the wrong way.
			 */
			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
				kprintf("invalid state: 0x%x\n", s->st_state);
#endif
			s->st_expire = time_uptime + dyn_rst_lifetime;
			break;
		}
	} else if (pkt->proto == IPPROTO_UDP) {
		s->st_expire = time_uptime + dyn_udp_lifetime;
	} else {
		/* other protocols */
		s->st_expire = time_uptime + dyn_short_lifetime;
	}
}
2136
2137 /*
2138  * Lookup a state.
2139  */
2140 static struct ipfw_state *
2141 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2142     int *match_direction, const struct tcphdr *tcp)
2143 {
2144         struct ipfw_state *key, *s;
2145         int dir = MATCH_NONE;
2146
2147         key = &ctx->ipfw_state_tmpkey;
2148         ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2149             pkt->dst_ip, pkt->dst_port, pkt->proto);
2150         s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2151         if (s == NULL)
2152                 goto done; /* not found. */
2153         if (IPFW_STATE_ISDEAD(s)) {
2154                 ipfw_state_remove(ctx, s);
2155                 s = NULL;
2156                 goto done;
2157         }
2158         if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2159                 /* TCP ports recycling is too fast. */
2160                 ctx->ipfw_sts_tcprecycled++;
2161                 ipfw_state_remove(ctx, s);
2162                 s = NULL;
2163                 goto done;
2164         }
2165
2166         if (s->st_swap == key->st_swap) {
2167                 dir = MATCH_FORWARD;
2168         } else {
2169                 KASSERT((s->st_swap & key->st_swap) == 0,
2170                     ("found mismatch state"));
2171                 dir = MATCH_REVERSE;
2172         }
2173
2174         /* Update this state. */
2175         ipfw_state_update(pkt, dir, tcp, s);
2176
2177         if (s->st_track != NULL) {
2178                 /* This track has been used. */
2179                 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2180         }
2181 done:
2182         if (match_direction)
2183                 *match_direction = dir;
2184         return (s);
2185 }
2186
2187 static struct ipfw_state *
2188 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2189     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2190 {
2191         struct ipfw_state *s;
2192         size_t sz;
2193
2194         KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2195             ("invalid state type %u", type));
2196
2197         sz = sizeof(struct ipfw_state);
2198         if (IPFW_ISXLAT(type))
2199                 sz = sizeof(struct ipfw_xlat);
2200
2201         s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2202         if (s == NULL) {
2203                 ctx->ipfw_sts_nomem++;
2204                 return (NULL);
2205         }
2206
2207         ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2208             id->dst_ip, id->dst_port, id->proto);
2209
2210         s->st_rule = rule;
2211         s->st_type = type;
2212         if (IPFW_ISXLAT(type)) {
2213                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2214
2215                 x->xlat_dir = MATCH_NONE;
2216                 x->xlat_pcpu = -1;
2217         }
2218
2219         /*
2220          * Update this state:
2221          * Set st_expire and st_state.
2222          */
2223         ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2224
2225         return (s);
2226 }
2227
/*
 * Allocate a new state of `type' for the flow `id' and link it into
 * this cpu's state tree/list.  `t' (may be NULL) is the limit track
 * this state is accounted against.  Returns NULL on allocation
 * failure; panics if an identical state already exists.
 */
static struct ipfw_state *
ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
    const struct tcphdr *tcp)
{
	struct ipfw_state *s, *dup;

	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
	if (s == NULL)
		return (NULL);

	ctx->ipfw_state_cnt++;
	ctx->ipfw_state_loosecnt++;
	/* Flush the loose count to the global counter at the threshold. */
	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
		ctx->ipfw_state_loosecnt = 0;
	}

	dup = ipfw_state_link(ctx, s);
	if (dup != NULL)
		panic("ipfw: %u state exists %p", type, dup);

	if (t != NULL) {
		/* Keep the track referenced. */
		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
		s->st_track = t;
	}
	return (s);
}
2257
/*
 * Unlink the track `t' from this cpu's track tree/list, free it, and
 * drop its reference on the shared track counter.  Dropping the last
 * reference also removes the counter from the global trkcnt tree;
 * one freed counter is cached per cpu in ipfw_trkcnt_spare.  The
 * track must no longer be referenced by any state.  Returns TRUE if
 * the shared counter itself was released.
 */
static boolean_t
ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
{
	struct ipfw_trkcnt *trk;
	boolean_t trk_freed = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));
	KASSERT(LIST_EMPTY(&t->t_state_list),
	    ("invalid track is still referenced"));

	trk = t->t_trkcnt;
	KASSERT(trk != NULL, ("track has no trkcnt"));

	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
	kfree(t, M_IPFW);

	/*
	 * fdrop() style reference counting.
	 * See kern/kern_descrip.c fdrop().
	 */
	for (;;) {
		int refs = trk->tc_refs;

		cpu_ccfence();
		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
		if (refs == 1) {
			/* Last reference; 1->0 transition needs the token. */
			IPFW_TRKCNT_TOKGET;
			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
				KASSERT(trk->tc_count == 0,
				    ("%d states reference this trkcnt",
				     trk->tc_count));
				RB_REMOVE(ipfw_trkcnt_tree,
				    &ipfw_gd.ipfw_trkcnt_tree, trk);

				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
				    ("invalid trkcnt cnt %d",
				     ipfw_gd.ipfw_trkcnt_cnt));
				ipfw_gd.ipfw_trkcnt_cnt--;
				IPFW_TRKCNT_TOKREL;

				/* Cache one freed counter per cpu. */
				if (ctx->ipfw_trkcnt_spare == NULL)
					ctx->ipfw_trkcnt_spare = trk;
				else
					kfree(trk, M_IPFW);
				trk_freed = TRUE;
				break; /* done! */
			}
			IPFW_TRKCNT_TOKREL;
			/* retry */
		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
			break; /* done! */
		}
		/* retry */
	}
	return (trk_freed);
}
2315
2316 static void
2317 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2318 {
2319         struct ipfw_track *t, *tn;
2320
2321         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2322                 if (t->t_count == NULL) /* anchor */
2323                         continue;
2324                 if (rule != NULL && t->t_rule != rule)
2325                         continue;
2326                 ipfw_track_free(ctx, t);
2327         }
2328 }
2329
2330 static boolean_t
2331 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2332     boolean_t reap)
2333 {
2334         struct ipfw_state *s, *sn;
2335         boolean_t ret = FALSE;
2336
2337         KASSERT(t->t_count != NULL, ("track anchor"));
2338
2339         if (LIST_EMPTY(&t->t_state_list))
2340                 return (FALSE);
2341
2342         /*
2343          * Do not expire more than once per second, it is useless.
2344          */
2345         if (t->t_lastexp == time_uptime)
2346                 return (FALSE);
2347         t->t_lastexp = time_uptime;
2348
2349         LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2350                 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2351                         KASSERT(s->st_track == t,
2352                             ("state track %p does not match %p",
2353                              s->st_track, t));
2354                         ipfw_state_del(ctx, s);
2355                         ret = TRUE;
2356                 }
2357         }
2358         return (ret);
2359 }
2360
2361 static __inline struct ipfw_trkcnt *
2362 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2363 {
2364         struct ipfw_trkcnt *trk;
2365
2366         if (ctx->ipfw_trkcnt_spare != NULL) {
2367                 trk = ctx->ipfw_trkcnt_spare;
2368                 ctx->ipfw_trkcnt_spare = NULL;
2369         } else {
2370                 trk = kmalloc(sizeof(*trk), M_IPFW,
2371                               M_INTWAIT | M_NULLOK | M_CACHEALIGN);
2372         }
2373         return (trk);
2374 }
2375
2376 static void
2377 ipfw_track_expire_done(struct ipfw_context *ctx)
2378 {
2379
2380         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2381             ("trackexp is not in progress"));
2382         ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2383         callout_reset(&ctx->ipfw_trackto_ch, hz,
2384             ipfw_track_expire_ipifunc, NULL);
2385 }
2386
2387 static void
2388 ipfw_track_expire_more(struct ipfw_context *ctx)
2389 {
2390         struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2391
2392         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2393             ("trackexp is not in progress"));
2394         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2395             ("trackexp more did not finish"));
2396         netisr_sendmsg_oncpu(nm);
2397 }
2398
/*
 * One pass of the track expiry scan: advance `anchor' through the
 * track list, first expiring the states referencing each track, then
 * freeing tracks that are unreferenced and expired (or that we are
 * reaping).  When a limit stops the pass, the next one is scheduled
 * via ipfw_track_expire_more(); when the list end is reached, the
 * run is finished via ipfw_track_expire_done().  Returns the number
 * of tracks freed (counted when the shared counter was released).
 */
static int
ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_track *t;
	int scanned = 0, expired = 0;
	boolean_t reap = FALSE;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
		reap = TRUE;

	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan limit hit; continue in a later pass. */
			ipfw_track_expire_more(ctx);
			return (expired);
		}

		/* Advance the anchor past `t'. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		ipfw_track_state_expire(ctx, t, reap);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
			/* Expired. */
			if (ipfw_track_free(ctx, t)) {
				if (++expired >= expire_max) {
					/* Expire limit hit; continue later. */
					ipfw_track_expire_more(ctx);
					return (expired);
				}
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
	ipfw_track_expire_done(ctx);
	return (expired);
}
2445
2446 static int
2447 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2448 {
2449         struct ipfw_track *anchor;
2450
2451         KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2452             ("trackexp is in progress"));
2453         ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2454
2455         if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2456                 ipfw_track_expire_done(ctx);
2457                 return (0);
2458         }
2459
2460         /*
2461          * Do not expire more than once per second, it is useless.
2462          */
2463         if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2464             ctx->ipfw_track_lastexp == time_uptime) {
2465                 ipfw_track_expire_done(ctx);
2466                 return (0);
2467         }
2468         ctx->ipfw_track_lastexp = time_uptime;
2469
2470         anchor = &ctx->ipfw_trackexp_anch;
2471         TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2472         return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2473 }
2474
/*
 * Netisr handler for the "expire more" message: resume a suspended
 * track expiration walk from the anchor left on the track list by a
 * previous budget-limited pass of ipfw_track_expire_loop().
 */
static void
ipfw_track_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_track *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_trackexp_anch;
	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* All tracks went away in the meantime; finish the run. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		ipfw_track_expire_done(ctx);
		return;
	}
	ipfw_track_expire_loop(ctx, anchor,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2497
2498 static void
2499 ipfw_track_expire_dispatch(netmsg_t nm)
2500 {
2501         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2502
2503         ASSERT_NETISR_NCPUS(mycpuid);
2504
2505         /* Reply ASAP */
2506         crit_enter();
2507         netisr_replymsg(&nm->base, 0);
2508         crit_exit();
2509
2510         if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2511                 /* Running; done. */
2512                 return;
2513         }
2514         ipfw_track_expire_start(ctx,
2515             ipfw_track_scan_max, ipfw_track_expire_max);
2516 }
2517
/*
 * IPI/callout trampoline: queue the track expiration netmsg to the
 * current cpu's netisr, but only if the message is not already in
 * flight (MSGF_DONE clear means a previous send has not been replied
 * to yet).
 */
static void
ipfw_track_expire_ipifunc(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;

	/* Critical section guards the MSGF_DONE test-and-send. */
	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
2531
/*
 * Aggressively reclaim tracks on this cpu because we are short of
 * track counters.  Unlike normal expiration, the scan limit is
 * ignored and tracks are expired regardless of their expire time.
 *
 * Returns the number of tracks freed.
 */
static int
ipfw_track_reap(struct ipfw_context *ctx)
{
	struct ipfw_track *t, *anchor;
	int expired;

	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
		/*
		 * Kick start track expiring.  Ignore scan limit,
		 * we are short of tracks.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
		expired = ipfw_track_expire_start(ctx, INT_MAX,
		    ipfw_track_reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
		return (expired);
	}

	/*
	 * Tracks are being expired.  Piggy-back on the in-progress
	 * walk: continue from the existing anchor in reap mode.
	 */

	if (RB_EMPTY(&ctx->ipfw_track_tree))
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_trackexp_anch;
	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of tracks.
		 */

		/* Advance the anchor past 't'. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		/* Force-expire states; TRUE == reap mode. */
		ipfw_track_state_expire(ctx, t, TRUE);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (ipfw_track_free(ctx, t)) {
			if (++expired >= ipfw_track_reap_max) {
				ipfw_track_expire_more(ctx);
				break;
			}
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_track_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
2591
/*
 * Look up (or create) the per-cpu track for flow 'id' under 'rule',
 * honoring the limit_mask (which of src/dst addr/port participate in
 * the track key).  The per-cpu track references a globally shared
 * track counter (ipfw_trkcnt), found or installed in the global
 * RB-tree under the trkcnt token.
 *
 * On success the track's expire time is refreshed and the track is
 * returned; returns NULL on allocation failure or track-counter
 * exhaustion.
 */
static struct ipfw_track *
ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t limit_mask, struct ip_fw *rule)
{
	struct ipfw_track *key, *t, *dup;
	struct ipfw_trkcnt *trk, *ret;
	boolean_t do_expire = FALSE;

	KASSERT(rule->track_ruleid != 0,
	    ("rule %u has no track ruleid", rule->rulenum));

	/* Build the lookup key from the masked flow fields. */
	key = &ctx->ipfw_track_tmpkey;
	key->t_proto = id->proto;
	key->t_addrs = 0;
	key->t_ports = 0;
	key->t_rule = rule;
	if (limit_mask & DYN_SRC_ADDR)
		key->t_saddr = id->src_ip;
	if (limit_mask & DYN_DST_ADDR)
		key->t_daddr = id->dst_ip;
	if (limit_mask & DYN_SRC_PORT)
		key->t_sport = id->src_port;
	if (limit_mask & DYN_DST_PORT)
		key->t_dport = id->dst_port;

	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
	if (t != NULL)
		goto done;

	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
	if (t == NULL) {
		ctx->ipfw_tks_nomem++;
		return (NULL);
	}

	t->t_key = key->t_key;
	t->t_rule = rule;
	t->t_lastexp = 0;
	LIST_INIT(&t->t_state_list);

	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
		time_t globexp, uptime;

		/*
		 * Track counters are exhausted; do not allocate a new
		 * one, but remember to try reaping below.
		 */
		trk = NULL;
		do_expire = TRUE;

		/*
		 * Do not expire globally more than once per second,
		 * it is useless.
		 */
		uptime = time_uptime;
		globexp = ipfw_gd.ipfw_track_globexp;
		if (globexp != uptime &&
		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
		    globexp, uptime)) {
			int cpu;

			/* Expire tracks on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_track_expire_ipifunc, NULL);
			}
		}
	} else {
		trk = ipfw_trkcnt_alloc(ctx);
	}
	if (trk == NULL) {
		/*
		 * No counter allocated; see whether another cpu has
		 * already installed a shared counter for this flow.
		 */
		struct ipfw_trkcnt *tkey;

		tkey = &ctx->ipfw_trkcnt_tmpkey;
		key = NULL; /* tkey overlaps key */

		tkey->tc_key = t->t_key;
		tkey->tc_ruleid = rule->track_ruleid;

		IPFW_TRKCNT_TOKGET;
		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    tkey);
		if (trk == NULL) {
			IPFW_TRKCNT_TOKREL;
			if (do_expire) {
				/* Try to reap and retry the allocation. */
				ctx->ipfw_tks_reap++;
				if (ipfw_track_reap(ctx) > 0) {
					if (ipfw_gd.ipfw_trkcnt_cnt <
					    ipfw_track_max) {
						trk = ipfw_trkcnt_alloc(ctx);
						if (trk != NULL)
							goto install;
						ctx->ipfw_tks_cntnomem++;
					} else {
						ctx->ipfw_tks_overflow++;
					}
				} else {
					ctx->ipfw_tks_reapfailed++;
					ctx->ipfw_tks_overflow++;
				}
			} else {
				ctx->ipfw_tks_cntnomem++;
			}
			kfree(t, M_IPFW);
			return (NULL);
		}
		/* Share the counter another cpu installed. */
		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
		    ("invalid trkcnt refs %d", trk->tc_refs));
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	} else {
install:
		/* Initialize and install the freshly allocated counter. */
		trk->tc_key = t->t_key;
		trk->tc_ruleid = rule->track_ruleid;
		trk->tc_refs = 0;
		trk->tc_count = 0;
		trk->tc_expire = 0;
		trk->tc_rulenum = rule->rulenum;

		IPFW_TRKCNT_TOKGET;
		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    trk);
		if (ret != NULL) {
			/*
			 * Lost the race: another cpu installed the same
			 * counter first.  Keep ours as the spare and use
			 * the installed one.
			 */
			KASSERT(ret->tc_refs > 0 &&
			    ret->tc_refs < netisr_ncpus,
			    ("invalid trkcnt refs %d", ret->tc_refs));
			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
			    ("trkcnt spare was installed"));
			ctx->ipfw_trkcnt_spare = trk;
			trk = ret;
		} else {
			ipfw_gd.ipfw_trkcnt_cnt++;
		}
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	}
	t->t_count = &trk->tc_count;
	t->t_trkcnt = trk;

	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	if (dup != NULL)
		panic("ipfw: track exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
done:
	/* Refresh the track's expire time on every hit. */
	t->t_expire = time_uptime + dyn_short_lifetime;
	return (t);
}
2737
/*
 * Install state for rule type cmd->o.opcode (O_KEEP_STATE, O_REDIRECT
 * or O_LIMIT).
 *
 * If the global state count is at its limit, states are first reaped
 * locally and, if that is not enough, expiration is triggered on the
 * other netisr cpus (at most once per second).
 *
 * Returns NULL if state is not installed because of errors or because
 * states limitations are enforced.
 */
static struct ipfw_state *
ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
    ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
{
	struct ipfw_state *s;
	struct ipfw_track *t;
	int count, diff;

	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
		boolean_t overflow = TRUE;

		/* Try to reap 'diff' states on this cpu first. */
		ctx->ipfw_sts_reap++;
		if (ipfw_state_reap(ctx, diff) == 0)
			ctx->ipfw_sts_reapfailed++;
		if (ipfw_state_cntsync() < ipfw_state_max)
			overflow = FALSE;

		if (overflow) {
			time_t globexp, uptime;
			int cpu;

			/*
			 * Do not expire globally more than once per second,
			 * it is useless.
			 */
			uptime = time_uptime;
			globexp = ipfw_gd.ipfw_state_globexp;
			if (globexp == uptime ||
			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
			    globexp, uptime)) {
				ctx->ipfw_sts_overflow++;
				return (NULL);
			}

			/* Expire states on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_state_expire_ipifunc, NULL);
			}
			/* Still over the limit right now; reject. */
			ctx->ipfw_sts_overflow++;
			return (NULL);
		}
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE: /* bidir rule */
	case O_REDIRECT:
		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
		    tcp);
		if (s == NULL)
			return (NULL);
		break;

	case O_LIMIT: /* limit number of sessions */
		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
		if (t == NULL)
			return (NULL);

		/* At the limit; try expiring this track's states first. */
		if (*t->t_count >= cmd->conn_limit) {
			if (!ipfw_track_state_expire(ctx, t, TRUE))
				return (NULL);
		}
		/* Atomically take a slot below the connection limit. */
		for (;;) {
			count = *t->t_count;
			if (count >= cmd->conn_limit)
				return (NULL);
			if (atomic_cmpset_int(t->t_count, count, count + 1))
				break;
		}

		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
		if (s == NULL) {
			/* Undo damage. */
			atomic_subtract_int(t->t_count, 1);
			return (NULL);
		}
		break;

	default:
		panic("unknown state type %u\n", cmd->o.opcode);
	}

	if (s->st_type == O_REDIRECT) {
		/* Record the translation parameters from the rdr insn. */
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;

		x->xlat_addr = r->addr.s_addr;
		x->xlat_port = r->port;
		x->xlat_ifp = args->m->m_pkthdr.rcvif;
		x->xlat_dir = MATCH_FORWARD;
		KKASSERT(x->xlat_ifp != NULL);
	}
	return (s);
}
2841
2842 static int
2843 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2844     const struct in_addr *in)
2845 {
2846         struct radix_node_head *rnh;
2847         struct sockaddr_in sin;
2848         struct ipfw_tblent *te;
2849
2850         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2851         rnh = ctx->ipfw_tables[tableid];
2852         if (rnh == NULL)
2853                 return (0); /* no match */
2854
2855         memset(&sin, 0, sizeof(sin));
2856         sin.sin_family = AF_INET;
2857         sin.sin_len = sizeof(sin);
2858         sin.sin_addr = *in;
2859
2860         te = (struct ipfw_tblent *)rnh->rnh_matchaddr(&sin, rnh);
2861         if (te == NULL)
2862                 return (0); /* no match */
2863
2864         te->te_use++;
2865         te->te_lastuse = time_second;
2866         return (1); /* match */
2867 }
2868
/*
 * Transmit a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because of a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN determines
 * the direction of the keepalive: forward if set, reverse if clear.
 *
 * Only {src,dst}_{ip,port} of "id" are used.
 */
static void
send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
{
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *tcp;
	struct route sro;	/* fake route */

	MGETHDR(m, M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return;
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
	m->m_data += max_linkhdr;

	ip = mtod(m, struct ip *);
	bzero(ip, m->m_len);
	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
	ip->ip_p = IPPROTO_TCP;
	tcp->th_off = 5;

	/*
	 * Assume we are sending a RST (or a keepalive in the reverse
	 * direction), swap src and destination addresses and ports.
	 */
	ip->ip_src.s_addr = htonl(id->dst_ip);
	ip->ip_dst.s_addr = htonl(id->src_ip);
	tcp->th_sport = htons(id->dst_port);
	tcp->th_dport = htons(id->src_port);
	if (flags & TH_RST) {	/* we are sending a RST */
		if (flags & TH_ACK) {
			/* RST in response to an ACK-bearing segment. */
			tcp->th_seq = htonl(ack);
			tcp->th_ack = htonl(0);
			tcp->th_flags = TH_RST;
		} else {
			/* RST+ACK; a SYN consumes one sequence number. */
			if (flags & TH_SYN)
				seq++;
			tcp->th_seq = htonl(0);
			tcp->th_ack = htonl(seq);
			tcp->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * We are sending a keepalive. flags & TH_SYN determines
		 * the direction, forward if set, reverse if clear.
		 * NOTE: seq and ack are always assumed to be correct
		 * as set by the caller. This may be confusing...
		 */
		if (flags & TH_SYN) {
			/*
			 * we have to rewrite the correct addresses!
			 */
			ip->ip_dst.s_addr = htonl(id->dst_ip);
			ip->ip_src.s_addr = htonl(id->src_ip);
			tcp->th_dport = htons(id->dst_port);
			tcp->th_sport = htons(id->src_port);
		}
		tcp->th_seq = htonl(seq);
		tcp->th_ack = htonl(ack);
		tcp->th_flags = TH_ACK;
	}

	/*
	 * set ip_len to the payload size so we can compute
	 * the tcp checksum on the pseudoheader
	 * XXX check this, could save a couple of words ?
	 */
	ip->ip_len = htons(sizeof(struct tcphdr));
	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);

	/*
	 * now fill fields left out earlier (real ip_len and ttl)
	 */
	ip->ip_ttl = ip_defttl;
	ip->ip_len = htons(m->m_pkthdr.len);

	bzero(&sro, sizeof(sro));
	ip_rtaddr(ip->ip_dst, &sro);

	/* Mark as generated by ipfw, so it is not filtered again. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
	ip_output(m, NULL, &sro, 0, NULL, NULL);
	if (sro.ro_rt)
		RTFREE(sro.ro_rt);
}
2961
2962 /*
2963  * Send a reject message, consuming the mbuf passed as an argument.
2964  */
2965 static void
2966 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2967 {
2968         if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2969                 /* IP header is always left in network order */
2970                 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2971         } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2972                 struct tcphdr *const tcp =
2973                     L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2974
2975                 if ((tcp->th_flags & TH_RST) == 0) {
2976                         send_pkt(&args->f_id, ntohl(tcp->th_seq),
2977                                  ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2978                 }
2979                 m_freem(args->m);
2980         } else {
2981                 m_freem(args->m);
2982         }
2983         args->m = NULL;
2984 }
2985
2986 /*
2987  * Given an ip_fw *, lookup_next_rule will return a pointer
2988  * to the next rule, which can be either the jump
2989  * target (for skipto instructions) or the next one in the list (in
2990  * all other cases including a missing jump target).
2991  * The result is also written in the "next_rule" field of the rule.
2992  * Backward jumps are not allowed, so start looking from the next
2993  * rule...
2994  *
2995  * This never returns NULL -- in case we do not have an exact match,
2996  * the next rule is returned. When the ruleset is changed,
2997  * pointers are flushed so we are always correct.
2998  */
2999 static struct ip_fw *
3000 lookup_next_rule(struct ip_fw *me)
3001 {
3002         struct ip_fw *rule = NULL;
3003         ipfw_insn *cmd;
3004
3005         /* look for action, in case it is a skipto */
3006         cmd = ACTION_PTR(me);
3007         if (cmd->opcode == O_LOG)
3008                 cmd += F_LEN(cmd);
3009         if (cmd->opcode == O_SKIPTO) {
3010                 for (rule = me->next; rule; rule = rule->next) {
3011                         if (rule->rulenum >= cmd->arg1)
3012                                 break;
3013                 }
3014         }
3015         if (rule == NULL)                       /* failure or not a skipto */
3016                 rule = me->next;
3017         me->next_rule = rule;
3018         return rule;
3019 }
3020
3021 static int
3022 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3023                 enum ipfw_opcodes opcode, uid_t uid)
3024 {
3025         struct in_addr src_ip, dst_ip;
3026         struct inpcbinfo *pi;
3027         boolean_t wildcard;
3028         struct inpcb *pcb;
3029
3030         if (fid->proto == IPPROTO_TCP) {
3031                 wildcard = FALSE;
3032                 pi = &tcbinfo[mycpuid];
3033         } else if (fid->proto == IPPROTO_UDP) {
3034                 wildcard = TRUE;
3035                 pi = &udbinfo[mycpuid];
3036         } else {
3037                 return 0;
3038         }
3039
3040         /*
3041          * Values in 'fid' are in host byte order
3042          */
3043         dst_ip.s_addr = htonl(fid->dst_ip);
3044         src_ip.s_addr = htonl(fid->src_ip);
3045         if (oif) {
3046                 pcb = in_pcblookup_hash(pi,
3047                         dst_ip, htons(fid->dst_port),
3048                         src_ip, htons(fid->src_port),
3049                         wildcard, oif);
3050         } else {
3051                 pcb = in_pcblookup_hash(pi,
3052                         src_ip, htons(fid->src_port),
3053                         dst_ip, htons(fid->dst_port),
3054                         wildcard, NULL);
3055         }
3056         if (pcb == NULL || pcb->inp_socket == NULL)
3057                 return 0;
3058
3059         if (opcode == O_UID) {
3060 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3061                 return !socheckuid(pcb->inp_socket, uid);
3062 #undef socheckuid
3063         } else  {
3064                 return groupmember(uid, pcb->inp_socket->so_cred);
3065         }
3066 }
3067
3068 static int
3069 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3070 {
3071
3072         if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3073                 struct ifaddr_container *ifac;
3074                 struct ifnet *ifp;
3075
3076                 ifp = ifunit_netisr(cmd->ifname);
3077                 if (ifp == NULL)
3078                         return (0);
3079
3080                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3081                         struct ifaddr *ia = ifac->ifa;
3082
3083                         if (ia->ifa_addr == NULL)
3084                                 continue;
3085                         if (ia->ifa_addr->sa_family != AF_INET)
3086                                 continue;
3087
3088                         cmd->mask.s_addr = INADDR_ANY;
3089                         if (cmd->o.arg1 & IPFW_IFIP_NET) {
3090                                 cmd->mask = ((struct sockaddr_in *)
3091                                     ia->ifa_netmask)->sin_addr;
3092                         }
3093                         if (cmd->mask.s_addr == INADDR_ANY)
3094                                 cmd->mask.s_addr = INADDR_BROADCAST;
3095
3096                         cmd->addr =
3097                             ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3098                         cmd->addr.s_addr &= cmd->mask.s_addr;
3099
3100                         cmd->o.arg1 |= IPFW_IFIP_VALID;
3101                         break;
3102                 }
3103                 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3104                         return (0);
3105         }
3106         return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3107 }
3108
/*
 * Rewrite the source or destination address (and optionally port) of
 * the TCP/UDP packet in 'm' according to the translation state 'x',
 * incrementally fixing up the IP and TCP/UDP checksums.
 *
 * The pre-translation address/port are returned through 'old_addr'/
 * 'old_port' when non-NULL (old_port is 0 when no port translation is
 * configured).  If the mbuf requests checksum offload (or TSO), the
 * transport checksum is simply recomputed as a fresh pseudo-header
 * checksum instead of being incrementally fixed up.
 */
static void
ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
    struct in_addr *old_addr, uint16_t *old_port)
{
	struct ip *ip = mtod(m, struct ip *);
	struct in_addr *addr;
	uint16_t *port, *csum, dlen = 0;
	uint8_t udp = 0;
	boolean_t pseudo = FALSE;

	/* Pick the field pointers on the side being translated. */
	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
		addr = &ip->ip_src;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_sport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_sport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
		}
	} else {
		addr = &ip->ip_dst;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_dport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_dport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
		}
	}
	/* Save the pre-translation values for the caller. */
	if (old_addr != NULL)
		*old_addr = *addr;
	if (old_port != NULL) {
		if (x->xlat_port != 0)
			*old_port = *port;
		else
			*old_port = 0;
	}

	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
		/* Offloaded checksum: recompute pseudo-header sum below. */
		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
			dlen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
		pseudo = TRUE;
	}

	if (!pseudo) {
		/* Incrementally fix up both checksums, 16 bits at a time. */
		const uint16_t *oaddr, *naddr;

		oaddr = (const uint16_t *)&addr->s_addr;
		naddr = (const uint16_t *)&x->xlat_addr;

		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
	}
	addr->s_addr = x->xlat_addr;

	if (x->xlat_port != 0) {
		if (!pseudo) {
			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
			    udp);
		}
		*port = x->xlat_port;
	}

	if (pseudo) {
		/* Fresh pseudo-header checksum over the new addresses. */
		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(dlen + ip->ip_p));
	}
}
3191
/*
 * Netisr handler on the target cpu for a redispatched translated
 * packet: record the continue rule/xlat in the per-cpu context, feed
 * the packet back into the IP input or output path, then drop the
 * references taken by ipfw_xlate_redispatch().
 */
static void
ipfw_ip_xlate_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ipfw_xlat *x = nm->arg1;
	struct ip_fw *rule = x->xlat_rule;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	KASSERT(ctx->ipfw_cont_xlat == NULL,
	    ("pending ipfw continue xlat"));
	ctx->ipfw_cont_rule = rule;
	ctx->ipfw_cont_xlat = x;

	/* arg2 != 0 means the packet takes the output path. */
	if (nm->arg2 == 0)
		ip_input(m);
	else
		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);

	/* May not be cleared, if ipfw was unload/disabled. */
	ctx->ipfw_cont_rule = NULL;
	ctx->ipfw_cont_xlat = NULL;

	/*
	 * This state is no longer used; decrement its xlat_crefs,
	 * so this state can be deleted.
	 */
	x->xlat_crefs--;
	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 *
	 * NOTE:
	 * Decrement cross_refs in the last step of this function,
	 * so that the module could be unloaded safely.
	 */
	rule->cross_refs--;
}
3238
3239 static void
3240 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3241     uint32_t flags)
3242 {
3243         struct netmsg_genpkt *nm;
3244
3245         KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3246             x->xlat_pcpu, cpuid));
3247
3248         /*
3249          * Bump cross_refs to prevent this rule and its siblings
3250          * from being deleted, while this mbuf is inflight.  The
3251          * cross_refs of the sibling rule on the target cpu will
3252          * be decremented, once this mbuf is going to be filtered
3253          * on the target cpu.
3254          */
3255         x->xlat_rule->cross_refs++;
3256         /*
3257          * Bump xlat_crefs to prevent this state and its paired
3258          * state from being deleted, while this mbuf is inflight.
3259          * The xlat_crefs of the paired state on the target cpu
3260          * will be decremented, once this mbuf is going to be
3261          * filtered on the target cpu.
3262          */
3263         x->xlat_crefs++;
3264
3265         m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3266         if (flags & IPFW_XLATE_INSERT)
3267                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3268         if (flags & IPFW_XLATE_FORWARD)
3269                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3270
3271         /*
3272          * NOTE: We always leave ip_len and ip_off in network
3273          *       order across all network layers.
3274          */
3275         nm = &m->m_hdr.mh_genmsg;
3276         netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3277             ipfw_ip_xlate_dispatch);
3278         nm->m = m;
3279         nm->arg1 = x->xlat_pair;
3280         nm->arg2 = 0;
3281         if (flags & IPFW_XLATE_OUTPUT)
3282                 nm->arg2 = 1;
3283         netisr_sendmsg(&nm->base, cpuid);
3284 }
3285
/*
 * Extract the fields used for rule matching from the packet into
 * 'local' and args->f_id.
 *
 * For IPv4 packets (hlen > 0) this records protocol, addresses,
 * fragment offset and total length, and -- for the first fragment of
 * TCP/UDP/ICMP -- pulls up the L4 header to grab ports/flags/type.
 * For non-IP packets (hlen == 0) f_id is marked invalid instead.
 *
 * Returns the (possibly re-allocated) mbuf, or NULL if m_pullup()
 * failed; *ip0 is kept in sync with the returned mbuf (NULL on
 * failure).
 */
static struct mbuf *
ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
    struct ip_fw_local *local, struct ip **ip0)
{
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *tcp;
	struct udphdr *udp;

	/*
	 * Collect parameters into local variables for faster matching.
	 */
	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
		goto done;
	}

	local->proto = args->f_id.proto = ip->ip_p;
	local->src_ip = ip->ip_src;
	local->dst_ip = ip->ip_dst;
	local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
	local->ip_len = ntohs(ip->ip_len);

/*
 * Make sure at least 'len' bytes are contiguous; m_pullup() may
 * replace (or free) the mbuf, so both 'm' and 'ip' are re-fetched.
 * On failure we bail out with m == NULL and ip == NULL.
 */
#define PULLUP_TO(len)					\
do {							\
	if (m->m_len < (len)) {				\
		args->m = m = m_pullup(m, (len));	\
		if (m == NULL) {			\
			ip = NULL;			\
			goto done;			\
		}					\
		ip = mtod(m, struct ip *);		\
	}						\
} while (0)

	/* Only the first fragment carries the L4 header. */
	if (local->offset == 0) {
		switch (local->proto) {
		case IPPROTO_TCP:
			PULLUP_TO(hlen + sizeof(struct tcphdr));
			local->tcp = tcp = L3HDR(struct tcphdr, ip);
			local->dst_port = tcp->th_dport;
			local->src_port = tcp->th_sport;
			args->f_id.flags = tcp->th_flags;
			break;

		case IPPROTO_UDP:
			PULLUP_TO(hlen + sizeof(struct udphdr));
			udp = L3HDR(struct udphdr, ip);
			local->dst_port = udp->uh_dport;
			local->src_port = udp->uh_sport;
			break;

		case IPPROTO_ICMP:
			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
			/* f_id.flags carries the ICMP type for ICMP. */
			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
			break;

		default:
			break;
		}
	}

#undef PULLUP_TO

	/* f_id and local ports are kept in host byte order. */
	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
	args->f_id.src_port = local->src_port = ntohs(local->src_port);
	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
done:
	*ip0 = ip;
	return (m);
}
3357
3358 static struct mbuf *
3359 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3360     struct ip_fw_local *local, struct ip **ip0)
3361 {
3362         m->m_flags &= ~M_HASH;
3363         ip_hashfn(&m, 0);
3364         args->m = m;
3365         if (m == NULL) {
3366                 *ip0 = NULL;
3367                 return (NULL);
3368         }
3369         KASSERT(m->m_flags & M_HASH, ("no hash"));
3370
3371         /* 'm' might be changed by ip_hashfn(). */
3372         return (ipfw_setup_local(m, hlen, args, local, ip0));
3373 }
3374
3375 /*
3376  * The main check routine for the firewall.
3377  *
3378  * All arguments are in args so we can modify them and return them
3379  * back to the caller.
3380  *
3381  * Parameters:
3382  *
3383  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
3384  *              Starts with the IP header.
3385  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
3386  *      args->oif       Outgoing interface, or NULL if packet is incoming.
3387  *              The incoming interface is in the mbuf. (in)
3388  *
3389  *      args->rule      Pointer to the last matching rule (in/out)
3390  *      args->f_id      Addresses grabbed from the packet (out)
3391  *
3392  * Return value:
3393  *
3394  *      If the packet was denied/rejected and has been dropped, *m is equal
3395  *      to NULL upon return.
3396  *
3397  *      IP_FW_DENY      the packet must be dropped.
3398  *      IP_FW_PASS      The packet is to be accepted and routed normally.
3399  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
3400  *      IP_FW_TEE       Tee the packet to port (args->cookie)
3401  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
3402  *      IP_FW_CONTINUE  Continue processing on another cpu.
3403  */
3404 static int
3405 ipfw_chk(struct ip_fw_args *args)
3406 {
3407         /*
3408          * Local variables hold state during the processing of a packet.
3409          *
3410          * IMPORTANT NOTE: to speed up the processing of rules, there
3411          * are some assumption on the values of the variables, which
3412          * are documented here. Should you change them, please check
3413          * the implementation of the various instructions to make sure
3414          * that they still work.
3415          *
3416          * args->eh     The MAC header. It is non-null for a layer2
3417          *      packet, it is NULL for a layer-3 packet.
3418          *
3419          * m | args->m  Pointer to the mbuf, as received from the caller.
3420          *      It may change if ipfw_chk() does an m_pullup, or if it
3421          *      consumes the packet because it calls send_reject().
3422          *      XXX This has to change, so that ipfw_chk() never modifies
3423          *      or consumes the buffer.
3424          * ip   is simply an alias of the value of m, and it is kept
3425          *      in sync with it (the packet is  supposed to start with
3426          *      the ip header).
3427          */
3428         struct mbuf *m = args->m;
3429         struct ip *ip = mtod(m, struct ip *);
3430
3431         /*
3432          * oif | args->oif      If NULL, ipfw_chk has been called on the
3433          *      inbound path (ether_input, ip_input).
3434          *      If non-NULL, ipfw_chk has been called on the outbound path
3435          *      (ether_output, ip_output).
3436          */
3437         struct ifnet *oif = args->oif;
3438
3439         struct ip_fw *f = NULL;         /* matching rule */
3440         int retval = IP_FW_PASS;
3441         struct m_tag *mtag;
3442         struct divert_info *divinfo;
3443         struct ipfw_state *s;
3444
3445         /*
3446          * hlen The length of the IPv4 header.
3447          *      hlen >0 means we have an IPv4 packet.
3448          */
3449         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
3450
3451         struct ip_fw_local lc;
3452
3453         /*
3454          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3455          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
3456          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3457          */
3458         int dyn_dir = MATCH_UNKNOWN;
3459         struct ip_fw *dyn_f = NULL;
3460         int cpuid = mycpuid;
3461         struct ipfw_context *ctx;
3462
3463         ASSERT_NETISR_NCPUS(cpuid);
3464         ctx = ipfw_ctx[cpuid];
3465
3466         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3467                 return IP_FW_PASS;      /* accept */
3468
3469         if (args->eh == NULL ||         /* layer 3 packet */
3470             (m->m_pkthdr.len >= sizeof(struct ip) &&
3471              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3472                 hlen = ip->ip_hl << 2;
3473
3474         memset(&lc, 0, sizeof(lc));
3475
3476         m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3477         if (m == NULL)
3478                 goto pullup_failed;
3479
3480         if (args->rule) {
3481                 /*
3482                  * Packet has already been tagged. Look for the next rule
3483                  * to restart processing.
3484                  *
3485                  * If fw_one_pass != 0 then just accept it.
3486                  * XXX should not happen here, but optimized out in
3487                  * the caller.
3488                  */
3489                 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3490                         return IP_FW_PASS;
3491                 args->flags &= ~IP_FWARG_F_CONT;
3492
3493                 /* This rule is being/has been flushed */
3494                 if (ipfw_flushing)
3495                         return IP_FW_DENY;
3496
3497                 KASSERT(args->rule->cpuid == cpuid,
3498                         ("rule used on cpu%d", cpuid));
3499
3500                 /* This rule was deleted */
3501                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3502                         return IP_FW_DENY;
3503
3504                 if (args->xlat != NULL) {
3505                         struct ipfw_xlat *x = args->xlat;
3506
3507                         /* This xlat is being deleted. */
3508                         if (x->xlat_invalid)
3509                                 return IP_FW_DENY;
3510
3511                         f = args->rule;
3512
3513                         dyn_f = f;
3514                         dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3515                             MATCH_FORWARD : MATCH_REVERSE;
3516
3517                         if (args->flags & IP_FWARG_F_XLATINS) {
3518                                 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3519                                     ("not slave %u state", x->xlat_type));
3520                                 s = ipfw_state_link(ctx, &x->xlat_st);
3521                                 if (s != NULL) {
3522                                         ctx->ipfw_xlate_conflicts++;
3523                                         if (IPFW_STATE_ISDEAD(s)) {
3524                                                 ipfw_state_remove(ctx, s);
3525                                                 s = ipfw_state_link(ctx,
3526                                                     &x->xlat_st);
3527                                         }
3528                                         if (s != NULL) {
3529                                                 if (bootverbose) {
3530                                                         kprintf("ipfw: "
3531                                                         "slave %u state "
3532                                                         "conflicts %u state\n",
3533                                                         x->xlat_type,
3534                                                         s->st_type);
3535                                                 }
3536                                                 ipfw_xlat_invalidate(x);
3537                                                 return IP_FW_DENY;
3538                                         }
3539                                         ctx->ipfw_xlate_cresolved++;
3540                                 }
3541                         } else {
3542                                 ipfw_state_update(&args->f_id, dyn_dir,
3543                                     lc.tcp, &x->xlat_st);
3544                         }
3545                 } else {
3546                         /* TODO: setup dyn_f, dyn_dir */
3547
3548                         f = args->rule->next_rule;
3549                         if (f == NULL)
3550                                 f = lookup_next_rule(args->rule);
3551                 }
3552         } else {
3553                 /*
3554                  * Find the starting rule. It can be either the first
3555                  * one, or the one after divert_rule if asked so.
3556                  */
3557                 int skipto;
3558
3559                 KKASSERT((args->flags &
3560                     (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3561                 KKASSERT(args->xlat == NULL);
3562
3563                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3564                 if (mtag != NULL) {
3565                         divinfo = m_tag_data(mtag);
3566                         skipto = divinfo->skipto;
3567                 } else {
3568                         skipto = 0;
3569                 }
3570
3571                 f = ctx->ipfw_layer3_chain;
3572                 if (args->eh == NULL && skipto != 0) {
3573                         /* No skipto during rule flushing */
3574                         if (ipfw_flushing)
3575                                 return IP_FW_DENY;
3576
3577                         if (skipto >= IPFW_DEFAULT_RULE)
3578                                 return IP_FW_DENY; /* invalid */
3579
3580                         while (f && f->rulenum <= skipto)
3581                                 f = f->next;
3582                         if (f == NULL)  /* drop packet */
3583                                 return IP_FW_DENY;
3584                 } else if (ipfw_flushing) {
3585                         /* Rules are being flushed; skip to default rule */
3586                         f = ctx->ipfw_default_rule;
3587                 }
3588         }
3589         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3590                 m_tag_delete(m, mtag);
3591
3592         /*
3593          * Now scan the rules, and parse microinstructions for each rule.
3594          */
3595         for (; f; f = f->next) {
3596                 int l, cmdlen;
3597                 ipfw_insn *cmd;
3598                 int skip_or; /* skip rest of OR block */
3599
3600 again:
3601                 if (ctx->ipfw_set_disable & (1 << f->set)) {
3602                         args->xlat = NULL;
3603                         continue;
3604                 }
3605
3606                 if (args->xlat != NULL) {
3607                         args->xlat = NULL;
3608                         l = f->cmd_len - f->act_ofs;
3609                         cmd = ACTION_PTR(f);
3610                 } else {
3611                         l = f->cmd_len;
3612                         cmd = f->cmd;
3613                 }
3614
3615                 skip_or = 0;
3616                 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3617                         int match;
3618
3619                         /*
3620                          * check_body is a jump target used when we find a
3621                          * CHECK_STATE, and need to jump to the body of
3622                          * the target rule.
3623                          */
3624 check_body:
3625                         cmdlen = F_LEN(cmd);
3626                         /*
3627                          * An OR block (insn_1 || .. || insn_n) has the
3628                          * F_OR bit set in all but the last instruction.
3629                          * The first match will set "skip_or", and cause
3630                          * the following instructions to be skipped until
3631                          * past the one with the F_OR bit clear.
3632                          */
3633                         if (skip_or) {          /* skip this instruction */
3634                                 if ((cmd->len & F_OR) == 0)
3635                                         skip_or = 0;    /* next one is good */
3636                                 continue;
3637                         }
3638                         match = 0; /* set to 1 if we succeed */
3639
3640                         switch (cmd->opcode) {
3641                         /*
3642                          * The first set of opcodes compares the packet's
3643                          * fields with some pattern, setting 'match' if a
3644                          * match is found. At the end of the loop there is
3645                          * logic to deal with F_NOT and F_OR flags associated
3646                          * with the opcode.
3647                          */
3648                         case O_NOP:
3649                                 match = 1;
3650                                 break;
3651
3652                         case O_FORWARD_MAC:
3653                                 kprintf("ipfw: opcode %d unimplemented\n",
3654                                         cmd->opcode);
3655                                 break;
3656
3657                         case O_GID:
3658                         case O_UID:
3659                                 /*
3660                                  * We only check offset == 0 && proto != 0,
3661                                  * as this ensures that we have an IPv4
3662                                  * packet with the ports info.
3663                                  */
3664                                 if (lc.offset!=0)
3665                                         break;
3666
3667                                 match = ipfw_match_uid(&args->f_id, oif,
3668                                         cmd->opcode,
3669                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3670                                 break;
3671
3672                         case O_RECV:
3673                                 match = iface_match(m->m_pkthdr.rcvif,
3674                                     (ipfw_insn_if *)cmd);
3675                                 break;
3676
3677                         case O_XMIT:
3678                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3679                                 break;
3680
3681                         case O_VIA:
3682                                 match = iface_match(oif ? oif :
3683                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3684                                 break;
3685
3686                         case O_MACADDR2:
3687                                 if (args->eh != NULL) { /* have MAC header */
3688                                         uint32_t *want = (uint32_t *)
3689                                                 ((ipfw_insn_mac *)cmd)->addr;
3690                                         uint32_t *mask = (uint32_t *)
3691                                                 ((ipfw_insn_mac *)cmd)->mask;
3692                                         uint32_t *hdr = (uint32_t *)args->eh;
3693
3694                                         match =
3695                                         (want[0] == (hdr[0] & mask[0]) &&
3696                                          want[1] == (hdr[1] & mask[1]) &&
3697                                          want[2] == (hdr[2] & mask[2]));
3698                                 }
3699                                 break;
3700
3701                         case O_MAC_TYPE:
3702                                 if (args->eh != NULL) {
3703                                         uint16_t t =
3704                                             ntohs(args->eh->ether_type);
3705                                         uint16_t *p =
3706                                             ((ipfw_insn_u16 *)cmd)->ports;
3707                                         int i;
3708
3709                                         /* Special vlan handling */
3710                                         if (m->m_flags & M_VLANTAG)
3711                                                 t = ETHERTYPE_VLAN;
3712
3713                                         for (i = cmdlen - 1; !match && i > 0;
3714                                              i--, p += 2) {
3715                                                 match =
3716                                                 (t >= p[0] && t <= p[1]);
3717                                         }
3718                                 }
3719                                 break;
3720
3721                         case O_FRAG:
3722                                 match = (hlen > 0 && lc.offset != 0);
3723                                 break;
3724
3725                         case O_IPFRAG:
3726                                 if (hlen > 0) {
3727                                         uint16_t off;
3728
3729                                         off = ntohs(ip->ip_off);
3730                                         if (off & (IP_MF | IP_OFFMASK))
3731                                                 match = 1;
3732                                 }
3733                                 break;
3734
3735                         case O_IN:      /* "out" is "not in" */
3736                                 match = (oif == NULL);
3737                                 break;
3738
3739                         case O_LAYER2:
3740                                 match = (args->eh != NULL);
3741                                 break;
3742
3743                         case O_PROTO:
3744                                 /*
3745                                  * We do not allow an arg of 0 so the
3746                                  * check of "proto" only suffices.
3747                                  */
3748                                 match = (lc.proto == cmd->arg1);
3749                                 break;
3750
3751                         case O_IP_SRC:
3752                                 match = (hlen > 0 &&
3753                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3754                                     lc.src_ip.s_addr);
3755                                 break;
3756
3757                         case O_IP_SRC_MASK:
3758                                 match = (hlen > 0 &&
3759                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3760                                      (lc.src_ip.s_addr &
3761                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3762                                 break;
3763
3764                         case O_IP_SRC_ME:
3765                                 if (hlen > 0) {
3766                                         struct ifnet *tif;
3767
3768                                         tif = INADDR_TO_IFP(&lc.src_ip);
3769                                         match = (tif != NULL);
3770                                 }
3771                                 break;
3772
3773                         case O_IP_SRC_TABLE:
3774                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3775                                     &lc.src_ip);
3776                                 break;
3777
3778                         case O_IP_SRC_IFIP:
3779                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3780                                     &lc.src_ip);
3781                                 break;
3782
3783                         case O_IP_DST_SET:
3784                         case O_IP_SRC_SET:
3785                                 if (hlen > 0) {
3786                                         uint32_t *d = (uint32_t *)(cmd + 1);
3787                                         uint32_t addr =
3788                                             cmd->opcode == O_IP_DST_SET ?
3789                                                 args->f_id.dst_ip :
3790                                                 args->f_id.src_ip;
3791
3792                                         if (addr < d[0])
3793                                                 break;
3794                                         addr -= d[0]; /* subtract base */
3795                                         match =
3796                                         (addr < cmd->arg1) &&
3797                                          (d[1 + (addr >> 5)] &
3798                                           (1 << (addr & 0x1f)));
3799                                 }
3800                                 break;
3801
3802                         case O_IP_DST:
3803                                 match = (hlen > 0 &&
3804                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3805                                     lc.dst_ip.s_addr);
3806                                 break;
3807
3808                         case O_IP_DST_MASK:
3809                                 match = (hlen > 0) &&
3810                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3811                                      (lc.dst_ip.s_addr &
3812                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3813                                 break;
3814
3815                         case O_IP_DST_ME:
3816                                 if (hlen > 0) {
3817                                         struct ifnet *tif;
3818
3819                                         tif = INADDR_TO_IFP(&lc.dst_ip);
3820                                         match = (tif != NULL);
3821                                 }
3822                                 break;
3823
3824                         case O_IP_DST_TABLE:
3825                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3826                                     &lc.dst_ip);
3827                                 break;
3828
3829                         case O_IP_DST_IFIP:
3830                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3831                                     &lc.dst_ip);
3832                                 break;
3833
3834                         case O_IP_SRCPORT:
3835                         case O_IP_DSTPORT:
3836                                 /*
3837                                  * offset == 0 && proto != 0 is enough
3838                                  * to guarantee that we have an IPv4
3839                                  * packet with port info.
3840                                  */
3841                                 if ((lc.proto==IPPROTO_UDP ||
3842                                      lc.proto==IPPROTO_TCP)
3843                                     && lc.offset == 0) {
3844                                         uint16_t x =
3845                                             (cmd->opcode == O_IP_SRCPORT) ?
3846                                                 lc.src_port : lc.dst_port;
3847                                         uint16_t *p =
3848                                             ((ipfw_insn_u16 *)cmd)->ports;
3849                                         int i;
3850
3851                                         for (i = cmdlen - 1; !match && i > 0;
3852                                              i--, p += 2) {
3853                                                 match =
3854                                                 (x >= p[0] && x <= p[1]);
3855                                         }
3856                                 }
3857                                 break;
3858
3859                         case O_ICMPCODE:
3860                                 match = (lc.offset == 0 &&
3861                                     lc.proto==IPPROTO_ICMP &&
3862                                     icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3863                                 break;
3864
3865                         case O_ICMPTYPE:
3866                                 match = (lc.offset == 0 &&
3867                                     lc.proto==IPPROTO_ICMP &&
3868                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3869                                 break;
3870
3871                         case O_IPOPT:
3872                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3873                                 break;
3874
3875                         case O_IPVER:
3876                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3877                                 break;
3878
3879                         case O_IPTTL:
3880                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3881                                 break;
3882
3883                         case O_IPID:
3884                                 match = (hlen > 0 &&
3885                                     cmd->arg1 == ntohs(ip->ip_id));
3886                                 break;
3887
3888                         case O_IPLEN:
3889                                 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3890                                 break;
3891
3892                         case O_IPPRECEDENCE:
3893                                 match = (hlen > 0 &&
3894                                     (cmd->arg1 == (ip->ip_tos & 0xe0)));
3895                                 break;
3896
3897                         case O_IPTOS:
3898                                 match = (hlen > 0 &&
3899                                     flags_match(cmd, ip->ip_tos));
3900                                 break;
3901
3902                         case O_TCPFLAGS:
3903                                 match = (lc.proto == IPPROTO_TCP &&
3904                                     lc.offset == 0 &&
3905                                     flags_match(cmd,
3906                                         L3HDR(struct tcphdr,ip)->th_flags));
3907                                 break;
3908
3909                         case O_TCPOPTS:
3910                                 match = (lc.proto == IPPROTO_TCP &&
3911                                     lc.offset == 0 && tcpopts_match(ip, cmd));
3912                                 break;
3913
3914                         case O_TCPSEQ:
3915                                 match = (lc.proto == IPPROTO_TCP &&
3916                                     lc.offset == 0 &&
3917                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3918                                         L3HDR(struct tcphdr,ip)->th_seq);
3919                                 break;
3920
3921                         case O_TCPACK:
3922                                 match = (lc.proto == IPPROTO_TCP &&
3923                                     lc.offset == 0 &&
3924                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3925                                         L3HDR(struct tcphdr,ip)->th_ack);
3926                                 break;
3927
3928                         case O_TCPWIN:
3929                                 match = (lc.proto == IPPROTO_TCP &&
3930                                     lc.offset == 0 &&
3931                                     cmd->arg1 ==
3932                                         L3HDR(struct tcphdr,ip)->th_win);
3933                                 break;
3934
3935                         case O_ESTAB:
3936                                 /* reject packets which have SYN only */
3937                                 /* XXX should i also check for TH_ACK ? */
3938                                 match = (lc.proto == IPPROTO_TCP &&
3939                                     lc.offset == 0 &&
3940                                     (L3HDR(struct tcphdr,ip)->th_flags &
3941                                      (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3942                                 break;
3943
3944                         case O_LOG:
3945                                 if (fw_verbose) {
3946                                         ipfw_log(ctx, f, hlen, args->eh, m,
3947                                             oif);
3948                                 }
3949                                 match = 1;
3950                                 break;
3951
3952                         case O_PROB:
3953                                 match = (krandom() <
3954                                         ((ipfw_insn_u32 *)cmd)->d[0]);
3955                                 break;
3956
3957                         /*
3958                          * The second set of opcodes represents 'actions',
3959                          * i.e. the terminal part of a rule once the packet
3960                          * matches all previous patterns.
3961                          * Typically there is only one action for each rule,
3962                          * and the opcode is stored at the end of the rule
3963                          * (but there are exceptions -- see below).
3964                          *
3965                          * In general, here we set retval and terminate the
3966                          * outer loop (would be a 'break 3' in some language,
3967                          * but we need to do a 'goto done').
3968                          *
3969                          * Exceptions:
3970                          * O_COUNT and O_SKIPTO actions:
3971                          *   instead of terminating, we jump to the next rule
3972                          *   ('goto next_rule', equivalent to a 'break 2'),
3973                          *   or to the SKIPTO target ('goto again' after
3974                          *   having set f, cmd and l), respectively.
3975                          *
3976                          * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
3977                          *   are not real 'actions', and are stored right
3978                          *   before the 'action' part of the rule.
3979                          *   These opcodes try to install an entry in the
3980                          *   state tables; if successful, we continue with
3981                          *   the next opcode (match=1; break;), otherwise
3982                          *   the packet must be dropped ('goto done' after
3983                          *   setting retval).  If static rules are changed
3984                          *   during the state installation, the packet will
3985                          *   be dropped and rule's stats will not beupdated
3986                          *   ('return IP_FW_DENY').
3987                          *
3988                          * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3989                          *   cause a lookup of the state table, and a jump
3990                          *   to the 'action' part of the parent rule
3991                          *   ('goto check_body') if an entry is found, or
3992                          *   (CHECK_STATE only) a jump to the next rule if
3993                          *   the entry is not found ('goto next_rule').
3994                          *   The result of the lookup is cached to make
3995                          *   further instances of these opcodes are
3996                          *   effectively NOPs.  If static rules are changed
3997                          *   during the state looking up, the packet will
3998                          *   be dropped and rule's stats will not be updated
3999                          *   ('return IP_FW_DENY').
4000                          */
4001                         case O_REDIRECT:
4002                                 if (f->cross_rules == NULL) {
4003                                         /*
4004                                          * This rule was not completely setup;
4005                                          * move on to the next rule.
4006                                          */
4007                                         goto next_rule;
4008                                 }
4009                                 /*
4010                                  * Apply redirect only on input path and
4011                                  * only to non-fragment TCP segments or
4012                                  * UDP datagrams.
4013                                  *
4014                                  * Does _not_ work with layer2 filtering.
4015                                  */
4016                                 if (oif != NULL || args->eh != NULL ||
4017                                     (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4018                                     (lc.proto != IPPROTO_TCP &&
4019                                      lc.proto != IPPROTO_UDP))
4020                                         break;
4021                                 /* FALL THROUGH */
4022                         case O_LIMIT:
4023                         case O_KEEP_STATE:
4024                                 if (hlen == 0)
4025                                         break;
4026                                 s = ipfw_state_install(ctx, f,
4027                                     (ipfw_insn_limit *)cmd, args, lc.tcp);
4028                                 if (s == NULL) {
4029                                         retval = IP_FW_DENY;
4030                                         goto done; /* error/limit violation */
4031                                 }
4032                                 s->st_pcnt++;
4033                                 s->st_bcnt += lc.ip_len;
4034
4035                                 if (s->st_type == O_REDIRECT) {
4036                                         struct in_addr oaddr;
4037                                         uint16_t oport;
4038                                         struct ipfw_xlat *slave_x, *x;
4039                                         struct ipfw_state *dup;
4040
4041                                         x = (struct ipfw_xlat *)s;
4042                                         ipfw_xlate(x, m, &oaddr, &oport);
4043                                         m = ipfw_rehashm(m, hlen, args, &lc,
4044                                             &ip);
4045                                         if (m == NULL) {
4046                                                 ipfw_state_del(ctx, s);
4047                                                 goto pullup_failed;
4048                                         }
4049
4050                                         cpuid = netisr_hashcpu(
4051                                             m->m_pkthdr.hash);
4052
4053                                         slave_x = (struct ipfw_xlat *)
4054                                             ipfw_state_alloc(ctx, &args->f_id,
4055                                             O_REDIRECT, f->cross_rules[cpuid],
4056                                             lc.tcp);
4057                                         if (slave_x == NULL) {
4058                                                 ipfw_state_del(ctx, s);
4059                                                 retval = IP_FW_DENY;
4060                                                 goto done;
4061                                         }
4062                                         slave_x->xlat_addr = oaddr.s_addr;
4063                                         slave_x->xlat_port = oport;
4064                                         slave_x->xlat_dir = MATCH_REVERSE;
4065                                         slave_x->xlat_flags |=
4066                                             IPFW_STATE_F_XLATSRC |
4067                                             IPFW_STATE_F_XLATSLAVE;
4068
4069                                         slave_x->xlat_pair = x;
4070                                         slave_x->xlat_pcpu = mycpuid;
4071                                         x->xlat_pair = slave_x;
4072                                         x->xlat_pcpu = cpuid;
4073
4074                                         ctx->ipfw_xlated++;
4075                                         if (cpuid != mycpuid) {
4076                                                 ctx->ipfw_xlate_split++;
4077                                                 ipfw_xlate_redispatch(
4078                                                     m, cpuid, x,
4079                                                     IPFW_XLATE_INSERT |
4080                                                     IPFW_XLATE_FORWARD);
4081                                                 args->m = NULL;
4082                                                 return (IP_FW_REDISPATCH);
4083                                         }
4084
4085                                         dup = ipfw_state_link(ctx,
4086                                             &slave_x->xlat_st);
4087                                         if (dup != NULL) {
4088                                                 ctx->ipfw_xlate_conflicts++;
4089                                                 if (IPFW_STATE_ISDEAD(dup)) {
4090                                                         ipfw_state_remove(ctx,
4091                                                             dup);
4092                                                         dup = ipfw_state_link(
4093                                                         ctx, &slave_x->xlat_st);
4094                                                 }
4095                                                 if (dup != NULL) {
4096                                                         if (bootverbose) {
4097                                                             kprintf("ipfw: "
4098                                                             "slave %u state "
4099                                                             "conflicts "
4100                                                             "%u state\n",
4101                                                             x->xlat_type,
4102                                                             s->st_type);
4103                                                         }
4104                                                         ipfw_state_del(ctx, s);
4105                                                         return (IP_FW_DENY);
4106                                                 }
4107                                                 ctx->ipfw_xlate_cresolved++;
4108                                         }
4109                                 }
4110                                 match = 1;
4111                                 break;
4112
4113                         case O_PROBE_STATE:
4114                         case O_CHECK_STATE:
4115                                 /*
4116                                  * States are checked at the first keep-state
4117                                  * check-state occurrence, with the result
4118                                  * being stored in dyn_dir.  The compiler
4119                                  * introduces a PROBE_STATE instruction for
4120                                  * us when we have a KEEP_STATE/LIMIT/RDR
4121                                  * (because PROBE_STATE needs to be run first).
4122                                  */
4123                                 s = NULL;
4124                                 if (dyn_dir == MATCH_UNKNOWN) {
4125                                         s = ipfw_state_lookup(ctx,
4126                                             &args->f_id, &dyn_dir, lc.tcp);
4127                                 }
4128                                 if (s == NULL ||
4129                                     (s->st_type == O_REDIRECT &&
4130                                      (args->eh != NULL ||
4131                                       (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4132                                       (lc.proto != IPPROTO_TCP &&
4133                                        lc.proto != IPPROTO_UDP)))) {
4134                                         /*
4135                                          * State not found. If CHECK_STATE,
4136                                          * skip to next rule, if PROBE_STATE
4137                                          * just ignore and continue with next
4138                                          * opcode.
4139                                          */
4140                                         if (cmd->opcode == O_CHECK_STATE)
4141                                                 goto next_rule;
4142                                         match = 1;
4143                                         break;
4144                                 }
4145
4146                                 s->st_pcnt++;
4147                                 s->st_bcnt += lc.ip_len;
4148
4149                                 if (s->st_type == O_REDIRECT) {
4150                                         struct ipfw_xlat *x =
4151                                             (struct ipfw_xlat *)s;
4152
4153                                         if (oif != NULL &&
4154                                             x->xlat_ifp == NULL) {
4155                                                 KASSERT(x->xlat_flags &
4156                                                     IPFW_STATE_F_XLATSLAVE,
4157                                                     ("master rdr state "
4158                                                      "missing ifp"));
4159                                                 x->xlat_ifp = oif;
4160                                         } else if (
4161                                             (oif != NULL && x->xlat_ifp!=oif) ||
4162                                             (oif == NULL &&
4163                                              x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4164                                                 retval = IP_FW_DENY;
4165                                                 goto done;
4166                                         }
4167                                         if (x->xlat_dir != dyn_dir)
4168                                                 goto skip_xlate;
4169
4170                                         ipfw_xlate(x, m, NULL, NULL);
4171                                         m = ipfw_rehashm(m, hlen, args, &lc,
4172                                             &ip);
4173                                         if (m == NULL)
4174                                                 goto pullup_failed;
4175
4176                                         cpuid = netisr_hashcpu(
4177                                             m->m_pkthdr.hash);
4178                                         if (cpuid != mycpuid) {
4179                                                 uint32_t xlate = 0;
4180
4181                                                 if (oif != NULL) {
4182                                                         xlate |=
4183                                                             IPFW_XLATE_OUTPUT;
4184                                                 }
4185                                                 if (dyn_dir == MATCH_FORWARD) {
4186                                                         xlate |=
4187                                                             IPFW_XLATE_FORWARD;
4188                                                 }
4189                                                 ipfw_xlate_redispatch(m, cpuid,
4190                                                     x, xlate);
4191                                                 args->m = NULL;
4192                                                 return (IP_FW_REDISPATCH);
4193                                         }
4194
4195                                         KKASSERT(x->xlat_pcpu == mycpuid);
4196                                         ipfw_state_update(&args->f_id, dyn_dir,
4197                                             lc.tcp, &x->xlat_pair->xlat_st);
4198                                 }
4199 skip_xlate:
4200                                 /*
4201                                  * Found a rule from a state; jump to the
4202                                  * 'action' part of the rule.
4203                                  */
4204                                 f = s->st_rule;
4205                                 KKASSERT(f->cpuid == mycpuid);
4206
4207                                 cmd = ACTION_PTR(f);
4208                                 l = f->cmd_len - f->act_ofs;
4209                                 dyn_f = f;
4210                                 goto check_body;
4211
4212                         case O_ACCEPT:
4213                                 retval = IP_FW_PASS;    /* accept */
4214                                 goto done;
4215
4216                         case O_DEFRAG:
4217                                 if (f->cross_rules == NULL) {
4218                                         /*
4219                                          * This rule was not completely setup;
4220                                          * move on to the next rule.
4221                                          */
4222                                         goto next_rule;
4223                                 }
4224
4225                                 /*
4226                                  * Don't defrag for l2 packets, output packets
4227                                  * or non-fragments.
4228                                  */
4229                                 if (oif != NULL || args->eh != NULL ||
4230                                     (ip->ip_off & htons(IP_MF | IP_OFFMASK)) == 0)
4231                                         goto next_rule;
4232
4233                                 ctx->ipfw_frags++;
4234                                 m = ip_reass(m);
4235                                 args->m = m;
4236                                 if (m == NULL) {
4237                                         retval = IP_FW_PASS;
4238                                         goto done;
4239                                 }
4240                                 ctx->ipfw_defraged++;
4241                                 KASSERT((m->m_flags & M_HASH) == 0,
4242                                     ("hash not cleared"));
4243
4244                                 /* Update statistics */
4245                                 f->pcnt++;
4246                                 f->bcnt += lc.ip_len;
4247                                 f->timestamp = time_second;
4248
4249                                 ip = mtod(m, struct ip *);
4250                                 hlen = ip->ip_hl << 2;
4251                                 ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
4252
4253                                 ip_hashfn(&m, 0);
4254                                 args->m = m;
4255                                 if (m == NULL)
4256                                         goto pullup_failed;
4257
4258                                 KASSERT(m->m_flags & M_HASH, ("no hash"));
4259                                 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4260                                 if (cpuid != mycpuid) {
4261                                         ctx->ipfw_defrag_remote++;
4262                                         ipfw_defrag_redispatch(m, cpuid, f);
4263                                         args->m = NULL;
4264                                         return (IP_FW_REDISPATCH);
4265                                 }
4266
4267                                 /* 'm' might be changed by ip_hashfn(). */
4268                                 ip = mtod(m, struct ip *);
4269
4270                                 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4271                                 if (m == NULL)
4272                                         goto pullup_failed;
4273
4274                                 /* Move on. */
4275                                 goto next_rule;
4276
4277                         case O_PIPE:
4278                         case O_QUEUE:
4279                                 args->rule = f; /* report matching rule */
4280                                 args->cookie = cmd->arg1;
4281                                 retval = IP_FW_DUMMYNET;
4282                                 goto done;
4283
4284                         case O_DIVERT:
4285                         case O_TEE:
4286                                 if (args->eh) /* not on layer 2 */
4287                                         break;
4288
4289                                 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4290                                     sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4291                                 if (mtag == NULL) {
4292                                         retval = IP_FW_DENY;
4293                                         goto done;
4294                                 }
4295                                 divinfo = m_tag_data(mtag);
4296
4297                                 divinfo->skipto = f->rulenum;
4298                                 divinfo->port = cmd->arg1;
4299                                 divinfo->tee = (cmd->opcode == O_TEE);
4300                                 m_tag_prepend(m, mtag);
4301
4302                                 args->cookie = cmd->arg1;
4303                                 retval = (cmd->opcode == O_DIVERT) ?
4304                                          IP_FW_DIVERT : IP_FW_TEE;
4305                                 goto done;
4306
4307                         case O_COUNT:
4308                         case O_SKIPTO:
4309                                 f->pcnt++;      /* update stats */
4310                                 f->bcnt += lc.ip_len;
4311                                 f->timestamp = time_second;
4312                                 if (cmd->opcode == O_COUNT)
4313                                         goto next_rule;
4314                                 /* handle skipto */
4315                                 if (f->next_rule == NULL)
4316                                         lookup_next_rule(f);
4317                                 f = f->next_rule;
4318                                 goto again;
4319
4320                         case O_REJECT:
4321                                 /*
4322                                  * Drop the packet and send a reject notice
4323                                  * if the packet is not ICMP (or is an ICMP
4324                                  * query), and it is not multicast/broadcast.
4325                                  */
4326                                 if (hlen > 0 &&
4327                                     (lc.proto != IPPROTO_ICMP ||
4328                                      is_icmp_query(ip)) &&
4329                                     !(m->m_flags & (M_BCAST|M_MCAST)) &&
4330                                     !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4331                                         send_reject(args, cmd->arg1,
4332                                             lc.offset, lc.ip_len);
4333                                         retval = IP_FW_DENY;
4334                                         goto done;
4335                                 }
4336                                 /* FALLTHROUGH */
4337                         case O_DENY:
4338                                 retval = IP_FW_DENY;
4339                                 goto done;
4340
4341                         case O_FORWARD_IP:
4342                                 if (args->eh)   /* not valid on layer2 pkts */
4343                                         break;
4344                                 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4345                                         struct sockaddr_in *sin;
4346
4347                                         mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4348                                             sizeof(*sin), M_INTWAIT | M_NULLOK);
4349                                         if (mtag == NULL) {
4350                                                 retval = IP_FW_DENY;
4351                                                 goto done;
4352                                         }
4353                                         sin = m_tag_data(mtag);
4354
4355                                         /* Structure copy */
4356                                         *sin = ((ipfw_insn_sa *)cmd)->sa;
4357
4358                                         m_tag_prepend(m, mtag);
4359                                         m->m_pkthdr.fw_flags |=
4360                                                 IPFORWARD_MBUF_TAGGED;
4361                                         m->m_pkthdr.fw_flags &=
4362                                                 ~BRIDGE_MBUF_TAGGED;
4363                                 }
4364                                 retval = IP_FW_PASS;
4365                                 goto done;
4366
4367                         default:
4368                                 panic("-- unknown opcode %d", cmd->opcode);
4369                         } /* end of switch() on opcodes */
4370
4371                         if (cmd->len & F_NOT)
4372                                 match = !match;
4373
4374                         if (match) {
4375                                 if (cmd->len & F_OR)
4376                                         skip_or = 1;
4377                         } else {
4378                                 if (!(cmd->len & F_OR)) /* not an OR block, */
4379                                         break;          /* try next rule    */
4380                         }
4381
4382                 }       /* end of inner for, scan opcodes */
4383
4384 next_rule:;             /* try next rule                */
4385
4386         }               /* end of outer for, scan rules */
4387         kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4388         return IP_FW_DENY;
4389
4390 done:
4391         /* Update statistics */
4392         f->pcnt++;
4393         f->bcnt += lc.ip_len;
4394         f->timestamp = time_second;
4395         return retval;
4396
4397 pullup_failed:
4398         if (fw_verbose)
4399                 kprintf("pullup failed\n");
4400         return IP_FW_DENY;
4401 }
4402
4403 static struct mbuf *
4404 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4405 {
4406         struct m_tag *mtag;
4407         struct dn_pkt *pkt;
4408         ipfw_insn *cmd;
4409         const struct ipfw_flow_id *id;
4410         struct dn_flow_id *fid;
4411
4412         M_ASSERTPKTHDR(m);
4413
4414         mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4415             M_INTWAIT | M_NULLOK);
4416         if (mtag == NULL) {
4417                 m_freem(m);
4418                 return (NULL);
4419         }
4420         m_tag_prepend(m, mtag);
4421
4422         pkt = m_tag_data(mtag);
4423         bzero(pkt, sizeof(*pkt));
4424
4425         cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4426         if (cmd->opcode == O_LOG)
4427                 cmd += F_LEN(cmd);
4428         KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4429                 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4430
4431         pkt->dn_m = m;
4432         pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4433         pkt->ifp = fwa->oif;
4434         pkt->pipe_nr = pipe_nr;
4435
4436         pkt->cpuid = mycpuid;
4437         pkt->msgport = netisr_curport();
4438
4439         id = &fwa->f_id;
4440         fid = &pkt->id;
4441         fid->fid_dst_ip = id->dst_ip;
4442         fid->fid_src_ip = id->src_ip;
4443         fid->fid_dst_port = id->dst_port;
4444         fid->fid_src_port = id->src_port;
4445         fid->fid_proto = id->proto;
4446         fid->fid_flags = id->flags;
4447
4448         ipfw_ref_rule(fwa->rule);
4449         pkt->dn_priv = fwa->rule;
4450         pkt->dn_unref_priv = ipfw_unref_rule;
4451
4452         if (cmd->opcode == O_PIPE)
4453                 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4454
4455         m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4456         return (m);
4457 }
4458
4459 /*
4460  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4461  * These will be reconstructed on the fly as packets are matched.
4462  */
4463 static void
4464 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4465 {
4466         struct ip_fw *rule;
4467
4468         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4469                 rule->next_rule = NULL;
4470 }
4471
4472 static void
4473 ipfw_inc_static_count(struct ip_fw *rule)
4474 {
4475         /* Static rule's counts are updated only on CPU0 */
4476         KKASSERT(mycpuid == 0);
4477
4478         static_count++;
4479         static_ioc_len += IOC_RULESIZE(rule);
4480 }
4481
4482 static void
4483 ipfw_dec_static_count(struct ip_fw *rule)
4484 {
4485         int l = IOC_RULESIZE(rule);
4486
4487         /* Static rule's counts are updated only on CPU0 */
4488         KKASSERT(mycpuid == 0);
4489
4490         KASSERT(static_count > 0, ("invalid static count %u", static_count));
4491         static_count--;
4492
4493         KASSERT(static_ioc_len >= l,
4494                 ("invalid static len %u", static_ioc_len));
4495         static_ioc_len -= l;
4496 }
4497
4498 static void
4499 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4500 {
4501         if (fwmsg->sibling != NULL) {
4502                 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4503                 fwmsg->sibling->sibling = rule;
4504         }
4505         fwmsg->sibling = rule;
4506 }
4507
4508 static struct ip_fw *
4509 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4510 {
4511         struct ip_fw *rule;
4512
4513         rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4514
4515         rule->act_ofs = ioc_rule->act_ofs;
4516         rule->cmd_len = ioc_rule->cmd_len;
4517         rule->rulenum = ioc_rule->rulenum;
4518         rule->set = ioc_rule->set;
4519         rule->usr_flags = ioc_rule->usr_flags;
4520
4521         bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4522
4523         rule->refcnt = 1;
4524         rule->cpuid = mycpuid;
4525         rule->rule_flags = rule_flags;
4526
4527         return rule;
4528 }
4529
/*
 * Per-CPU netisr handler for rule addition.  Runs on each netisr CPU
 * in ascending CPU order (the message is forwarded with
 * netisr_forwardmsg() at the end): each CPU creates its own copy of
 * the rule, inserts it at the pre-determined position in its local
 * rule chain, and links the copy into the cross-CPU "sibling" chain.
 */
static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

	/*
	 * Insert rule into the pre-determined position
	 */
	if (fwmsg->prev_rule != NULL) {
		struct ip_fw *prev, *next;

		/* prev/next were resolved on this CPU's chain beforehand. */
		prev = fwmsg->prev_rule;
		KKASSERT(prev->cpuid == mycpuid);

		next = fwmsg->next_rule;
		KKASSERT(next->cpuid == mycpuid);

		rule->next = next;
		prev->next = rule;

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		fwmsg->prev_rule = prev->sibling;
		fwmsg->next_rule = next->sibling;
	} else {
		/* No insertion point: prepend to the head of the chain. */
		KKASSERT(fwmsg->next_rule == NULL);
		rule->next = ctx->ipfw_layer3_chain;
		ctx->ipfw_layer3_chain = rule;
	}

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, rule);

	/* Invalidate cached skipto targets on this CPU. */
	ipfw_flush_rule_ptrs(ctx);

	if (mycpuid == 0) {
		/* Statistics only need to be updated once */
		ipfw_inc_static_count(rule);

		/* Return the rule on CPU0 */
		nmsg->lmsg.u.ms_resultp = rule;
	}

	/*
	 * ms_resultp holds the CPU0 rule pointer (set above on CPU0,
	 * already present when this runs on later CPUs), so all CPUs
	 * share the same track rule id.
	 */
	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

	if (fwmsg->cross_rules != NULL) {
		/* Save rules for later use. */
		fwmsg->cross_rules[mycpuid] = rule;
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4591
/*
 * Second replication pass for cross-referenced rules: give each CPU's
 * copy of the rule its own array of pointers to all per-CPU copies
 * (gathered in fwmsg->cross_rules by ipfw_add_rule_dispatch()).
 * Walks the rule's sibling chain, one CPU per hop.
 */
static void
ipfw_crossref_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ip_fw *rule = fwmsg->sibling;
	int sz = sizeof(struct ip_fw *) * netisr_ncpus;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
	    ("not crossref rule"));

	/* Private copy of the per-CPU rule pointer table. */
	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);

	/* Advance to this rule's copy on the next CPU, then forward. */
	fwmsg->sibling = rule->sibling;
	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
}
4609
/*
 * Add a new rule to the list.  Copy the rule into a malloc'ed area,
 * then possibly create a rule number and add the rule to the list.
 * Update the rule_number in the input struct so the caller knows
 * it as well.
 *
 * Runs on CPU0.  The rule is replicated onto every netisr CPU by
 * ipfw_add_rule_dispatch(); cross-referenced rules get a second pass
 * through ipfw_crossref_rule_dispatch().
 */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_ipfw fwmsg;
	struct ip_fw *f, *prev, *rule;

	ASSERT_NETISR0;

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add rule number incremental step.
	 */
	if (ioc_rule->rulenum == 0) {
		int step = autoinc_step;

		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
			 step <= IPFW_AUTOINC_STEP_MAX);

		/*
		 * Locate the highest numbered rule before default
		 */
		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			ioc_rule->rulenum = f->rulenum;
		}
		/* Only bump if the result stays below the default rule. */
		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
			ioc_rule->rulenum += step;
	}
	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
		ioc_rule->rulenum != 0,
		("invalid rule num %d", ioc_rule->rulenum));

	/*
	 * Now find the right place for the new rule in the sorted list.
	 */
	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
	     prev = f, f = f->next) {
		if (f->rulenum > ioc_rule->rulenum) {
			/* Found the location */
			break;
		}
	}
	KASSERT(f != NULL, ("no default rule?!"));

	/*
	 * Duplicate the rule onto each CPU.
	 * The rule duplicated on CPU0 will be returned.
	 */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_add_rule_dispatch);
	fwmsg.ioc_rule = ioc_rule;
	fwmsg.prev_rule = prev;
	fwmsg.next_rule = prev == NULL ? NULL : f;
	fwmsg.rule_flags = rule_flags;
	if (rule_flags & IPFW_RULE_F_CROSSREF) {
		/* Temporary array collecting each CPU's copy of the rule. */
		fwmsg.cross_rules = kmalloc(
		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
		    M_WAITOK | M_ZERO);
	}

	netisr_domsg_global(&fwmsg.base);
	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

	rule = fwmsg.base.lmsg.u.ms_resultp;
	KKASSERT(rule != NULL && rule->cpuid == mycpuid);

	if (fwmsg.cross_rules != NULL) {
		/* Second pass: install the cross-rule table on each CPU. */
		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
		fwmsg.sibling = rule;
		netisr_domsg_global(&fwmsg.base);
		KKASSERT(fwmsg.sibling == NULL);

		kfree(fwmsg.cross_rules, M_TEMP);

#ifdef KLD_MODULE
		/* Cross-referenced rules pin the module via ipfw_refcnt. */
		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	}

	DPRINTF("++ installed rule %d, static count now %d\n",
		rule->rulenum, static_count);
}
4702
/*
 * Free storage associated with a static rule (including derived
 * states/tracks).
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
 */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
		 struct ip_fw *prev, struct ip_fw *rule)
{
	struct ip_fw *n;

	/* Unlink the rule from this CPU's chain. */
	n = rule->next;
	if (prev == NULL)
		ctx->ipfw_layer3_chain = n;
	else
		prev->next = n;

	/* Mark the rule as invalid */
	rule->rule_flags |= IPFW_RULE_F_INVALID;
	rule->next_rule = NULL;
	rule->sibling = NULL;
#ifdef foo
	/* Don't reset cpuid here; keep various assertion working */
	rule->cpuid = -1;
#endif

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_dec_static_count(rule);

	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
		/* Try to free this rule */
		ipfw_free_rule(rule);
	} else {
		/*
		 * Cross-referenced rules may still be referenced by
		 * other CPUs; defer them to the global free list,
		 * which is only manipulated on CPU0.
		 */
		/* TODO: check staging area. */
		if (mycpuid == 0) {
			rule->next = ipfw_gd.ipfw_crossref_free;
			ipfw_gd.ipfw_crossref_free = rule;
		}
	}

	/* Return the next rule */
	return n;
}
4750
/*
 * Per-CPU part of ipfw_flush(): tear down this CPU's states, tracks
 * and rules, then forward the message to the next netisr CPU.
 * lmsg.u.ms_result is non-zero when the default rule goes too.
 */
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
	int kill_default = nmsg->lmsg.u.ms_result;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * Flush states.
	 */
	ipfw_state_flush(ctx, NULL);
	KASSERT(ctx->ipfw_state_cnt == 0,
	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
	ctx->ipfw_state_loosecnt = 0;
	ctx->ipfw_state_lastexp = 0;

	/*
	 * Flush tracks.
	 */
	ipfw_track_flush(ctx, NULL);
	ctx->ipfw_track_lastexp = 0;
	if (ctx->ipfw_trkcnt_spare != NULL) {
		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
		ctx->ipfw_trkcnt_spare = NULL;
	}

	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

	/* Delete rules from the head, stopping at the default rule
	 * unless kill_default is set. */
	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
		ipfw_delete_rule(ctx, NULL, rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4787
/*
 * Deletes all rules from a chain (including the default rule
 * if the second argument is set).
 *
 * Runs on CPU0 and fans the flush out to all netisr CPUs via
 * ipfw_flush_dispatch().  Global state/track bookkeeping is reset
 * here after all CPUs have completed.
 */
static void
ipfw_flush(int kill_default)
{
	struct netmsg_base nmsg;
#ifdef INVARIANTS
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int state_cnt;
#endif

	ASSERT_NETISR0;

	/*
	 * If 'kill_default' then caller has done the necessary
	 * msgport syncing; unnecessary to do it again.
	 */
	if (!kill_default) {
		/*
		 * Let ipfw_chk() know the rules are going to
		 * be flushed, so it could jump directly to
		 * the default rule.
		 */
		ipfw_flushing = 1;
		/* XXX use priority sync */
		netmsg_service_sync();
	}

	/*
	 * Press the 'flush' button
	 */
	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_flush_dispatch);
	nmsg.lmsg.u.ms_result = kill_default;
	netisr_domsg_global(&nmsg);
	/* All per-CPU flushes done; reset the global counters. */
	ipfw_gd.ipfw_state_loosecnt = 0;
	ipfw_gd.ipfw_state_globexp = 0;
	ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
	/* Sanity check: nothing but (possibly) the default rule remains. */
	state_cnt = ipfw_state_cntcoll();
	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

	if (kill_default) {
		KASSERT(static_count == 0,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == 0,
			("%u bytes of static rules remain", static_ioc_len));
	} else {
		KASSERT(static_count == 1,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
			("%u bytes of static rules remain, should be %lu",
			 static_ioc_len,
			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
	}
#endif

	/* Flush is done */
	ipfw_flushing = 0;
}
4855
/*
 * Per-CPU part of ipfw_alt_delete_rule(): delete every rule on this
 * CPU that matches dmsg->rulenum, then forward the message to the
 * next netisr CPU.  start_rule/prev_rule are advanced along the
 * sibling chains so each CPU works on its own copies.
 */
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule, *prev;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = dmsg->start_rule;
	KKASSERT(rule->cpuid == mycpuid);
	/* Advance the start rule before it is deleted below. */
	dmsg->start_rule = rule->sibling;

	prev = dmsg->prev_rule;
	if (prev != NULL) {
		KKASSERT(prev->cpuid == mycpuid);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		dmsg->prev_rule = prev->sibling;
	}

	/*
	 * flush pointers outside the loop, then delete all matching
	 * rules.  'prev' remains the same throughout the cycle.
	 */
	ipfw_flush_rule_ptrs(ctx);
	while (rule && rule->rulenum == dmsg->rulenum) {
		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
			/* Flush states generated by this rule. */
			ipfw_state_flush(ctx, rule);
		}
		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
			/* Flush tracks generated by this rule. */
			ipfw_track_flush(ctx, rule);
		}
		rule = ipfw_delete_rule(ctx, prev, rule);
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4899
4900 static int
4901 ipfw_alt_delete_rule(uint16_t rulenum)
4902 {
4903         struct ip_fw *prev, *rule;
4904         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4905         struct netmsg_del dmsg;
4906
4907         ASSERT_NETISR0;
4908
4909         /*
4910          * Locate first rule to delete
4911          */
4912         for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4913              rule && rule->rulenum < rulenum;
4914              prev = rule, rule = rule->next)
4915                 ; /* EMPTY */
4916         if (rule->rulenum != rulenum)
4917                 return EINVAL;
4918
4919         /*
4920          * Get rid of the rule duplications on all CPUs
4921          */
4922         bzero(&dmsg, sizeof(dmsg));
4923         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4924             ipfw_alt_delete_rule_dispatch);
4925         dmsg.prev_rule = prev;
4926         dmsg.start_rule = rule;
4927         dmsg.rulenum = rulenum;
4928
4929         netisr_domsg_global(&dmsg.base);
4930         KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4931         return 0;
4932 }
4933
/*
 * Per-CPU part of ipfw_alt_delete_ruleset(): delete every rule on
 * this CPU belonging to dmsg->from_set, then forward the message to
 * the next netisr CPU.
 */
static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *prev, *rule;
#ifdef INVARIANTS
	int del = 0;
#endif

	ASSERT_NETISR_NCPUS(mycpuid);

	ipfw_flush_rule_ptrs(ctx);

	prev = NULL;
	rule = ctx->ipfw_layer3_chain;
	while (rule != NULL) {
		if (rule->set == dmsg->from_set) {
			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
				/* Flush states generated by this rule. */
				ipfw_state_flush(ctx, rule);
			}
			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
				/* Flush tracks generated by this rule. */
				ipfw_track_flush(ctx, rule);
			}
			/* ipfw_delete_rule() returns the next rule. */
			rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
			del = 1;
#endif
		} else {
			prev = rule;
			rule = rule->next;
		}
	}
	/* CPU0 verified the set is non-empty before dispatching. */
	KASSERT(del, ("no match set?!"));

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4973
4974 static int
4975 ipfw_alt_delete_ruleset(uint8_t set)
4976 {
4977         struct netmsg_del dmsg;
4978         int del;
4979         struct ip_fw *rule;
4980         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4981
4982         ASSERT_NETISR0;
4983
4984         /*
4985          * Check whether the 'set' exists.  If it exists,
4986          * then check whether any rules within the set will
4987          * try to create states.
4988          */
4989         del = 0;
4990         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4991                 if (rule->set == set)
4992                         del = 1;
4993         }
4994         if (!del)
4995                 return 0; /* XXX EINVAL? */
4996
4997         /*
4998          * Delete this set
4999          */
5000         bzero(&dmsg, sizeof(dmsg));
5001         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5002             ipfw_alt_delete_ruleset_dispatch);
5003         dmsg.from_set = set;
5004         netisr_domsg_global(&dmsg.base);
5005
5006         return 0;
5007 }
5008
5009 static void
5010 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5011 {
5012         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5013         struct ip_fw *rule;
5014
5015         ASSERT_NETISR_NCPUS(mycpuid);
5016
5017         rule = dmsg->start_rule;
5018         KKASSERT(rule->cpuid == mycpuid);
5019
5020         /*
5021          * Move to the position on the next CPU
5022          * before the msg is forwarded.
5023          */
5024         dmsg->start_rule = rule->sibling;
5025
5026         while (rule && rule->rulenum <= dmsg->rulenum) {
5027                 if (rule->rulenum == dmsg->rulenum)
5028                         rule->set = dmsg->to_set;
5029                 rule = rule->next;
5030         }
5031         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5032 }
5033
5034 static int
5035 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5036 {
5037         struct netmsg_del dmsg;
5038         struct netmsg_base *nmsg;
5039         struct ip_fw *rule;
5040         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5041
5042         ASSERT_NETISR0;
5043
5044         /*
5045          * Locate first rule to move
5046          */
5047         for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5048              rule = rule->next) {
5049                 if (rule->rulenum == rulenum && rule->set != set)
5050                         break;
5051         }
5052         if (rule == NULL || rule->rulenum > rulenum)
5053                 return 0; /* XXX error? */
5054
5055         bzero(&dmsg, sizeof(dmsg));
5056         nmsg = &dmsg.base;
5057         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5058             ipfw_alt_move_rule_dispatch);
5059         dmsg.start_rule = rule;
5060         dmsg.rulenum = rulenum;
5061         dmsg.to_set = set;
5062
5063         netisr_domsg_global(nmsg);
5064         KKASSERT(dmsg.start_rule == NULL);
5065         return 0;
5066 }
5067
5068 static void
5069 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5070 {
5071         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5072         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5073         struct ip_fw *rule;
5074
5075         ASSERT_NETISR_NCPUS(mycpuid);
5076
5077         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5078                 if (rule->set == dmsg->from_set)
5079                         rule->set = dmsg->to_set;
5080         }
5081         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5082 }
5083
5084 static int
5085 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5086 {
5087         struct netmsg_del dmsg;
5088         struct netmsg_base *nmsg;
5089
5090         ASSERT_NETISR0;
5091
5092         bzero(&dmsg, sizeof(dmsg));
5093         nmsg = &dmsg.base;
5094         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5095             ipfw_alt_move_ruleset_dispatch);
5096         dmsg.from_set = from_set;
5097         dmsg.to_set = to_set;
5098
5099         netisr_domsg_global(nmsg);
5100         return 0;
5101 }
5102
5103 static void
5104 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5105 {
5106         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5107         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5108         struct ip_fw *rule;
5109
5110         ASSERT_NETISR_NCPUS(mycpuid);
5111
5112         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5113                 if (rule->set == dmsg->from_set)
5114                         rule->set = dmsg->to_set;
5115                 else if (rule->set == dmsg->to_set)
5116                         rule->set = dmsg->from_set;
5117         }
5118         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5119 }
5120
5121 static int
5122 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5123 {
5124         struct netmsg_del dmsg;
5125         struct netmsg_base *nmsg;
5126
5127         ASSERT_NETISR0;
5128
5129         bzero(&dmsg, sizeof(dmsg));
5130         nmsg = &dmsg.base;
5131         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5132             ipfw_alt_swap_ruleset_dispatch);
5133         dmsg.from_set = set1;
5134         dmsg.to_set = set2;
5135
5136         netisr_domsg_global(nmsg);
5137         return 0;
5138 }
5139
5140 /*
5141  * Remove all rules with given number, and also do set manipulation.
5142  *
5143  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5144  * the next 8 bits are the new set, the top 8 bits are the command:
5145  *
5146  *      0       delete rules with given number
5147  *      1       delete rules with given set number
5148  *      2       move rules with given number to new set
5149  *      3       move rules with given set number to new set
5150  *      4       swap sets with given numbers
5151  */
5152 static int
5153 ipfw_ctl_alter(uint32_t arg)
5154 {
5155         uint16_t rulenum;
5156         uint8_t cmd, new_set;
5157         int error = 0;
5158
5159         ASSERT_NETISR0;
5160
5161         rulenum = arg & 0xffff;
5162         cmd = (arg >> 24) & 0xff;
5163         new_set = (arg >> 16) & 0xff;
5164
5165         if (cmd > 4)
5166                 return EINVAL;
5167         if (new_set >= IPFW_DEFAULT_SET)
5168                 return EINVAL;
5169         if (cmd == 0 || cmd == 2) {
5170                 if (rulenum == IPFW_DEFAULT_RULE)
5171                         return EINVAL;
5172         } else {
5173                 if (rulenum >= IPFW_DEFAULT_SET)
5174                         return EINVAL;
5175         }
5176
5177         switch (cmd) {
5178         case 0: /* delete rules with given number */
5179                 error = ipfw_alt_delete_rule(rulenum);
5180                 break;
5181
5182         case 1: /* delete all rules with given set number */
5183                 error = ipfw_alt_delete_ruleset(rulenum);
5184                 break;
5185
5186         case 2: /* move rules with given number to new set */
5187                 error = ipfw_alt_move_rule(rulenum, new_set);
5188                 break;
5189
5190         case 3: /* move rules with given set number to new set */
5191                 error = ipfw_alt_move_ruleset(rulenum, new_set);
5192                 break;
5193
5194         case 4: /* swap two sets */
5195                 error = ipfw_alt_swap_ruleset(rulenum, new_set);
5196                 break;
5197         }
5198         return error;
5199 }
5200
5201 /*
5202  * Clear counters for a specific rule.
5203  */
5204 static void
5205 clear_counters(struct ip_fw *rule, int log_only)
5206 {
5207         ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5208
5209         if (log_only == 0) {
5210                 rule->bcnt = rule->pcnt = 0;
5211                 rule->timestamp = 0;
5212         }
5213         if (l->o.opcode == O_LOG)
5214                 l->log_left = l->max_log;
5215 }
5216
/*
 * Per-CPU part of ipfw_ctl_zero_entry(): clear counters for either
 * all rules (rulenum == 0) or this CPU's copies of a specific rule
 * number, then forward the message to the next netisr CPU.
 */
static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	if (zmsg->rulenum == 0) {
		KKASSERT(zmsg->start_rule == NULL);

		/* Clear the whole chain, including the no-rule counter. */
		ctx->ipfw_norule_counter = 0;
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
			clear_counters(rule, zmsg->log_only);
	} else {
		struct ip_fw *start = zmsg->start_rule;

		KKASSERT(start->cpuid == mycpuid);
		KKASSERT(start->rulenum == zmsg->rulenum);

		/*
		 * We can have multiple rules with the same number, so we
		 * need to clear them all.
		 */
		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
		     rule = rule->next)
			clear_counters(rule, zmsg->log_only);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		zmsg->start_rule = start->sibling;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
5254
/*
 * Reset some or all counters on firewall rules.
 * @arg rulenum is 0 to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 *
 * Runs on CPU0; the work is fanned out to all netisr CPUs through
 * ipfw_zero_entry_dispatch().  Returns 0 on success, EINVAL when a
 * specific rule number does not exist.
 */
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
	struct netmsg_zent zmsg;
	struct netmsg_base *nmsg;
	const char *msg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	bzero(&zmsg, sizeof(zmsg));
	nmsg = &zmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_zero_entry_dispatch);
	zmsg.log_only = log_only;

	if (rulenum == 0) {
		msg = log_only ? "ipfw: All logging counts reset.\n"
			       : "ipfw: Accounting cleared.\n";
	} else {
		struct ip_fw *rule;

		/*
		 * Locate the first rule with 'rulenum'
		 */
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
			if (rule->rulenum == rulenum)
				break;
		}
		if (rule == NULL) /* we did not find any matching rules */
			return (EINVAL);
		zmsg.start_rule = rule;
		zmsg.rulenum = rulenum;

		/* These formats consume the trailing rulenum argument. */
		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
			       : "ipfw: Entry %d cleared.\n";
	}
	netisr_domsg_global(nmsg);
	KKASSERT(zmsg.start_rule == NULL);

	if (fw_verbose)
		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
	return (0);
}
5305
5306 /*
5307  * Check validity of the structure before insert.
5308  * Fortunately rules are simple, so this mostly need to check rule sizes.
5309  */
5310 static int
5311 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5312 {
5313         int l, cmdlen = 0;
5314         int have_action = 0;
5315         ipfw_insn *cmd;
5316
5317         *rule_flags = 0;
5318
5319         /* Check for valid size */
5320         if (size < sizeof(*rule)) {
5321                 kprintf("ipfw: rule too short\n");
5322                 return EINVAL;
5323         }
5324         l = IOC_RULESIZE(rule);
5325         if (l != size) {
5326                 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5327                 return EINVAL;
5328         }
5329
5330         /* Check rule number */
5331         if (rule->rulenum == IPFW_DEFAULT_RULE) {
5332                 kprintf("ipfw: invalid rule number\n");
5333                 return EINVAL;
5334         }
5335
5336         /*
5337          * Now go for the individual checks. Very simple ones, basically only
5338          * instruction sizes.
5339          */
5340         for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5341              l -= cmdlen, cmd += cmdlen) {
5342                 cmdlen = F_LEN(cmd);
5343                 if (cmdlen > l) {
5344                         kprintf("ipfw: opcode %d size truncated\n",
5345                                 cmd->opcode);
5346                         return EINVAL;
5347                 }
5348
5349                 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5350
5351                 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5352                     IPFW_ISXLAT(cmd->opcode)) {
5353                         /* This rule will generate states. */
5354                         *rule_flags |= IPFW_RULE_F_GENSTATE;
5355                         if (cmd->opcode == O_LIMIT)
5356                                 *rule_flags |= IPFW_RULE_F_GENTRACK;
5357                 }
5358                 if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5359                         *rule_flags |= IPFW_RULE_F_CROSSREF;
5360                 if (cmd->opcode == O_IP_SRC_IFIP ||
5361                     cmd->opcode == O_IP_DST_IFIP) {
5362                         *rule_flags |= IPFW_RULE_F_DYNIFADDR;
5363                         cmd->arg1 &= IPFW_IFIP_SETTINGS;
5364                 }
5365
5366                 switch (cmd->opcode) {
5367                 case O_NOP:
5368                 case O_PROBE_STATE:
5369                 case O_KEEP_STATE:
5370                 case O_PROTO:
5371                 case O_IP_SRC_ME:
5372                 case O_IP_DST_ME:
5373                 case O_LAYER2:
5374                 case O_IN:
5375                 case O_FRAG:
5376                 case O_IPFRAG:
5377                 case O_IPOPT:
5378                 case O_IPLEN:
5379                 case O_IPID:
5380                 case O_IPTOS:
5381                 case O_IPPRECEDENCE:
5382                 case O_IPTTL:
5383                 case O_IPVER:
5384                 case O_TCPWIN:
5385                 case O_TCPFLAGS:
5386                 case O_TCPOPTS:
5387                 case O_ESTAB:
5388                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5389                                 goto bad_size;
5390                         break;
5391
5392                 case O_IP_SRC_TABLE:
5393                 case O_IP_DST_TABLE:
5394                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5395                                 goto bad_size;
5396                         if (cmd->arg1 >= ipfw_table_max) {
5397                                 kprintf("ipfw: invalid table id %u, max %d\n",
5398                                     cmd->arg1, ipfw_table_max);
5399                                 return EINVAL;
5400                         }
5401                         break;
5402
5403                 case O_IP_SRC_IFIP:
5404                 case O_IP_DST_IFIP:
5405                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5406                                 goto bad_size;
5407                         break;
5408
5409                 case O_ICMPCODE:
5410                 case O_ICMPTYPE:
5411                         if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5412                                 goto bad_size;
5413                         break;
5414
5415                 case O_UID:
5416                 case O_GID:
5417                 case O_IP_SRC:
5418                 case O_IP_DST:
5419                 case O_TCPSEQ:
5420                 case O_TCPACK:
5421                 case O_PROB:
5422                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5423                                 goto bad_size;
5424                         break;
5425
5426                 case O_LIMIT:
5427                         if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5428                                 goto bad_size;
5429                         break;
5430                 case O_REDIRECT:
5431                         if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5432                                 goto bad_size;
5433                         break;
5434
5435                 case O_LOG:
5436                         if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5437                                 goto bad_size;
5438
5439                         ((ipfw_insn_log *)cmd)->log_left =
5440                             ((ipfw_insn_log *)cmd)->max_log;
5441
5442                         break;
5443
5444                 case O_IP_SRC_MASK:
5445                 case O_IP_DST_MASK:
5446                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5447                                 goto bad_size;
5448                         if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5449                                 kprintf("ipfw: opcode %d, useless rule\n",
5450                                         cmd->opcode);
5451                                 return EINVAL;
5452                         }
5453                         break;
5454
5455                 case O_IP_SRC_SET:
5456                 case O_IP_DST_SET:
5457                         if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5458                                 kprintf("ipfw: invalid set size %d\n",
5459                                         cmd->arg1);
5460                                 return EINVAL;
5461                         }
5462                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5463                             (cmd->arg1+31)/32 )
5464                                 goto bad_size;
5465                         break;
5466
5467                 case O_MACADDR2:
5468                         if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5469                                 goto bad_size;
5470                         break;
5471
5472                 case O_MAC_TYPE:
5473                 case O_IP_SRCPORT:
5474                 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5475                         if (cmdlen < 2 || cmdlen > 31)
5476                                 goto bad_size;
5477                         break;
5478
5479                 case O_RECV:
5480                 case O_XMIT:
5481                 case O_VIA:
5482                         if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5483                                 goto bad_size;
5484                         break;
5485
5486                 case O_PIPE:
5487                 case O_QUEUE:
5488                         if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5489                                 goto bad_size;
5490                         goto check_action;
5491
5492                 case O_FORWARD_IP:
5493                         if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5494                                 goto bad_size;
5495                         } else {
5496                                 in_addr_t fwd_addr;
5497
5498                                 fwd_addr = ((ipfw_insn_sa *)cmd)->
5499                                            sa.sin_addr.s_addr;
5500                                 if (IN_MULTICAST(ntohl(fwd_addr))) {
5501                                         kprintf("ipfw: try forwarding to "
5502                                                 "multicast address\n");
5503                                         return EINVAL;
5504                                 }
5505                         }
5506                         goto check_action;
5507
5508                 case O_FORWARD_MAC: /* XXX not implemented yet */
5509                 case O_CHECK_STATE:
5510                 case O_COUNT:
5511                 case O_ACCEPT:
5512                 case O_DENY:
5513                 case O_REJECT:
5514                 case O_SKIPTO:
5515                 case O_DIVERT:
5516                 case O_TEE:
5517                 case O_DEFRAG:
5518                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5519                                 goto bad_size;
5520 check_action:
5521                         if (have_action) {
5522                                 kprintf("ipfw: opcode %d, multiple actions"
5523                                         " not allowed\n",
5524                                         cmd->opcode);
5525                                 return EINVAL;
5526                         }
5527                         have_action = 1;
5528                         if (l != cmdlen) {
5529                                 kprintf("ipfw: opcode %d, action must be"
5530                                         " last opcode\n",
5531                                         cmd->opcode);
5532                                 return EINVAL;
5533                         }
5534                         break;
5535                 default:
5536                         kprintf("ipfw: opcode %d, unknown opcode\n",
5537                                 cmd->opcode);
5538                         return EINVAL;
5539                 }
5540         }
5541         if (have_action == 0) {
5542                 kprintf("ipfw: missing action\n");
5543                 return EINVAL;
5544         }
5545         return 0;
5546
5547 bad_size:
5548         kprintf("ipfw: opcode %d size %d wrong\n",
5549                 cmd->opcode, cmdlen);
5550         return EINVAL;
5551 }
5552
/*
 * Handler for the IP_FW_ADD sockopt: validate the rule blob supplied
 * by userland and install it.
 *
 * Must run in netisr0.  Returns 0 on success or an errno.
 */
static int
ipfw_ctl_add_rule(struct sockopt *sopt)
{
	struct ipfw_ioc_rule *ioc_rule;
	size_t size;
	uint32_t rule_flags;
	int error;

	ASSERT_NETISR0;

	size = sopt->sopt_valsize;
	/*
	 * Reject blobs larger than the maximum encoded rule size or too
	 * small to hold even the fixed ioc_rule header.
	 */
	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
	    size < sizeof(*ioc_rule)) {
		return EINVAL;
	}
	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
		/*
		 * Grow the buffer to the maximum rule size; sopt_val is
		 * reassigned so the sockopt layer frees the new buffer,
		 * not the stale one.
		 */
		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
	}
	ioc_rule = sopt->sopt_val;

	/*
	 * Validate against the original user-supplied size, not the
	 * (possibly larger) reallocated buffer size.
	 */
	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
	if (error)
		return error;

	ipfw_add_rule(ioc_rule, rule_flags);

	/* GET-style callers receive the installed rule back. */
	if (sopt->sopt_dir == SOPT_GET)
		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
	return 0;
}
5584
/*
 * Export one static rule, with statistics aggregated across all of its
 * per-cpu duplications, into *ioc_rule for userland consumption.
 *
 * Must run in netisr0 and be handed the cpu0 copy of the rule.
 * Returns a pointer just past the copied-out rule, i.e. where the next
 * rule should be written.
 */
static void *
ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
    struct ipfw_ioc_rule *ioc_rule)
{
	const struct ip_fw *sibling;
#ifdef INVARIANTS
	int i;
#endif

	ASSERT_NETISR0;
	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));

	ioc_rule->act_ofs = rule->act_ofs;
	ioc_rule->cmd_len = rule->cmd_len;
	ioc_rule->rulenum = rule->rulenum;
	ioc_rule->set = rule->set;
	ioc_rule->usr_flags = rule->usr_flags;

	/* Global snapshot data piggybacked on every exported rule. */
	ioc_rule->set_disable = ctx->ipfw_set_disable;
	ioc_rule->static_count = static_count;
	ioc_rule->static_len = static_ioc_len;

	/*
	 * Visit (read-only) all of the rule's duplications to get
	 * the necessary statistics
	 */
#ifdef INVARIANTS
	i = 0;
#endif
	ioc_rule->pcnt = 0;
	ioc_rule->bcnt = 0;
	ioc_rule->timestamp = 0;
	/* Sum packet/byte counters; keep the most recent timestamp. */
	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
		ioc_rule->pcnt += sibling->pcnt;
		ioc_rule->bcnt += sibling->bcnt;
		if (sibling->timestamp > ioc_rule->timestamp)
			ioc_rule->timestamp = sibling->timestamp;
#ifdef INVARIANTS
		++i;
#endif
	}
	/* KASSERT (and hence the use of i) compiles away w/o INVARIANTS. */
	KASSERT(i == netisr_ncpus,
	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));

	/* cmd_len is in 32-bit words, hence the *4. */
	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);

	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
}
5633
5634 static boolean_t
5635 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5636 {
5637         struct ipfw_ioc_flowid *ioc_id;
5638
5639         if (trk->tc_expire == 0) {
5640                 /* Not a scanned one. */
5641                 return (FALSE);
5642         }
5643
5644         ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5645             0 : trk->tc_expire - time_uptime;
5646         ioc_state->pcnt = 0;
5647         ioc_state->bcnt = 0;
5648
5649         ioc_state->dyn_type = O_LIMIT_PARENT;
5650         ioc_state->count = trk->tc_count;
5651
5652         ioc_state->rulenum = trk->tc_rulenum;
5653
5654         ioc_id = &ioc_state->id;
5655         ioc_id->type = ETHERTYPE_IP;
5656         ioc_id->u.ip.proto = trk->tc_proto;
5657         ioc_id->u.ip.src_ip = trk->tc_saddr;
5658         ioc_id->u.ip.dst_ip = trk->tc_daddr;
5659         ioc_id->u.ip.src_port = trk->tc_sport;
5660         ioc_id->u.ip.dst_port = trk->tc_dport;
5661
5662         return (TRUE);
5663 }
5664
5665 static boolean_t
5666 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5667 {
5668         struct ipfw_ioc_flowid *ioc_id;
5669
5670         if (IPFW_STATE_SCANSKIP(s))
5671                 return (FALSE);
5672
5673         ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5674             0 : s->st_expire - time_uptime;
5675         ioc_state->pcnt = s->st_pcnt;
5676         ioc_state->bcnt = s->st_bcnt;
5677
5678         ioc_state->dyn_type = s->st_type;
5679         ioc_state->count = 0;
5680
5681         ioc_state->rulenum = s->st_rule->rulenum;
5682
5683         ioc_id = &ioc_state->id;
5684         ioc_id->type = ETHERTYPE_IP;
5685         ioc_id->u.ip.proto = s->st_proto;
5686         ipfw_key_4tuple(&s->st_key,
5687             &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5688             &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5689
5690         if (IPFW_ISXLAT(s->st_type)) {
5691                 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5692
5693                 if (x->xlat_port == 0)
5694                         ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5695                 else
5696                         ioc_state->xlat_port = ntohs(x->xlat_port);
5697                 ioc_state->xlat_addr = ntohl(x->xlat_addr);
5698
5699                 ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5700                 ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5701         }
5702
5703         return (TRUE);
5704 }
5705
/*
 * Per-cpu netisr handler that copies this cpu's states into the
 * userland buffer carried by the netmsg_cpstate message.  The message
 * is forwarded from cpu to cpu; the last netisr additionally copies
 * the global track tree.  Replies early once the buffer fills up.
 */
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	const struct ipfw_state *s;
	const struct ipfw_track *t;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(nm->state_cnt < nm->state_cntmax,
	    ("invalid state count %d, max %d",
	     nm->state_cnt, nm->state_cntmax));

	/* Copy out this cpu's states; bail as soon as the buffer is full. */
	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
		if (ipfw_state_copy(s, nm->ioc_state)) {
			nm->ioc_state++;
			nm->state_cnt++;
			if (nm->state_cnt == nm->state_cntmax)
				goto done;
		}
	}

	/*
	 * Prepare tracks in the global track tree for userland.
	 */
	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
		struct ipfw_trkcnt *trk;

		if (t->t_count == NULL) /* anchor */
			continue;
		trk = t->t_trkcnt;

		/*
		 * Only one netisr can run this function at
		 * any time, and only this function accesses
		 * trkcnt's tc_expire, so this is safe w/o
		 * ipfw_gd.ipfw_trkcnt_token.
		 */
		if (trk->tc_expire > t->t_expire)
			continue;
		trk->tc_expire = t->t_expire;
	}

	/*
	 * Copy tracks in the global track tree to userland in
	 * the last netisr.
	 */
	if (mycpuid == netisr_ncpus - 1) {
		struct ipfw_trkcnt *trk;

		KASSERT(nm->state_cnt < nm->state_cntmax,
		    ("invalid state count %d, max %d",
		     nm->state_cnt, nm->state_cntmax));

		/* The trkcnt tree is shared; walk it under its token. */
		IPFW_TRKCNT_TOKGET;
		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
			if (ipfw_track_copy(trk, nm->ioc_state)) {
				nm->ioc_state++;
				nm->state_cnt++;
				if (nm->state_cnt == nm->state_cntmax) {
					/* Release before the early reply. */
					IPFW_TRKCNT_TOKREL;
					goto done;
				}
			}
		}
		IPFW_TRKCNT_TOKREL;
	}
done:
	if (nm->state_cnt == nm->state_cntmax) {
		/* No more space; done. */
		netisr_replymsg(&nm->base, 0);
	} else {
		/* Forward to the next cpu; the last cpu replies. */
		netisr_forwardmsg(&nm->base, mycpuid + 1);
	}
}
5781
/*
 * Handler for the IP_FW_GET sockopt.
 *
 * Copies all static rules followed by the current states/tracks into
 * the user-supplied buffer.  Must run in netisr0.  If the caller's
 * buffer is too small, it is zeroed and 0 is returned with nothing
 * copied (userland retries with a larger buffer).
 */
static int
ipfw_ctl_get_rules(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;
	void *bp;
	size_t size;
	int state_cnt;

	ASSERT_NETISR0;

	/*
	 * pass up a copy of the current rules. Static rules
	 * come first (the last of which has number IPFW_DEFAULT_RULE),
	 * followed by a possibly empty list of states.
	 */

	size = static_ioc_len;	/* size of static rules */

	/*
	 * Size of the states.
	 * XXX take tracks as state for userland compat.
	 */
	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
	size += state_cnt * sizeof(struct ipfw_ioc_state);

	if (sopt->sopt_valsize < size) {
		/* short length, no need to return incomplete rules */
		/* XXX: if superuser, no need to zero buffer */
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return 0;
	}
	bp = sopt->sopt_val;

	/* Static rules first; each copy advances bp past the rule. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		bp = ipfw_copy_rule(ctx, rule, bp);

	if (state_cnt) {
		struct netmsg_cpstate nm;
#ifdef INVARIANTS
		size_t old_size = size;
#endif

		/* Collect states from every netisr cpu in turn. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
		nm.ioc_state = bp;
		nm.state_cntmax = state_cnt;
		nm.state_cnt = 0;
		netisr_domsg_global(&nm.base);

		/*
		 * The # of states may be shrinked after the snapshot
		 * of the state count was taken.  To give user a correct
		 * state count, nm->state_cnt is used to recalculate
		 * the actual size.
		 */
		size = static_ioc_len +
		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
		/* KKASSERT (and old_size) compiles away w/o INVARIANTS. */
		KKASSERT(size <= old_size);
	}

	sopt->sopt_valsize = size;
	return 0;
}
5847
5848 static void
5849 ipfw_set_disable_dispatch(netmsg_t nmsg)
5850 {
5851         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5852
5853         ASSERT_NETISR_NCPUS(mycpuid);
5854
5855         ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5856         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5857 }
5858
5859 static void
5860 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5861 {
5862         struct netmsg_base nmsg;
5863         uint32_t set_disable;
5864
5865         ASSERT_NETISR0;
5866
5867         /* IPFW_DEFAULT_SET is always enabled */
5868         enable |= (1 << IPFW_DEFAULT_SET);
5869         set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5870
5871         bzero(&nmsg, sizeof(nmsg));
5872         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5873             ipfw_set_disable_dispatch);
5874         nmsg.lmsg.u.ms_result32 = set_disable;
5875
5876         netisr_domsg_global(&nmsg);
5877 }
5878
5879 static void
5880 ipfw_table_create_dispatch(netmsg_t nm)
5881 {
5882         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5883         int tblid = nm->lmsg.u.ms_result;
5884
5885         ASSERT_NETISR_NCPUS(mycpuid);
5886
5887         if (!rn_inithead(&ctx->ipfw_tables[tblid], rn_cpumaskhead(mycpuid),
5888                          offsetof(struct sockaddr_in, sin_addr)))
5889                 panic("ipfw: create table%d failed", tblid);
5890
5891         netisr_forwardmsg(&nm->base, mycpuid + 1);
5892 }
5893
5894 static int
5895 ipfw_table_create(struct sockopt *sopt)
5896 {
5897         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5898         struct ipfw_ioc_table *tbl;
5899         struct netmsg_base nm;
5900
5901         ASSERT_NETISR0;
5902
5903         if (sopt->sopt_valsize != sizeof(*tbl))
5904                 return (EINVAL);
5905
5906         tbl = sopt->sopt_val;
5907         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5908                 return (EINVAL);
5909
5910         if (ctx->ipfw_tables[tbl->tableid] != NULL)
5911                 return (EEXIST);
5912
5913         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5914             ipfw_table_create_dispatch);
5915         nm.lmsg.u.ms_result = tbl->tableid;
5916         netisr_domsg_global(&nm);
5917
5918         return (0);
5919 }
5920
5921 static void
5922 ipfw_table_killent(struct radix_node *rn)
5923 {
5924         struct ipfw_tblent *te;
5925
5926         te = (struct ipfw_tblent *)rn;
5927         kfree(te, M_IPFW);
5928 }
5929
5930 static void
5931 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5932     int destroy)
5933 {
5934         struct radix_node_head *rnh;
5935
5936         ASSERT_NETISR_NCPUS(mycpuid);
5937
5938         rnh = ctx->ipfw_tables[tableid];
5939         rn_flush(rnh, ipfw_table_killent);
5940         if (destroy) {
5941                 rn_freehead(rnh);
5942                 ctx->ipfw_tables[tableid] = NULL;
5943         }
5944 }
5945
5946 static void
5947 ipfw_table_flush_dispatch(netmsg_t nmsg)
5948 {
5949         struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5950         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5951
5952         ASSERT_NETISR_NCPUS(mycpuid);
5953
5954         ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5955         netisr_forwardmsg(&nm->base, mycpuid + 1);
5956 }
5957
5958 static void
5959 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5960 {
5961         int i;
5962
5963         ASSERT_NETISR_NCPUS(mycpuid);
5964
5965         for (i = 0; i < ipfw_table_max; ++i) {
5966                 if (ctx->ipfw_tables[i] != NULL)
5967                         ipfw_table_flush_oncpu(ctx, i, destroy);
5968         }
5969 }
5970
5971 static void
5972 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5973 {
5974         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5975
5976         ASSERT_NETISR_NCPUS(mycpuid);
5977
5978         ipfw_table_flushall_oncpu(ctx, 0);
5979         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5980 }
5981
5982 static int
5983 ipfw_table_flush(struct sockopt *sopt)
5984 {
5985         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5986         struct ipfw_ioc_table *tbl;
5987         struct netmsg_tblflush nm;
5988
5989         ASSERT_NETISR0;
5990
5991         if (sopt->sopt_valsize != sizeof(*tbl))
5992                 return (EINVAL);
5993
5994         tbl = sopt->sopt_val;
5995         if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5996                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5997                     MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5998                 netisr_domsg_global(&nm.base);
5999                 return (0);
6000         }
6001
6002         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6003                 return (EINVAL);
6004
6005         if (ctx->ipfw_tables[tbl->tableid] == NULL)
6006                 return (ENOENT);
6007
6008         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6009             ipfw_table_flush_dispatch);
6010         nm.tableid = tbl->tableid;
6011         nm.destroy = 0;
6012         if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6013                 nm.destroy = 1;
6014         netisr_domsg_global(&nm.base);
6015
6016         return (0);
6017 }
6018
6019 static int
6020 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6021 {
6022         int *cnt = xcnt;
6023
6024         (*cnt)++;
6025         return (0);
6026 }
6027
/*
 * rnh_walktree callback: export one table entry, with usage statistics
 * aggregated across all of its per-cpu siblings, into the next slot of
 * the ioc_tblent array described by the ipfw_table_cp cursor.
 */
static int
ipfw_table_cpent(struct radix_node *rn, void *xcp)
{
	struct ipfw_table_cp *cp = xcp;
	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
	struct ipfw_ioc_tblent *ioc_te;
#ifdef INVARIANTS
	int cnt;
#endif

	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
	    cp->te_idx, cp->te_cnt));
	ioc_te = &cp->te[cp->te_idx];

	/* rn_mask, when present, starts with its own length byte. */
	if (te->te_nodes->rn_mask != NULL) {
		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
		    *te->te_nodes->rn_mask);
	} else {
		ioc_te->netmask.sin_len = 0;
	}
	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));

	ioc_te->use = te->te_use;
	ioc_te->last_used = te->te_lastuse;
#ifdef INVARIANTS
	cnt = 1;
#endif

	/* Aggregate usage over this entry's duplications on other cpus. */
	while ((te = te->te_sibling) != NULL) {
#ifdef INVARIANTS
		++cnt;
#endif
		ioc_te->use += te->te_use;
		if (te->te_lastuse > ioc_te->last_used)
			ioc_te->last_used = te->te_lastuse;
	}
	/* KASSERT (and hence cnt) compiles away w/o INVARIANTS. */
	KASSERT(cnt == netisr_ncpus,
	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));

	cp->te_idx++;

	return (0);
}
6071
/*
 * IP_FW_TBL_GET sockopt handler.
 *
 * With a negative tableid, return the list of existing table ids;
 * otherwise return every entry of the named table.  In both cases a
 * too-small user buffer is zeroed and E2BIG returned.  Runs in
 * netisr0.
 */
static int
ipfw_table_get(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_ioc_table *tbl;
	struct ipfw_ioc_tblcont *cont;
	struct ipfw_table_cp cp;
	int cnt = 0, sz;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize < sizeof(*tbl))
		return (EINVAL);

	tbl = sopt->sopt_val;
	if (tbl->tableid < 0) {
		struct ipfw_ioc_tbllist *list;
		int i;

		/*
		 * List available table ids.
		 */
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL)
				++cnt;
		}

		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
		if (sopt->sopt_valsize < sz) {
			bzero(sopt->sopt_val, sopt->sopt_valsize);
			return (E2BIG);
		}
		list = sopt->sopt_val;
		list->tablecnt = cnt;

		cnt = 0;
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL) {
				KASSERT(cnt < list->tablecnt,
				    ("invalid idx %d, cnt %d",
				     cnt, list->tablecnt));
				list->tables[cnt++] = i;
			}
		}
		sopt->sopt_valsize = sz;
		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* First pass: count the entries to size the output. */
	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);

	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
	if (sopt->sopt_valsize < sz) {
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return (E2BIG);
	}
	cont = sopt->sopt_val;
	cont->entcnt = cnt;

	/* Second pass: copy the entries out. */
	cp.te = cont->ent;
	cp.te_idx = 0;
	cp.te_cnt = cnt;
	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);

	sopt->sopt_valsize = sz;
	return (0);
}
6144
/*
 * Per-cpu netisr handler that inserts one entry into this cpu's copy
 * of the table.  A collision on cpu0 is reported to userland as
 * EEXIST; on later cpus the same collision would leave the per-cpu
 * copies inconsistent, hence the panic.  The per-cpu entries are
 * chained via te_sibling so statistics can later be aggregated.
 */
static void
ipfw_table_add_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_tblent *te;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];

	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
	te->te_nodes->rn_key = (char *)&te->te_key;
	memcpy(&te->te_key, nm->key, sizeof(te->te_key));

	if (rnh->rnh_addaddr(&te->te_key, nm->netmask, rnh, te->te_nodes)
	    == NULL) {
		if (mycpuid == 0) {
			/* Duplicate entry; reply without forwarding. */
			kfree(te, M_IPFW);
			netisr_replymsg(&nm->base, EEXIST);
			return;
		}
		panic("rnh_addaddr failed");
	}

	/* Link siblings. */
	if (nm->sibling != NULL)
		nm->sibling->te_sibling = te;
	nm->sibling = te;

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6178
/*
 * Per-cpu netisr handler that removes one entry from this cpu's copy
 * of the table.  A missing entry on cpu0 is reported to userland as
 * ESRCH; on later cpus it would mean the per-cpu copies are
 * inconsistent, hence the panic.
 */
static void
ipfw_table_del_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct radix_node *rn;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];
	rn = rnh->rnh_deladdr(nm->key, nm->netmask, rnh);
	if (rn == NULL) {
		if (mycpuid == 0) {
			netisr_replymsg(&nm->base, ESRCH);
			return;
		}
		panic("rnh_deladdr failed");
	}
	/* The radix node is embedded at the head of the tblent. */
	kfree(rn, M_IPFW);

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6202
/*
 * IP_FW_TBL_ADD / IP_FW_TBL_DEL sockopt handler: validate the single
 * entry supplied by userland and add it to (or delete it from) the
 * table on all netisr cpus.  Runs in netisr0.
 */
static int
ipfw_table_alt(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_ioc_tblcont *tbl;
	struct ipfw_ioc_tblent *te;
	struct sockaddr_in key0;
	struct sockaddr *netmask = NULL, *key;
	struct netmsg_tblent nm;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;

	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
		return (EINVAL);
	/* Exactly one entry per request. */
	if (tbl->entcnt != 1)
		return (EINVAL);

	if (ctx->ipfw_tables[tbl->tableid] == NULL)
		return (ENOENT);
	te = &tbl->ent[0];

	if (te->key.sin_family != AF_INET ||
	    te->key.sin_port != 0 ||
	    te->key.sin_len != sizeof(struct sockaddr_in))
		return (EINVAL);
	key = (struct sockaddr *)&te->key;

	if (te->netmask.sin_len != 0) {
		if (te->netmask.sin_port != 0 ||
		    te->netmask.sin_len > sizeof(struct sockaddr_in))
			return (EINVAL);
		netmask = (struct sockaddr *)&te->netmask;
		/* Canonicalize the key by masking it before use. */
		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
		key = (struct sockaddr *)&key0;
	}

	if (sopt->sopt_name == IP_FW_TBL_ADD) {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_add_dispatch);
	} else {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_del_dispatch);
	}
	nm.key = key;
	nm.netmask = netmask;
	nm.tableid = tbl->tableid;
	nm.sibling = NULL;
	/* Runs the dispatch on each netisr cpu in turn. */
	return (netisr_domsg_global(&nm.base));
}
6256
6257 static int
6258 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6259 {
6260         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6261
6262         te->te_use = 0;
6263         te->te_lastuse = 0;
6264         return (0);
6265 }
6266
6267 static void
6268 ipfw_table_zero_dispatch(netmsg_t nmsg)
6269 {
6270         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6271         struct radix_node_head *rnh;
6272
6273         ASSERT_NETISR_NCPUS(mycpuid);
6274
6275         rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6276         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6277
6278         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6279 }
6280
6281 static void
6282 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6283 {
6284         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6285         int i;
6286
6287         ASSERT_NETISR_NCPUS(mycpuid);
6288
6289         for (i = 0; i < ipfw_table_max; ++i) {
6290                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6291
6292                 if (rnh != NULL)
6293                         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6294         }
6295         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6296 }
6297
/*
 * Sockopt handler (IP_FW_TBL_ZERO): zero the statistics of one table
 * (tbl->tableid >= 0) or of all tables (tbl->tableid < 0) on every
 * netisr cpu.  Runs on netisr0 and blocks until all cpus are done.
 *
 * Returns 0 on success, EINVAL on malformed request or out-of-range
 * table id, ENOENT if the requested table does not exist.
 */
6298 static int
6299 ipfw_table_zero(struct sockopt *sopt)
6300 {
6301         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6302         struct netmsg_base nm;
6303         struct ipfw_ioc_table *tbl;
6304
6305         ASSERT_NETISR0;
6306
6307         if (sopt->sopt_valsize != sizeof(*tbl))
6308                 return (EINVAL);
6309         tbl = sopt->sopt_val;
6310
6311         if (tbl->tableid < 0) {
                        /* Negative id selects all tables. */
6312                 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6313                     ipfw_table_zeroall_dispatch);
6314                 netisr_domsg_global(&nm);
6315                 return (0);
6316         } else if (tbl->tableid >= ipfw_table_max) {
6317                 return (EINVAL);
6318         } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6319                 return (ENOENT);
6320         }
6321
6322         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6323             ipfw_table_zero_dispatch);
                /* Pass the table id to the dispatch handlers. */
6324         nm.lmsg.u.ms_result = tbl->tableid;
6325         netisr_domsg_global(&nm);
6326
6327         return (0);
6328 }
6329
/*
 * rnh_walktree() callback: delete and free one table entry that was
 * previously marked expired by ipfw_table_markexp(), counting the
 * deletion in nm->expcnt.  Entries not marked expired are left alone.
 */
6330 static int
6331 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6332 {
6333         struct netmsg_tblexp *nm = xnm;
6334         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6335         struct radix_node *ret;
6336
6337         if (te->te_expired) {
6338                 ret = nm->rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, nm->rnh);
                        /* rnh_deladdr must return the node we asked to delete. */
6339                 if (ret != rn)
6340                         panic("deleted other table entry");
6341                 kfree(ret, M_IPFW);
6342                 nm->expcnt++;
6343         }
6344         return (0);
6345 }
6346
/*
 * Per-cpu netisr handler: delete the marked-expired entries of one
 * table (nm->tableid) on this cpu, then forward to the next cpu.
 * Each cpu deletes nm->cnt entries, so after this cpu runs the
 * cumulative expcnt must equal cnt * (mycpuid + 1).
 */
6347 static void
6348 ipfw_table_expire_dispatch(netmsg_t nmsg)
6349 {
6350         struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6351         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6352         struct radix_node_head *rnh;
6353
6354         ASSERT_NETISR_NCPUS(mycpuid);
6355
6356         rnh = ctx->ipfw_tables[nm->tableid];
                /* killexp needs the tree head for rnh_deladdr(). */
6357         nm->rnh = rnh;
6358         rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6359
6360         KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6361             ("not all expired addresses (%d) were deleted (%d)",
6362              nm->cnt * (mycpuid + 1), nm->expcnt));
6363
6364         netisr_forwardmsg(&nm->base, mycpuid + 1);
6365 }
6366
/*
 * Per-cpu netisr handler: delete marked-expired entries of ALL tables
 * on this cpu, then forward to the next cpu.  Same expcnt accounting
 * invariant as ipfw_table_expire_dispatch(), but summed over tables.
 */
6367 static void
6368 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6369 {
6370         struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6371         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6372         int i;
6373
6374         ASSERT_NETISR_NCPUS(mycpuid);
6375
6376         for (i = 0; i < ipfw_table_max; ++i) {
6377                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6378
6379                 if (rnh == NULL)
6380                         continue;
6381                 nm->rnh = rnh;
6382                 rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6383         }
6384
6385         KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6386             ("not all expired addresses (%d) were deleted (%d)",
6387              nm->cnt * (mycpuid + 1), nm->expcnt));
6388
6389         netisr_forwardmsg(&nm->base, mycpuid + 1);
6390 }
6391
/*
 * rnh_walktree() callback (phase 1 of expiry, runs on netisr0): mark
 * an entry chain expired if the most recent last-use among the entry
 * and all its per-cpu siblings is older than nm->expire seconds.
 * Increments nm->cnt once per marked chain.
 */
6392 static int
6393 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6394 {
6395         struct netmsg_tblexp *nm = xnm;
6396         struct ipfw_tblent *te;
6397         time_t lastuse;
6398
6399         te = (struct ipfw_tblent *)rn;
6400         lastuse = te->te_lastuse;
6401
                /* Find the newest last-use across all cpu siblings. */
6402         while ((te = te->te_sibling) != NULL) {
6403                 if (te->te_lastuse > lastuse)
6404                         lastuse = te->te_lastuse;
6405         }
6406         if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6407                 /* Not expired */
6408                 return (0);
6409         }
6410
                /* Mark the whole sibling chain so every cpu deletes it. */
6411         te = (struct ipfw_tblent *)rn;
6412         te->te_expired = 1;
6413         while ((te = te->te_sibling) != NULL)
6414                 te->te_expired = 1;
6415         nm->cnt++;
6416
6417         return (0);
6418 }
6419
/*
 * Sockopt handler (IP_FW_TBL_EXPIRE): two-phase expiry of table
 * entries not used for tbl->expire seconds.
 *
 * Phase 1 (netisr0): walk the cpu0 replica(s) and mark expired
 * entry chains (ipfw_table_markexp), counting them in nm.cnt.
 * Phase 2 (all cpus): delete the marked entries on every cpu via
 * netisr_domsg_global (ipfw_table_{expire,expireall}_dispatch).
 *
 * tbl->tableid < 0 expires all tables; the number of expired chains
 * is reported back to userland in tbl->expcnt.
 * Returns 0 on success, EINVAL on bad request/table id, ENOENT if
 * the table does not exist.
 */
6420 static int
6421 ipfw_table_expire(struct sockopt *sopt)
6422 {
6423         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6424         struct netmsg_tblexp nm;
6425         struct ipfw_ioc_tblexp *tbl;
6426         struct radix_node_head *rnh;
6427
6428         ASSERT_NETISR0;
6429
6430         if (sopt->sopt_valsize != sizeof(*tbl))
6431                 return (EINVAL);
6432         tbl = sopt->sopt_val;
6433         tbl->expcnt = 0;
6434
6435         nm.expcnt = 0;
6436         nm.cnt = 0;
6437         nm.expire = tbl->expire;
6438
6439         if (tbl->tableid < 0) {
6440                 int i;
6441
                        /* Phase 1: mark expired chains in every table. */
6442                 for (i = 0; i < ipfw_table_max; ++i) {
6443                         rnh = ctx->ipfw_tables[i];
6444                         if (rnh == NULL)
6445                                 continue;
6446                         rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6447                 }
6448                 if (nm.cnt == 0) {
6449                         /* No addresses can be expired. */
6450                         return (0);
6451                 }
6452                 tbl->expcnt = nm.cnt;
6453
                        /* Phase 2: delete marked entries on all cpus. */
6454                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6455                     MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6456                 nm.tableid = -1;
6457                 netisr_domsg_global(&nm.base);
6458                 KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6459                     ("not all expired addresses (%d) were deleted (%d)",
6460                      nm.cnt * netisr_ncpus, nm.expcnt));
6461
6462                 return (0);
6463         } else if (tbl->tableid >= ipfw_table_max) {
6464                 return (EINVAL);
6465         }
6466
6467         rnh = ctx->ipfw_tables[tbl->tableid];
6468         if (rnh == NULL)
6469                 return (ENOENT);
                /* Phase 1: mark expired chains in the single table. */
6470         rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6471         if (nm.cnt == 0) {
6472                 /* No addresses can be expired. */
6473                 return (0);
6474         }
6475         tbl->expcnt = nm.cnt;
6476
                /* Phase 2: delete marked entries on all cpus. */
6477         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6478             ipfw_table_expire_dispatch);
6479         nm.tableid = tbl->tableid;
6480         netisr_domsg_global(&nm.base);
6481         KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6482             ("not all expired addresses (%d) were deleted (%d)",
6483              nm.cnt * netisr_ncpus, nm.expcnt));
6484         return (0);
6485 }
6486
/*
 * Netisr handler: free one cross-referenced sibling rule on its owning
 * cpu.  The rule pointer is carried in lmsg.u.ms_resultp; it must be
 * both CROSSREF and INVALID (i.e. already unlinked) before the free.
 */
6487 static void
6488 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6489 {
6490         struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6491
6492         KKASSERT((rule->rule_flags &
6493             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6494             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6495         ipfw_free_rule(rule);
6496
6497         netisr_replymsg(&nmsg->base, 0);
6498 }
6499
/*
 * Reap cross-referenced rules whose inflight reference counts have
 * dropped to zero on every cpu.  For each reapable rule: unlink it
 * from the ipfw_crossref_free list, free its cpu1..N siblings on
 * their owning cpus (synchronously via netisr_domsg), then drop the
 * cpu0 reference.  If anything is still inflight, re-arm the reap
 * callout to try again in one second.  Runs on netisr0.
 */
6500 static void
6501 ipfw_crossref_reap(void)
6502 {
6503         struct ip_fw *rule, *prev = NULL;
6504
6505         ASSERT_NETISR0;
6506
6507         rule = ipfw_gd.ipfw_crossref_free;
6508         while (rule != NULL) {
6509                 uint64_t inflight = 0;
6510                 int i;
6511
                        /* Sum the inflight refs of all per-cpu siblings. */
6512                 for (i = 0; i < netisr_ncpus; ++i)
6513                         inflight += rule->cross_rules[i]->cross_refs;
6514                 if (inflight == 0) {
6515                         struct ip_fw *f = rule;
6516
6517                         /*
6518                          * Unlink.
6519                          */
6520                         rule = rule->next;
6521                         if (prev != NULL)
6522                                 prev->next = rule;
6523                         else
6524                                 ipfw_gd.ipfw_crossref_free = rule;
6525
6526                         /*
6527                          * Free.
6528                          */
6529                         for (i = 1; i < netisr_ncpus; ++i) {
6530                                 struct netmsg_base nm;
6531
6532                                 netmsg_init(&nm, NULL, &curthread->td_msgport,
6533                                     MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6534                                 nm.lmsg.u.ms_resultp = f->cross_rules[i];
6535                                 netisr_domsg(&nm, i);
6536                         }
6537                         KKASSERT((f->rule_flags &
6538                             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6539                             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6540                         ipfw_unref_rule(f);
6541                 } else {
                                /* Still referenced somewhere; keep it queued. */
6542                         prev = rule;
6543                         rule = rule->next;
6544                 }
6545         }
6546
                /* Retry later if some rules could not be reaped yet. */
6547         if (ipfw_gd.ipfw_crossref_free != NULL) {
6548                 callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6549                     ipfw_crossref_timeo, NULL);
6550         }
6551 }
6552
6553 /*
6554  * {set|get}sockopt parser.
6555  */
/*
 * Dispatch a {set,get}sockopt request to the matching ipfw handler.
 * Runs on netisr0.  Returns 0 or an errno; unknown option names are
 * logged and rejected with EINVAL.  Cross-referenced rules are reaped
 * opportunistically on every control operation.
 */
6556 static int
6557 ipfw_ctl(struct sockopt *sopt)
6558 {
6559         int error, rulenum;
6560         uint32_t *masks;
6561         size_t size;
6562
6563         ASSERT_NETISR0;
6564
6565         error = 0;
6566
6567         switch (sopt->sopt_name) {
6568         case IP_FW_GET:
6569                 error = ipfw_ctl_get_rules(sopt);
6570                 break;
6571
6572         case IP_FW_FLUSH:
6573                 ipfw_flush(0 /* keep default rule */);
6574                 break;
6575
6576         case IP_FW_ADD:
6577                 error = ipfw_ctl_add_rule(sopt);
6578                 break;
6579
6580         case IP_FW_DEL:
6581                 /*
6582                  * IP_FW_DEL is used for deleting single rules or sets,
6583                  * and (ab)used to atomically manipulate sets.
6584                  * Argument size is used to distinguish between the two:
6585                  *    sizeof(uint32_t)
6586                  *      delete single rule or set of rules,
6587                  *      or reassign rules (or sets) to a different set.
6588                  *    2 * sizeof(uint32_t)
6589                  *      atomic disable/enable sets.
6590                  *      first uint32_t contains sets to be disabled,
6591                  *      second uint32_t contains sets to be enabled.
6592                  */
6593                 masks = sopt->sopt_val;
6594                 size = sopt->sopt_valsize;
6595                 if (size == sizeof(*masks)) {
6596                         /*
6597                          * Delete or reassign static rule
6598                          */
6599                         error = ipfw_ctl_alter(masks[0]);
6600                 } else if (size == (2 * sizeof(*masks))) {
6601                         /*
6602                          * Set enable/disable
6603                          */
6604                         ipfw_ctl_set_disable(masks[0], masks[1]);
6605                 } else {
6606                         error = EINVAL;
6607                 }
6608                 break;
6609
6610         case IP_FW_ZERO:
6611         case IP_FW_RESETLOG: /* argument is an int, the rule number */
6612                 rulenum = 0;
6613
                        /* Optional argument; rulenum 0 means "all rules". */
6614                 if (sopt->sopt_val != 0) {
6615                     error = soopt_to_kbuf(sopt, &rulenum,
6616                             sizeof(int), sizeof(int));
6617                     if (error)
6618                         break;
6619                 }
6620                 error = ipfw_ctl_zero_entry(rulenum,
6621                         sopt->sopt_name == IP_FW_RESETLOG);
6622                 break;
6623
6624         case IP_FW_TBL_CREATE:
6625                 error = ipfw_table_create(sopt);
6626                 break;
6627
6628         case IP_FW_TBL_ADD:
6629         case IP_FW_TBL_DEL:
6630                 error = ipfw_table_alt(sopt);
6631                 break;
6632
6633         case IP_FW_TBL_FLUSH:
6634         case IP_FW_TBL_DESTROY:
6635                 error = ipfw_table_flush(sopt);
6636                 break;
6637
6638         case IP_FW_TBL_GET:
6639                 error = ipfw_table_get(sopt);
6640                 break;
6641
6642         case IP_FW_TBL_ZERO:
6643                 error = ipfw_table_zero(sopt);
6644                 break;
6645
6646         case IP_FW_TBL_EXPIRE:
6647                 error = ipfw_table_expire(sopt);
6648                 break;
6649
6650         default:
6651                 kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6652                 error = EINVAL;
6653         }
6654
                /* Piggy-back rule reaping on control traffic. */
6655         ipfw_crossref_reap();
6656         return error;
6657 }
6658
/*
 * Finish the current keepalive round on this cpu: clear the
 * in-progress flag and re-arm the per-cpu keepalive callout for the
 * next round in dyn_keepalive_period seconds.
 */
6659 static void
6660 ipfw_keepalive_done(struct ipfw_context *ctx)
6661 {
6662
6663         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6664             ("keepalive is not in progress"));
6665         ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6666         callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6667             ipfw_keepalive, NULL);
6668 }
6669
/*
 * Schedule continuation of the current keepalive round on this cpu by
 * sending the per-cpu "more" netmsg to ourselves; the scan resumes at
 * the list anchor in ipfw_keepalive_more_dispatch().
 */
6670 static void
6671 ipfw_keepalive_more(struct ipfw_context *ctx)
6672 {
6673         struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6674
6675         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6676             ("keepalive is not in progress"));
6677         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6678             ("keepalive more did not finish"));
6679         netisr_sendmsg_oncpu(nm);
6680 }
6681
/*
 * Walk the per-cpu state list starting after 'anchor', reaping dead
 * states and sending TCP keepalive probes for established states that
 * are close to expiring.  The anchor is moved forward as the walk
 * progresses; when any of the per-round scan/expire/keepalive budgets
 * is exhausted, the walk yields via ipfw_keepalive_more() and resumes
 * later from the anchor position.  On full completion the anchor is
 * removed and the round is closed with ipfw_keepalive_done().
 */
6682 static void
6683 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6684 {
6685         struct ipfw_state *s;
6686         int scanned = 0, expired = 0, kept = 0;
6687
6688         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6689             ("keepalive is not in progress"));
6690
6691         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6692                 uint32_t ack_rev, ack_fwd;
6693                 struct ipfw_flow_id id;
6694                 uint8_t send_dir;
6695
                        /* Yield if the per-round scan budget is used up. */
6696                 if (scanned++ >= ipfw_state_scan_max) {
6697                         ipfw_keepalive_more(ctx);
6698                         return;
6699                 }
6700
                        /* Advance the anchor past the state being examined. */
6701                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6702                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6703
6704                 /*
6705                  * NOTE:
6706                  * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6707                  * on slave xlat.
6708                  */
6709                 if (s->st_type == O_ANCHOR)
6710                         continue;
6711
6712                 if (IPFW_STATE_ISDEAD(s)) {
6713                         ipfw_state_remove(ctx, s);
6714                         if (++expired >= ipfw_state_expire_max) {
6715                                 ipfw_keepalive_more(ctx);
6716                                 return;
6717                         }
6718                         continue;
6719                 }
6720
6721                 /*
6722                  * Keep alive processing
6723                  */
6724
6725                 if (s->st_proto != IPPROTO_TCP)
6726                         continue;
6727                 if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6728                         continue;
6729                 if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6730                     s->st_expire))
6731                         continue;       /* too early */
6732
6733                 ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6734                     &id.dst_ip, &id.dst_port);
6735                 ack_rev = s->st_ack_rev;
6736                 ack_fwd = s->st_ack_fwd;
6737
6738 #define SEND_FWD        0x1
6739 #define SEND_REV        0x2
6740
                        /* xlat states probe only their own direction. */
6741                 if (IPFW_ISXLAT(s->st_type)) {
6742                         const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6743
6744                         if (x->xlat_dir == MATCH_FORWARD)
6745                                 send_dir = SEND_FWD;
6746                         else
6747                                 send_dir = SEND_REV;
6748                 } else {
6749                         send_dir = SEND_FWD | SEND_REV;
6750                 }
6751
6752                 if (send_dir & SEND_REV)
6753                         send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6754                 if (send_dir & SEND_FWD)
6755                         send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6756
6757 #undef SEND_FWD
6758 #undef SEND_REV
6759
6760                 if (++kept >= ipfw_keepalive_max) {
6761                         ipfw_keepalive_more(ctx);
6762                         return;
6763                 }
6764         }
6765         TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6766         ipfw_keepalive_done(ctx);
6767 }
6768
/*
 * Netisr handler: resume an in-progress keepalive round on this cpu
 * from the anchor position left by ipfw_keepalive_loop().  If keepalive
 * was disabled or there are no states left, the round is closed instead.
 */
6769 static void
6770 ipfw_keepalive_more_dispatch(netmsg_t nm)
6771 {
6772         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6773         struct ipfw_state *anchor;
6774
6775         ASSERT_NETISR_NCPUS(mycpuid);
6776         KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6777             ("keepalive is not in progress"));
6778
6779         /* Reply ASAP */
6780         netisr_replymsg(&nm->base, 0);
6781
6782         anchor = &ctx->ipfw_keepalive_anch;
6783         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6784                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6785                 ipfw_keepalive_done(ctx);
6786                 return;
6787         }
6788         ipfw_keepalive_loop(ctx, anchor);
6789 }
6790
6791 /*
6792  * Netisr handler that starts one round of keepalive processing on
6793  * this cpu; scheduled every dyn_keepalive_period by ipfw_keepalive().
6794  */
6795 static void
6796 ipfw_keepalive_dispatch(netmsg_t nm)
6797 {
6798         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6799         struct ipfw_state *anchor;
6800
6801         ASSERT_NETISR_NCPUS(mycpuid);
6802         KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6803             ("keepalive is in progress"));
                /* Mark the round in progress before releasing the message. */
6804         ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6805
6806         /* Reply ASAP */
6807         crit_enter();
6808         netisr_replymsg(&nm->base, 0);
6809         crit_exit();
6810
6811         if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6812                 ipfw_keepalive_done(ctx);
6813                 return;
6814         }
6815
                /* Start scanning from the head of the state list. */
6816         anchor = &ctx->ipfw_keepalive_anch;
6817         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6818         ipfw_keepalive_loop(ctx, anchor);
6819 }
6820
6821 /*
6822  * This procedure is only used to handle keepalives. It is invoked
6823  * every dyn_keepalive_period
6824  */
6825 static void
6826 ipfw_keepalive(void *dummy __unused)
6827 {
6828         struct netmsg_base *msg;
6829
6830         KKASSERT(mycpuid < netisr_ncpus);
6831         msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6832
                /* Only (re)send the dropable netmsg if it is not in flight. */
6833         crit_enter();
6834         if (msg->lmsg.ms_flags & MSGF_DONE)
6835                 netisr_sendmsg_oncpu(msg);
6836         crit_exit();
6837 }
6838
/*
 * Netisr handler: continue filtering a redispatched (e.g. defragmented)
 * packet on the cpu owning the continuation rule.  The rule is staged
 * in ctx->ipfw_cont_rule for ip_input() -> ipfw_chk() to pick up, and
 * the inflight reference taken by ipfw_defrag_redispatch() is dropped
 * when done.
 */
6839 static void
6840 ipfw_ip_input_dispatch(netmsg_t nmsg)
6841 {
6842         struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6843         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6844         struct mbuf *m = nm->m;
6845         struct ip_fw *rule = nm->arg1;
6846
6847         ASSERT_NETISR_NCPUS(mycpuid);
6848         KASSERT(rule->cpuid == mycpuid,
6849             ("rule does not belong to cpu%d", mycpuid));
6850         KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6851             ("mbuf does not have ipfw continue rule"));
6852
6853         KASSERT(ctx->ipfw_cont_rule == NULL,
6854             ("pending ipfw continue rule"));
6855         ctx->ipfw_cont_rule = rule;
6856         ip_input(m);
6857
6858         /* May not be cleared, if ipfw was unload/disabled. */
6859         ctx->ipfw_cont_rule = NULL;
6860
6861         /*
6862          * This rule is no longer used; decrement its cross_refs,
6863          * so this rule can be deleted.
6864          */
6865         rule->cross_refs--;
6866 }
6867
/*
 * Redispatch a reassembled packet to 'cpuid' so filtering continues
 * from this rule's sibling on that cpu.  The mbuf carries the netmsg
 * in its header; processing resumes in ipfw_ip_input_dispatch().
 */
6868 static void
6869 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6870 {
6871         struct netmsg_genpkt *nm;
6872
6873         KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6874
6875         /*
6876          * NOTE:
6877          * Bump cross_refs to prevent this rule and its siblings
6878          * from being deleted, while this mbuf is inflight.  The
6879          * cross_refs of the sibling rule on the target cpu will
6880          * be decremented, once this mbuf is going to be filtered
6881          * on the target cpu.
6882          */
6883         rule->cross_refs++;
6884         m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6885
6886         nm = &m->m_hdr.mh_genmsg;
6887         netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6888             ipfw_ip_input_dispatch);
6889         nm->m = m;
                /* Hand the target cpu its own sibling of this rule. */
6890         nm->arg1 = rule->cross_rules[cpuid];
6891         netisr_sendmsg(&nm->base, cpuid);
6892 }
6893
/*
 * Initialize the ipfw_chk() argument block for a packet entering the
 * firewall.  Reinjected packets resume at a specific rule: dummynet
 * packets carry it in their m_tag; redispatched "continue" packets
 * pick it (and any pending xlat) up from the per-cpu context, where
 * ipfw_ip_input_dispatch() staged them.  'oif' is NULL on input.
 */
6894 static void
6895 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6896 {
6897
6898         args->flags = 0;
6899         args->rule = NULL;
6900         args->xlat = NULL;
6901
6902         if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6903                 struct m_tag *mtag;
6904
6905                 /* Extract info from dummynet tag */
6906                 mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6907                 KKASSERT(mtag != NULL);
6908                 args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6909                 KKASSERT(args->rule != NULL);
6910
6911                 m_tag_delete(m, mtag);
6912                 m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6913         } else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6914                 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6915
                        /* Consume the continuation staged by the dispatcher. */
6916                 KKASSERT(ctx->ipfw_cont_rule != NULL);
6917                 args->rule = ctx->ipfw_cont_rule;
6918                 ctx->ipfw_cont_rule = NULL;
6919
6920                 if (ctx->ipfw_cont_xlat != NULL) {
6921                         args->xlat = ctx->ipfw_cont_xlat;
6922                         ctx->ipfw_cont_xlat = NULL;
6923                         if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6924                                 args->flags |= IP_FWARG_F_XLATINS;
6925                                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6926                         }
6927                         if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6928                                 args->flags |= IP_FWARG_F_XLATFWD;
6929                                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6930                         }
6931                 }
6932                 KKASSERT((m->m_pkthdr.fw_flags &
6933                     (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6934
6935                 args->flags |= IP_FWARG_F_CONT;
6936                 m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6937         }
6938
6939         args->eh = NULL;
6940         args->oif = oif;
6941         args->m = m;
6942 }
6943
/*
 * pfil(9) input hook: run an inbound packet through ipfw_chk() and act
 * on the verdict (pass, deny, dummynet pipe, tee/divert).  On return
 * *m0 is the surviving mbuf or NULL if it was consumed/dropped; EACCES
 * is returned for denied packets.  A NULL mbuf with IP_FW_REDISPATCH
 * means the packet was handed to another cpu, not dropped.
 */
6944 static int
6945 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6946 {
6947         struct ip_fw_args args;
6948         struct mbuf *m = *m0;
6949         int tee = 0, error = 0, ret;
6950
6951         ipfw_init_args(&args, m, NULL);
6952
6953         ret = ipfw_chk(&args);
6954         m = args.m;
6955         if (m == NULL) {
6956                 if (ret != IP_FW_REDISPATCH)
6957                         error = EACCES;
6958                 goto back;
6959         }
6960
6961         switch (ret) {
6962         case IP_FW_PASS:
6963                 break;
6964
6965         case IP_FW_DENY:
6966                 m_freem(m);
6967                 m = NULL;
6968                 error = EACCES;
6969                 break;
6970
6971         case IP_FW_DUMMYNET:
6972                 /* Send packet to the appropriate pipe */
6973                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6974                 break;
6975
6976         case IP_FW_TEE:
6977                 tee = 1;
6978                 /* FALL THROUGH */
6979
6980         case IP_FW_DIVERT:
6981                 /*
6982                  * Must clear bridge tag when changing
6983                  */
6984                 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6985                 if (ip_divert_p != NULL) {
6986                         m = ip_divert_p(m, tee, 1);
6987                 } else {
                                /* No divert(4) support loaded; drop. */
6988                         m_freem(m);
6989                         m = NULL;
6990                         /* not sure this is the right error msg */
6991                         error = EACCES;
6992                 }
6993                 break;
6994
6995         default:
6996                 panic("unknown ipfw return value: %d", ret);
6997         }
6998 back:
6999         *m0 = m;
7000         return error;
7001 }
7002
/*
 * pfil(9) output hook: same verdict handling as ipfw_check_in(), but
 * for outbound packets ('ifp' is the output interface and dummynet/
 * divert are entered in the output direction).
 */
7003 static int
7004 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7005 {
7006         struct ip_fw_args args;
7007         struct mbuf *m = *m0;
7008         int tee = 0, error = 0, ret;
7009
7010         ipfw_init_args(&args, m, ifp);
7011
7012         ret = ipfw_chk(&args);
7013         m = args.m;
7014         if (m == NULL) {
                        /* NULL + REDISPATCH: consumed by another cpu, not dropped. */
7015                 if (ret != IP_FW_REDISPATCH)
7016                         error = EACCES;
7017                 goto back;
7018         }
7019
7020         switch (ret) {
7021         case IP_FW_PASS:
7022                 break;
7023
7024         case IP_FW_DENY:
7025                 m_freem(m);
7026                 m = NULL;
7027                 error = EACCES;
7028                 break;
7029
7030         case IP_FW_DUMMYNET:
7031                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7032                 break;
7033
7034         case IP_FW_TEE:
7035                 tee = 1;
7036                 /* FALL THROUGH */
7037
7038         case IP_FW_DIVERT:
7039                 if (ip_divert_p != NULL) {
7040                         m = ip_divert_p(m, tee, 0);
7041                 } else {
7042                         m_freem(m);
7043                         m = NULL;
7044                         /* not sure this is the right error msg */
7045                         error = EACCES;
7046                 }
7047                 break;
7048
7049         default:
7050                 panic("unknown ipfw return value: %d", ret);
7051         }
7052 back:
7053         *m0 = m;
7054         return error;
7055 }
7056
/*
 * Install the ipfw input/output hooks on the AF_INET pfil head.
 * Runs on netisr0; silently a no-op if the pfil head is missing.
 */
7057 static void
7058 ipfw_hook(void)
7059 {
7060         struct pfil_head *pfh;
7061
7062         ASSERT_NETISR0;
7063
7064         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7065         if (pfh == NULL)
7066                 return;
7067
7068         pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7069         pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7070 }
7071
/*
 * Remove the ipfw input/output hooks from the AF_INET pfil head;
 * the inverse of ipfw_hook().  Runs on netisr0.
 */
7072 static void
7073 ipfw_dehook(void)
7074 {
7075         struct pfil_head *pfh;
7076
7077         ASSERT_NETISR0;
7078
7079         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7080         if (pfh == NULL)
7081                 return;
7082
7083         pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7084         pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7085 }
7086
7087 static int
7088 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7089 {
7090         int dyn_cnt;
7091
7092         dyn_cnt = ipfw_state_cntcoll();
7093         dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7094
7095         return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7096 }
7097
7098 static int
7099 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7100 {
7101         int state_cnt;
7102
7103         state_cnt = ipfw_state_cntcoll();
7104         return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7105 }
7106
7107 static int
7108 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7109 {
7110         int state_max, error;
7111
7112         state_max = ipfw_state_max;
7113         error = sysctl_handle_int(oidp, &state_max, 0, req);
7114         if (error || req->newptr == NULL)
7115                 return (error);
7116
7117         if (state_max < 1)
7118                 return (EINVAL);
7119
7120         ipfw_state_max_set(state_max);
7121         return (0);
7122 }
7123
7124 static int
7125 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7126 {
7127         int dyn_max, error;
7128
7129         dyn_max = ipfw_state_max + ipfw_track_max;
7130
7131         error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7132         if (error || req->newptr == NULL)
7133                 return (error);
7134
7135         if (dyn_max < 2)
7136                 return (EINVAL);
7137
7138         ipfw_state_max_set(dyn_max / 2);
7139         ipfw_track_max = dyn_max / 2;
7140         return (0);
7141 }
7142
/*
 * Netisr0 handler: apply a change of the global enable flag by
 * (de)installing the pfil hooks.  No-op if the flag is unchanged.
 */
7143 static void
7144 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7145 {
7146         int enable = nmsg->lmsg.u.ms_result;
7147
7148         ASSERT_NETISR0;
7149
7150         if (fw_enable == enable)
7151                 goto reply;
7152
7153         fw_enable = enable;
7154         if (fw_enable)
7155                 ipfw_hook();
7156         else
7157                 ipfw_dehook();
7158 reply:
7159         netisr_replymsg(&nmsg->base, 0);
7160 }
7161
/*
 * sysctl handler: read or set the global firewall enable flag.  The
 * actual hook/dehook is performed on netisr0 via a synchronous netmsg
 * so it is serialized with the rest of the control path.
 */
7162 static int
7163 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7164 {
7165         struct netmsg_base nmsg;
7166         int enable, error;
7167
7168         enable = fw_enable;
7169         error = sysctl_handle_int(oidp, &enable, 0, req);
7170         if (error || req->newptr == NULL)
7171                 return error;
7172
7173         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7174             ipfw_sysctl_enable_dispatch);
7175         nmsg.lmsg.u.ms_result = enable;
7176
7177         return netisr_domsg(&nmsg, 0);
7178 }
7179
/*
 * sysctl handler: range-checked access to the rule number auto-
 * increment step (clamped to [IPFW_AUTOINC_STEP_MIN, _MAX]).
 */
7180 static int
7181 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7182 {
7183         return sysctl_int_range(oidp, arg1, arg2, req,
7184                IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7185 }
7186
/*
 * sysctl handler: generic positive-integer knob used by the various
 * scan/expire/keepalive budget variables (must be >= 1).
 */
7187 static int
7188 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7189 {
7190
7191         return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7192 }
7193
7194 static int
7195 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7196 {
7197         u_long stat = 0;
7198         int cpu, error;
7199
7200         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7201                 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7202
7203         error = sysctl_handle_long(oidp, &stat, 0, req);
7204         if (error || req->newptr == NULL)
7205                 return (error);
7206
7207         /* Zero out this stat. */
7208         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7209                 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7210         return (0);
7211 }
7212
/*
 * Per-cpu netisr handler run at attach time: allocate and initialize
 * this cpu's ipfw context (state/track trees and lists, the expire/
 * keepalive/xlat-reap callouts and their dropable netmsgs), install
 * the default rule, link it to the siblings created on the previous
 * cpus, and forward the init message to the next cpu.
 */
7213 static void
7214 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7215 {
7216         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7217         struct ipfw_context *ctx;
7218         struct ip_fw *def_rule;
7219
7220         ASSERT_NETISR_NCPUS(mycpuid);
7221
                /* Context is sized for ipfw_table_max table pointers. */
7222         ctx = kmalloc(__offsetof(struct ipfw_context,
7223             ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7224
7225         RB_INIT(&ctx->ipfw_state_tree);
7226         TAILQ_INIT(&ctx->ipfw_state_list);
7227
7228         RB_INIT(&ctx->ipfw_track_tree);
7229         TAILQ_INIT(&ctx->ipfw_track_list);
7230
7231         callout_init_mp(&ctx->ipfw_stateto_ch);
7232         netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7233             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7234         ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7235         netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7236             MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7237
7238         callout_init_mp(&ctx->ipfw_trackto_ch);
7239         netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7240             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7241         netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7242             MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7243
7244         callout_init_mp(&ctx->ipfw_keepalive_ch);
7245         netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7246             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7247         ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7248         netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7249             MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7250
7251         callout_init_mp(&ctx->ipfw_xlatreap_ch);
7252         netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7253             MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7254         TAILQ_INIT(&ctx->ipfw_xlatreap);
7255
7256         ipfw_ctx[mycpuid] = ctx;
7257
7258         def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7259
7260         def_rule->act_ofs = 0;
7261         def_rule->rulenum = IPFW_DEFAULT_RULE;
7262         def_rule->cmd_len = 1;
7263         def_rule->set = IPFW_DEFAULT_SET;
7264
7265         def_rule->cmd[0].len = 1;
7266 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7267         def_rule->cmd[0].opcode = O_ACCEPT;
7268 #else
7269         if (filters_default_to_accept)
7270                 def_rule->cmd[0].opcode = O_ACCEPT;
7271         else
7272                 def_rule->cmd[0].opcode = O_DENY;
7273 #endif
7274
7275         def_rule->refcnt = 1;
7276         def_rule->cpuid = mycpuid;
7277
7278         /* Install the default rule */
7279         ctx->ipfw_default_rule = def_rule;
7280         ctx->ipfw_layer3_chain = def_rule;
7281
7282         /* Link rule CPU sibling */
7283         ipfw_link_sibling(fwmsg, def_rule);
7284
7285         /* Statistics only need to be updated once */
7286         if (mycpuid == 0)
7287                 ipfw_inc_static_count(def_rule);
7288
7289         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7290 }
7291
/*
 * Netisr handler that reclaims rules freed via cross-references.
 * The message is replied immediately (inside a critical section, so
 * the dropable message can be resent by ipfw_crossref_timeo() while
 * the potentially long reap runs outside of it).
 */
static void
ipfw_crossref_reap_dispatch(netmsg_t nmsg)
{

	crit_enter();
	/* Reply ASAP */
	netisr_replymsg(&nmsg->base, 0);
	crit_exit();
	ipfw_crossref_reap();
}
7302
/*
 * Callout (cpu0 only) driving crossref garbage collection: resend the
 * dropable reap message, but only if it is not already in flight
 * (MSGF_DONE set).  The flag test and send must be atomic w/ respect
 * to the netisr, hence the critical section.
 */
static void
ipfw_crossref_timeo(void *dummy __unused)
{
	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;

	KKASSERT(mycpuid == 0);

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
7315
7316 static void
7317 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7318 {
7319         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7320         struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7321         struct ip_fw *f;
7322
7323         ASSERT_NETISR_NCPUS(mycpuid);
7324
7325         for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7326                 int l, cmdlen;
7327                 ipfw_insn *cmd;
7328
7329                 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7330                         continue;
7331
7332                 for (l = f->cmd_len, cmd = f->cmd; l > 0;
7333                      l -= cmdlen, cmd += cmdlen) {
7334                         cmdlen = F_LEN(cmd);
7335                         if (cmd->opcode == O_IP_SRC_IFIP ||
7336                             cmd->opcode == O_IP_DST_IFIP) {
7337                                 if (strncmp(ifp->if_xname,
7338                                     ((ipfw_insn_ifip *)cmd)->ifname,
7339                                     IFNAMSIZ) == 0)
7340                                         cmd->arg1 &= ~IPFW_IFIP_VALID;
7341                         }
7342                 }
7343         }
7344         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7345 }
7346
/*
 * ifaddr_event handler: broadcast the affected ifnet to all netisr
 * cpus so each can invalidate its cached interface addresses (see
 * ipfw_ifaddr_dispatch()).  Synchronous; the on-stack message is safe.
 */
static void
ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
    enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
{
	struct netmsg_base nm;

	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ifaddr_dispatch);
	nm.lmsg.u.ms_resultp = ifp;
	netisr_domsg_global(&nm);
}
7358
/*
 * One-shot module initialization, run on netisr0: set up global
 * state, build the per-cpu contexts, publish the firewall hooks and
 * start the per-cpu expiry/keepalive timers.  Replies EEXIST if the
 * firewall is already loaded.
 */
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	/* Clamp the per-cpu table count to a sane non-zero 16-bit range. */
	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
		ipfw_table_max = UINT16_MAX;

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	/* GC for freed crossref rules. */
	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);

	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	/* Build the per-cpu contexts; visits every netisr cpu in turn. */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	/* Publish the hooks; IPFW_LOADED keys off ip_fw_chk_ptr. */
	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	/* NB: continued by one of the kprintf()s below ("logging ..."). */
	kprintf("ipfw2 initialized, default to %s, logging ",
		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
		O_ACCEPT ? "accept" : "deny");

#ifdef IPFIREWALL_VERBOSE
	fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
	if (fw_verbose == 0) {
		kprintf("disabled\n");
	} else if (verbose_limit == 0) {
		kprintf("unlimited\n");
	} else {
		kprintf("limited to %d packets/entry by default\n",
			verbose_limit);
	}

	ip_fw_loaded = 1;
	/* Start the per-cpu state/track expiry and keepalive timers. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
		    ipfw_state_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
		    ipfw_track_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
		    ipfw_keepalive, NULL, cpu);
	}

	if (fw_enable)
		ipfw_hook();

	/* Invalidate cached ifaddrs when interface addresses change. */
	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
	    NULL, EVENTHANDLER_PRI_ANY);
	if (ipfw_ifaddr_event == NULL)
		kprintf("ipfw: ifaddr_event register failed\n");

reply:
	netisr_replymsg(&nmsg->base, error);
}
7437
7438 static int
7439 ipfw_init(void)
7440 {
7441         struct netmsg_base smsg;
7442
7443         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7444             ipfw_init_dispatch);
7445         return netisr_domsg(&smsg, 0);
7446 }
7447
7448 #ifdef KLD_MODULE
7449
/*
 * Per-cpu teardown counterpart of ipfw_ctx_init_dispatch(), visited
 * cpu by cpu: stop this cpu's callouts, discard any still-queued
 * dropable expiry/keepalive messages and flush its tables.  The
 * context itself is freed later by ipfw_fini_dispatch().
 */
static void
ipfw_ctx_fini_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Cancel the callouts first so no new messages get sent. */
	callout_cancel(&ctx->ipfw_stateto_ch);
	callout_cancel(&ctx->ipfw_trackto_ch);
	callout_cancel(&ctx->ipfw_keepalive_ch);
	callout_cancel(&ctx->ipfw_xlatreap_ch);

	/* Atomically drop whatever dropable messages are still queued. */
	crit_enter();
	netisr_dropmsg(&ctx->ipfw_stateexp_more);
	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
	netisr_dropmsg(&ctx->ipfw_trackexp_more);
	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
	netisr_dropmsg(&ctx->ipfw_keepalive_more);
	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
	crit_exit();

	ipfw_table_flushall_oncpu(ctx, 1);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
7476
/*
 * Module unload, run on netisr0.  Fails with EBUSY while rules are
 * still cross-referenced; otherwise unhooks the firewall, tears down
 * the per-cpu contexts, removes all rules (including the default one)
 * and frees the contexts.
 */
static void
ipfw_fini_dispatch(netmsg_t nmsg)
{
	struct netmsg_base nm;
	int error = 0, cpu;

	ASSERT_NETISR0;

	/* Last-chance reap before checking the reference count. */
	ipfw_crossref_reap();

	if (ipfw_gd.ipfw_refcnt != 0) {
		error = EBUSY;
		goto reply;
	}

	ip_fw_loaded = 0;
	ipfw_dehook();

	/* Synchronize any inflight state/track expire IPIs. */
	lwkt_synchronize_ipiqs("ipfwfini");

	/* Per-cpu teardown; visits every netisr cpu in turn. */
	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_fini_dispatch);
	netisr_domsg_global(&nm);

	/* Stop the crossref GC and drop its possibly-queued message. */
	callout_cancel(&ipfw_gd.ipfw_crossref_ch);
	crit_enter();
	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
	crit_exit();

	if (ipfw_ifaddr_event != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);

	/* Clear the hooks, then remove every rule. */
	ip_fw_chk_ptr = NULL;
	ip_fw_ctl_ptr = NULL;
	ip_fw_dn_io_ptr = NULL;
	ipfw_flush(1 /* kill default rule */);

	/* Free pre-cpu context */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		kfree(ipfw_ctx[cpu], M_IPFW);

	kprintf("IP firewall unloaded\n");
reply:
	netisr_replymsg(&nmsg->base, error);
}
7523
/*
 * Netisr0 helper for ipfw_fini(): remove every rule except the
 * default one and reap crossref'ed rules, so the reference count can
 * drain before the real unload is attempted.
 */
static void
ipfw_fflush_dispatch(netmsg_t nmsg)
{

	ipfw_flush(0 /* keep default rule */);
	ipfw_crossref_reap();
	netisr_replymsg(&nmsg->base, 0);
}
7532
7533 static int
7534 ipfw_fini(void)
7535 {
7536         struct netmsg_base smsg;
7537         int i = 0;
7538
7539         for (;;) {
7540                 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7541                     ipfw_fflush_dispatch);
7542                 netisr_domsg(&smsg, 0);
7543
7544                 if (ipfw_gd.ipfw_refcnt == 0)
7545                         break;
7546                 kprintf("ipfw: flush pending %d\n", ++i);
7547                 tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7548         }
7549
7550         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7551             ipfw_fini_dispatch);
7552         return netisr_domsg(&smsg, 0);
7553 }
7554
7555 #endif  /* KLD_MODULE */
7556
7557 static int
7558 ipfw_modevent(module_t mod, int type, void *unused)
7559 {
7560         int err = 0;
7561
7562         switch (type) {
7563         case MOD_LOAD:
7564                 err = ipfw_init();
7565                 break;
7566
7567         case MOD_UNLOAD:
7568 #ifndef KLD_MODULE
7569                 kprintf("ipfw statically compiled, cannot unload\n");
7570                 err = EBUSY;
7571 #else
7572                 err = ipfw_fini();
7573 #endif
7574                 break;
7575         default:
7576                 break;
7577         }
7578         return err;
7579 }
7580
/*
 * Module glue: register "ipfw" with ipfw_modevent() as its handler.
 * Initialized at SI_SUB_PROTO_END so the protocol stack is up first.
 */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);