dragonfly.git: sys/net/ipfw/ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79
80 #include <net/ipfw/ip_fw2.h>
81
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85         if (fw_debug > 0) \
86                 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)       ((void)0)
90 #endif
91
92 /*
93  * Description of per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * The following graph shows how operations on the per-CPU rule
99  * lists are performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the current rule's duplicates
138  *    on the other CPUs can safely be accessed read-only through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of re-locating rule3's position in
148  *       netisr1 by iterating netisr1's layer3_chain, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
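
/*
 * Editorial sketch of the message flow pictured above.  This is a
 * hedged illustration, not the actual control path; the real dispatch
 * code appears later in this file.  netmsg_init(), netisr_domsg() and
 * netisr_forwardmsg() are the DragonFly netisr primitives this file
 * relies on; the handler name below is hypothetical.
 *
 *	struct netmsg_ipfw nmsg;
 *
 *	netmsg_init(&nmsg.base, NULL, &curthread->td_msgport, 0,
 *	    ipfw_example_dispatch);
 *	netisr_domsg(&nmsg.base, 0);	// run on netisr0 first
 *
 *	// Inside the handler, after this cpu's add/delete is done:
 *	//	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
 *	// Forwarding past the last cpu replies to the original caller.
 */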
153
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states we do not create any more.  This is done
170  * to avoid consuming too much memory, and also too much time
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by the 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks we do not create any more.  This is done to
203  * avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers: the track counter RB tree is
206  * shared between CPUs, while the track RB tree is per-cpu.  States
207  * generated by the 'limit' option are linked to the track in addition
208  * to the per-cpu state RB tree, mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
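
/*
 * Editorial example (hedged, derived from the description above): with
 * a rule such as
 *	ipfw add allow tcp from any to any setup limit src-addr 10
 * each distinct masked source address gets one ipfw_trkcnt that is
 * shared by all CPUs; every per-cpu ipfw_track for that address points
 * its t_count at the shared counter, so the 10-session limit is
 * enforced against a single number no matter which CPU the connections
 * land on, while the states themselves remain strictly per-cpu.
 */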
243
244 #define IPFW_AUTOINC_STEP_MIN   1
245 #define IPFW_AUTOINC_STEP_MAX   1000
246 #define IPFW_AUTOINC_STEP_DEF   100
247
248 #define IPFW_TABLE_MAX_DEF      64
249
250 #define IPFW_DEFAULT_RULE       65535   /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET        31      /* set number for the default rule */
252
253 #define MATCH_REVERSE           0
254 #define MATCH_FORWARD           1
255 #define MATCH_NONE              2
256 #define MATCH_UNKNOWN           3
257
258 #define TIME_LEQ(a, b)          ((a) - (b) <= 0)
259
260 #define IPFW_STATE_TCPFLAGS     (TH_SYN | TH_FIN | TH_RST)
261 #define IPFW_STATE_TCPSTATES    (IPFW_STATE_TCPFLAGS |  \
262                                  (IPFW_STATE_TCPFLAGS << 8))
263
264 #define BOTH_SYN                (TH_SYN | (TH_SYN << 8))
265 #define BOTH_FIN                (TH_FIN | (TH_FIN << 8))
266 #define BOTH_RST                (TH_RST | (TH_RST << 8))
267 /* TH_ACK here means FIN was ACKed. */
268 #define BOTH_FINACK             (TH_ACK | (TH_ACK << 8))
269
270 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP &&        \
271                                  (((s)->st_state & BOTH_RST) ||         \
272                                   ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
273
274 #define O_ANCHOR                O_NOP
275
276 #define IPFW_ISXLAT(type)       ((type) == O_REDIRECT)
277 #define IPFW_XLAT_INVALID(s)    (IPFW_ISXLAT((s)->st_type) &&   \
278                                  ((struct ipfw_xlat *)(s))->xlat_invalid)
279
280 #define IPFW_MBUF_XLATINS       FW_MBUF_PRIVATE1
281 #define IPFW_MBUF_XLATFWD       FW_MBUF_PRIVATE2
282
283 #define IPFW_XLATE_INSERT       0x0001
284 #define IPFW_XLATE_FORWARD      0x0002
285 #define IPFW_XLATE_OUTPUT       0x0004
286
287 struct netmsg_ipfw {
288         struct netmsg_base      base;
289         const struct ipfw_ioc_rule *ioc_rule;
290         struct ip_fw            *next_rule;
291         struct ip_fw            *prev_rule;
292         struct ip_fw            *sibling;
293         uint32_t                rule_flags;
294         struct ip_fw            **cross_rules;
295 };
296
297 struct netmsg_del {
298         struct netmsg_base      base;
299         struct ip_fw            *start_rule;
300         struct ip_fw            *prev_rule;
301         uint16_t                rulenum;
302         uint8_t                 from_set;
303         uint8_t                 to_set;
304 };
305
306 struct netmsg_zent {
307         struct netmsg_base      base;
308         struct ip_fw            *start_rule;
309         uint16_t                rulenum;
310         uint16_t                log_only;
311 };
312
313 struct netmsg_cpstate {
314         struct netmsg_base      base;
315         struct ipfw_ioc_state   *ioc_state;
316         int                     state_cntmax;
317         int                     state_cnt;
318 };
319
320 struct netmsg_tblent {
321         struct netmsg_base      base;
322         struct sockaddr         *key;
323         struct sockaddr         *netmask;
324         struct ipfw_tblent      *sibling;
325         int                     tableid;
326 };
327
328 struct netmsg_tblflush {
329         struct netmsg_base      base;
330         int                     tableid;
331         int                     destroy;
332 };
333
334 struct netmsg_tblexp {
335         struct netmsg_base      base;
336         time_t                  expire;
337         int                     tableid;
338         int                     cnt;
339         int                     expcnt;
340         struct radix_node_head  *rnh;
341 };
342
343 struct ipfw_table_cp {
344         struct ipfw_ioc_tblent  *te;
345         int                     te_idx;
346         int                     te_cnt;
347 };
348
349 struct ip_fw_local {
350         /*
351          * offset       The offset of a fragment. offset != 0 means that
352          *      we have a fragment at this offset of an IPv4 packet.
353          *      offset == 0 means that (if this is an IPv4 packet)
354          *      this is the first or only fragment.
355          */
356         u_short                 offset;
357
358         /*
359          * Local copies of addresses. They are only valid if we have
360          * an IP packet.
361          *
362          * proto        The protocol. Set to 0 for non-ip packets,
363          *      or to the protocol read from the packet otherwise.
364          *      proto != 0 means that we have an IPv4 packet.
365          *
366          * src_port, dst_port   port numbers, in HOST format. Only
367          *      valid for TCP and UDP packets.
368          *
369          * src_ip, dst_ip       ip addresses, in NETWORK format.
370          *      Only valid for IPv4 packets.
371          */
372         uint8_t                 proto;
373         uint16_t                src_port;       /* NOTE: host format    */
374         uint16_t                dst_port;       /* NOTE: host format    */
375         struct in_addr          src_ip;         /* NOTE: network format */
376         struct in_addr          dst_ip;         /* NOTE: network format */
377         uint16_t                ip_len;
378         struct tcphdr           *tcp;
379 };
380
381 struct ipfw_addrs {
382         uint32_t                addr1;  /* host byte order */
383         uint32_t                addr2;  /* host byte order */
384 };
385
386 struct ipfw_ports {
387         uint16_t                port1;  /* host byte order */
388         uint16_t                port2;  /* host byte order */
389 };
390
391 struct ipfw_key {
392         union {
393                 struct ipfw_addrs addrs;
394                 uint64_t        value;
395         } addr_u;
396         union {
397                 struct ipfw_ports ports;
398                 uint32_t        value;
399         } port_u;
400         uint8_t                 proto;
401         uint8_t                 swap;   /* IPFW_KEY_SWAP_ */
402         uint16_t                rsvd2;
403 };
404
405 #define IPFW_KEY_SWAP_ADDRS     0x1
406 #define IPFW_KEY_SWAP_PORTS     0x2
407 #define IPFW_KEY_SWAP_ALL       (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
408
409 struct ipfw_trkcnt {
410         RB_ENTRY(ipfw_trkcnt)   tc_rblink;
411         struct ipfw_key         tc_key;
412         uintptr_t               tc_ruleid;
413         int                     tc_refs;
414         int                     tc_count;
415         time_t                  tc_expire;      /* userland get-only */
416         uint16_t                tc_rulenum;     /* userland get-only */
417 } __cachealign;
418
419 #define tc_addrs                tc_key.addr_u.value
420 #define tc_ports                tc_key.port_u.value
421 #define tc_proto                tc_key.proto
422 #define tc_saddr                tc_key.addr_u.addrs.addr1
423 #define tc_daddr                tc_key.addr_u.addrs.addr2
424 #define tc_sport                tc_key.port_u.ports.port1
425 #define tc_dport                tc_key.port_u.ports.port2
426
427 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
428
429 struct ipfw_state;
430
431 struct ipfw_track {
432         RB_ENTRY(ipfw_track)    t_rblink;
433         struct ipfw_key         t_key;
434         struct ip_fw            *t_rule;
435         time_t                  t_lastexp;
436         LIST_HEAD(, ipfw_state) t_state_list;
437         time_t                  t_expire;
438         volatile int            *t_count;
439         struct ipfw_trkcnt      *t_trkcnt;
440         TAILQ_ENTRY(ipfw_track) t_link;
441 };
442
443 #define t_addrs                 t_key.addr_u.value
444 #define t_ports                 t_key.port_u.value
445 #define t_proto                 t_key.proto
446 #define t_saddr                 t_key.addr_u.addrs.addr1
447 #define t_daddr                 t_key.addr_u.addrs.addr2
448 #define t_sport                 t_key.port_u.ports.port1
449 #define t_dport                 t_key.port_u.ports.port2
450
451 RB_HEAD(ipfw_track_tree, ipfw_track);
452 TAILQ_HEAD(ipfw_track_list, ipfw_track);
453
454 struct ipfw_state {
455         RB_ENTRY(ipfw_state)    st_rblink;
456         struct ipfw_key         st_key;
457
458         time_t                  st_expire;      /* expire time */
459         struct ip_fw            *st_rule;
460
461         uint64_t                st_pcnt;        /* packets */
462         uint64_t                st_bcnt;        /* bytes */
463
464         /*
465          * st_state:
466          * State of this rule, typically a combination of TCP flags.
467          *
468          * st_ack_fwd/st_ack_rev:
469          * Most recent ACKs in forward and reverse direction.  They
470          * are used to generate keepalives.
471          */
472         uint32_t                st_state;
473         uint32_t                st_ack_fwd;     /* host byte order */
474         uint32_t                st_seq_fwd;     /* host byte order */
475         uint32_t                st_ack_rev;     /* host byte order */
476         uint32_t                st_seq_rev;     /* host byte order */
477
478         uint16_t                st_flags;       /* IPFW_STATE_F_ */
479         uint16_t                st_type;        /* KEEP_STATE/LIMIT/RDR */
480         struct ipfw_track       *st_track;
481
482         LIST_ENTRY(ipfw_state)  st_trklink;
483         TAILQ_ENTRY(ipfw_state) st_link;
484 };
485
486 #define st_addrs                st_key.addr_u.value
487 #define st_ports                st_key.port_u.value
488 #define st_proto                st_key.proto
489 #define st_swap                 st_key.swap
490
491 #define IPFW_STATE_F_ACKFWD     0x0001
492 #define IPFW_STATE_F_SEQFWD     0x0002
493 #define IPFW_STATE_F_ACKREV     0x0004
494 #define IPFW_STATE_F_SEQREV     0x0008
495 #define IPFW_STATE_F_XLATSRC    0x0010
496 #define IPFW_STATE_F_XLATSLAVE  0x0020
497 #define IPFW_STATE_F_LINKED     0x0040
498
499 #define IPFW_STATE_SCANSKIP(s)  ((s)->st_type == O_ANCHOR ||    \
500                                  ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
501
502 /* Expired or being deleted. */
503 #define IPFW_STATE_ISDEAD(s)    (TIME_LEQ((s)->st_expire, time_uptime) || \
504                                  IPFW_XLAT_INVALID((s)))
505
506 TAILQ_HEAD(ipfw_state_list, ipfw_state);
507 RB_HEAD(ipfw_state_tree, ipfw_state);
508
509 struct ipfw_xlat {
510         struct ipfw_state       xlat_st;        /* MUST be the first field */
511         uint32_t                xlat_addr;      /* network byte order */
512         uint16_t                xlat_port;      /* network byte order */
513         uint16_t                xlat_dir;       /* MATCH_ */
514         struct ifnet            *xlat_ifp;      /* matching ifnet */
515         struct ipfw_xlat        *xlat_pair;     /* paired state */
516         int                     xlat_pcpu;      /* paired cpu */
517         volatile int            xlat_invalid;   /* invalid, but not dtor yet */
518         volatile uint64_t       xlat_crefs;     /* cross references */
519         struct netmsg_base      xlat_freenm;    /* for remote free */
520 };
521
522 #define xlat_type               xlat_st.st_type
523 #define xlat_flags              xlat_st.st_flags
524 #define xlat_rule               xlat_st.st_rule
525 #define xlat_bcnt               xlat_st.st_bcnt
526 #define xlat_pcnt               xlat_st.st_pcnt
527
528 struct ipfw_tblent {
529         struct radix_node       te_nodes[2];
530         struct sockaddr_in      te_key;
531         u_long                  te_use;
532         time_t                  te_lastuse;
533         struct ipfw_tblent      *te_sibling;
534         volatile int            te_expired;
535 };
536
537 struct ipfw_context {
538         struct ip_fw            *ipfw_layer3_chain;     /* rules for layer3 */
539         struct ip_fw            *ipfw_default_rule;     /* default rule */
540         uint64_t                ipfw_norule_counter;    /* ipfw_log(NULL) stat*/
541
542         /*
543          * ipfw_set_disable contains one bit per set value (0..31).
544          * If the bit is set, all rules with the corresponding set
545  * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
546          * default rule and CANNOT be disabled.
547          */
548         uint32_t                ipfw_set_disable;
549
550         uint8_t                 ipfw_flags;     /* IPFW_FLAG_ */
551
552         struct ip_fw            *ipfw_cont_rule;
553         struct ipfw_xlat        *ipfw_cont_xlat;
554
555         struct ipfw_state_tree  ipfw_state_tree;
556         struct ipfw_state_list  ipfw_state_list;
557         int                     ipfw_state_loosecnt;
558         int                     ipfw_state_cnt;
559
560         union {
561                 struct ipfw_state state;
562                 struct ipfw_track track;
563                 struct ipfw_trkcnt trkcnt;
564         } ipfw_tmpkey;
565
566         struct ipfw_track_tree  ipfw_track_tree;
567         struct ipfw_track_list  ipfw_track_list;
568         struct ipfw_trkcnt      *ipfw_trkcnt_spare;
569
570         struct callout          ipfw_stateto_ch;
571         time_t                  ipfw_state_lastexp;
572         struct netmsg_base      ipfw_stateexp_nm;
573         struct netmsg_base      ipfw_stateexp_more;
574         struct ipfw_state       ipfw_stateexp_anch;
575
576         struct callout          ipfw_trackto_ch;
577         time_t                  ipfw_track_lastexp;
578         struct netmsg_base      ipfw_trackexp_nm;
579         struct netmsg_base      ipfw_trackexp_more;
580         struct ipfw_track       ipfw_trackexp_anch;
581
582         struct callout          ipfw_keepalive_ch;
583         struct netmsg_base      ipfw_keepalive_nm;
584         struct netmsg_base      ipfw_keepalive_more;
585         struct ipfw_state       ipfw_keepalive_anch;
586
587         struct callout          ipfw_xlatreap_ch;
588         struct netmsg_base      ipfw_xlatreap_nm;
589         struct ipfw_state_list  ipfw_xlatreap;
590
591         /*
592          * Statistics
593          */
594         u_long                  ipfw_sts_reap;
595         u_long                  ipfw_sts_reapfailed;
596         u_long                  ipfw_sts_overflow;
597         u_long                  ipfw_sts_nomem;
598         u_long                  ipfw_sts_tcprecycled;
599
600         u_long                  ipfw_tks_nomem;
601         u_long                  ipfw_tks_reap;
602         u_long                  ipfw_tks_reapfailed;
603         u_long                  ipfw_tks_overflow;
604         u_long                  ipfw_tks_cntnomem;
605
606         u_long                  ipfw_frags;
607         u_long                  ipfw_defraged;
608         u_long                  ipfw_defrag_remote;
609
610         u_long                  ipfw_xlated;
611         u_long                  ipfw_xlate_split;
612         u_long                  ipfw_xlate_conflicts;
613         u_long                  ipfw_xlate_cresolved;
614
615         /* Last field */
616         struct radix_node_head  *ipfw_tables[];
617 };
618
619 #define IPFW_FLAG_KEEPALIVE     0x01
620 #define IPFW_FLAG_STATEEXP      0x02
621 #define IPFW_FLAG_TRACKEXP      0x04
622 #define IPFW_FLAG_STATEREAP     0x08
623 #define IPFW_FLAG_TRACKREAP     0x10
624
625 #define ipfw_state_tmpkey       ipfw_tmpkey.state
626 #define ipfw_track_tmpkey       ipfw_tmpkey.track
627 #define ipfw_trkcnt_tmpkey      ipfw_tmpkey.trkcnt
628
629 struct ipfw_global {
630         int                     ipfw_state_loosecnt;    /* cache aligned */
631         time_t                  ipfw_state_globexp __cachealign;
632
633         struct lwkt_token       ipfw_trkcnt_token __cachealign;
634         struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
635         int                     ipfw_trkcnt_cnt;
636         time_t                  ipfw_track_globexp;
637
638         /* Accessed in netisr0. */
639         struct ip_fw            *ipfw_crossref_free __cachealign;
640         struct callout          ipfw_crossref_ch;
641         struct netmsg_base      ipfw_crossref_nm;
642
643 #ifdef KLD_MODULE
644         /*
645          * The module cannot be unloaded if there are references to
646          * certain rules of ipfw(4), e.g. from dummynet(4).
647          */
648         int                     ipfw_refcnt __cachealign;
649 #endif
650 } __cachealign;
651
652 static struct ipfw_context      *ipfw_ctx[MAXCPU];
653
654 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
655
656 /*
657  * The following two global variables are accessed and updated only
658  * in netisr0.
659  */
660 static uint32_t static_count;   /* # of static rules */
661 static uint32_t static_ioc_len; /* bytes of static rules */
662
663 /*
664  * If 1, then ipfw static rules are being flushed and
665  * ipfw_chk() will skip to the default rule.
666  */
667 static int ipfw_flushing;
668
669 static int fw_verbose;
670 static int verbose_limit;
671
672 static int fw_debug;
673 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
674
675 static int      ipfw_table_max = IPFW_TABLE_MAX_DEF;
676
677 static int      ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
678 static int      ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
679
680 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
681
682 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
683 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
684     "Firewall statistics");
685
686 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
687     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
688 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
689     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
690     "Rule number autoincrement step");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
692     &fw_one_pass, 0,
693     "Only do a single pass through ipfw when using dummynet(4)");
694 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
695     &fw_debug, 0, "Enable printing of debug ip_fw statements");
696 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
697     &fw_verbose, 0, "Log matches to ipfw rules");
698 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
699     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
700 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
701     &ipfw_table_max, 0, "Max # of tables");
702
703 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
704 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
705 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
706 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
707 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
708 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
709
710 /*
711  * Timeouts for various events in handling states.
712  *
713  * NOTE:
714  * 1 == 0~1 second.
715  * 2 == 1~2 second(s).
716  *
717  * We use 2 seconds for FIN lifetime, so that the states will not be
718  * reaped prematurely.
719  */
720 static uint32_t dyn_ack_lifetime = 300;
721 static uint32_t dyn_syn_lifetime = 20;
722 static uint32_t dyn_finwait_lifetime = 20;
723 static uint32_t dyn_fin_lifetime = 2;
724 static uint32_t dyn_rst_lifetime = 2;
725 static uint32_t dyn_udp_lifetime = 10;
726 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
727
728 /*
729  * Keepalives are sent if dyn_keepalive is set. They are sent every
730  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
731  * seconds of lifetime of a rule.
732  */
733 static uint32_t dyn_keepalive_interval = 20;
734 static uint32_t dyn_keepalive_period = 5;
735 static uint32_t dyn_keepalive = 1;      /* do send keepalives */
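
/*
 * Editorial example (derived from the comment above, assuming the
 * lifetime is refreshed by traffic as usual for ipfw dynamic rules):
 * an established TCP state uses dyn_ack_lifetime (300 seconds); once
 * it enters its last dyn_keepalive_interval (20) seconds, keepalives
 * are generated every dyn_keepalive_period (5) seconds, so a silent
 * but still-connected peer refreshes the state before it expires.
 */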
736
737 static struct ipfw_global       ipfw_gd;
738 static int      ipfw_state_loosecnt_updthr;
739 static int      ipfw_state_max = 4096;  /* max # of states */
740 static int      ipfw_track_max = 4096;  /* max # of tracks */
741
742 static int      ipfw_state_headroom;    /* setup at module load time */
743 static int      ipfw_state_reap_min = 8;
744 static int      ipfw_state_expire_max = 32;
745 static int      ipfw_state_scan_max = 256;
746 static int      ipfw_keepalive_max = 8;
747 static int      ipfw_track_reap_max = 4;
748 static int      ipfw_track_expire_max = 16;
749 static int      ipfw_track_scan_max = 128;
750
751 static eventhandler_tag ipfw_ifaddr_event;
752
753 /* Compat */
754 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
755     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
756     "Number of states and tracks");
757 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
758     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
759     "Max number of states and tracks");
760
761 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
762     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
763     "Number of states");
764 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
765     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
766     "Max number of states");
767 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
768     &ipfw_state_headroom, 0, "headroom for state reap");
769 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
770     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
771 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
772     &ipfw_track_max, 0, "Max number of tracks");
773 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
774     &static_count, 0, "Number of static rules");
775 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
776     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
777 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
778     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
779 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
780     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
781 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
782     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
783 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
784     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
785 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
786     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
787 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
788     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
789 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
790     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
791 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
792     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
793     "I", "# of states to scan for each expire iteration");
794 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
795     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
796     "I", "# of states to expire for each expire iteration");
797 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
798     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
799     "I", "# of keepalives to send for each expire iteration");
800 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
801     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
802     "I", "# of states to reap for state shortage");
803 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
804     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
805     "I", "# of tracks to scan for each expire iteration");
806 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
807     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
808     "I", "# of tracks to expire for each expire iteration");
809 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
810     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
811     "I", "# of tracks to reap for track shortage");
812
813 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
814     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
815     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
816     "LU", "# of state reaps due to states shortage");
817 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
818     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
819     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
820     "LU", "# of state reap failure");
821 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
822     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
823     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
824     "LU", "# of state overflow");
825 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
826     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
827     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
828     "LU", "# of state allocation failure");
829 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
830     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
831     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
832     "LU", "# of state deleted due to fast TCP port recycling");
833
834 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
835     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
836     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
837     "LU", "# of track allocation failure");
838 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
839     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
840     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
841     "LU", "# of track reap due to tracks shortage");
842 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
843     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
844     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
845     "LU", "# of track reap failure");
846 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
847     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
848     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
849     "LU", "# of track overflow");
850 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
851     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
852     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
853     "LU", "# of track counter allocation failure");
854 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
855     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
856     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
857     "LU", "# of IP fragments defragged");
858 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
859     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
860     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
861     "LU", "# of IP packets after defrag");
862 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
863     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
864     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
865     "LU", "# of IP packets after defrag dispatched to remote cpus");
866 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
867     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
868     __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
869     "LU", "# address/port translations");
870 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
871     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
872     __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
873     "LU", "# address/port translations split between different cpus");
874 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
875     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
876     __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
877     "LU", "# address/port translations conflicts on remote cpu");
878 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
879     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
880     __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
881     "LU", "# address/port translations conflicts resolved on remote cpu");
882
883 static int              ipfw_state_cmp(struct ipfw_state *,
884                             struct ipfw_state *);
885 static int              ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
886                             struct ipfw_trkcnt *);
887 static int              ipfw_track_cmp(struct ipfw_track *,
888                             struct ipfw_track *);
889
890 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
891 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
892
893 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
894 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
895
896 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
897 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
898
899 static int              ipfw_chk(struct ip_fw_args *);
900 static void             ipfw_track_expire_ipifunc(void *);
901 static void             ipfw_state_expire_ipifunc(void *);
902 static void             ipfw_keepalive(void *);
903 static int              ipfw_state_expire_start(struct ipfw_context *,
904                             int, int);
905 static void             ipfw_crossref_timeo(void *);
906 static void             ipfw_state_remove(struct ipfw_context *,
907                             struct ipfw_state *);
908 static void             ipfw_xlat_reap_timeo(void *);
909 static void             ipfw_defrag_redispatch(struct mbuf *, int,
910                             struct ip_fw *);
911
912 #define IPFW_TRKCNT_TOKGET      lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
913 #define IPFW_TRKCNT_TOKREL      lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
914 #define IPFW_TRKCNT_TOKINIT     \
915         lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
916
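/*
 * Editorial comment: copy 'src' into 'dst' with the body ANDed
 * byte-wise against 'netmask'.  sa_len and sa_family are copied
 * verbatim, the masking stops at the shorter of the netmask's and the
 * source's sa_len, and any remaining bytes of 'dst' (up to src->sa_len)
 * are zeroed.
 */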
917 static void
918 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
919     const struct sockaddr *netmask)
920 {
921         const u_char *cp1 = (const u_char *)src;
922         u_char *cp2 = (u_char *)dst;
923         const u_char *cp3 = (const u_char *)netmask;
924         u_char *cplim = cp2 + *cp3;
925         u_char *cplim2 = cp2 + *cp1;
926
927         *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
928         cp3 += 2;
929         if (cplim > cplim2)
930                 cplim = cplim2;
931         while (cp2 < cplim)
932                 *cp2++ = *cp1++ & *cp3++;
933         if (cp2 < cplim2)
934                 bzero(cp2, cplim2 - cp2);
935 }
936
937 static __inline uint16_t
938 pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
939 {
940         uint32_t l;
941
942         if (udp && !cksum)
943                 return (0x0000);
944         l = cksum + old - new;
945         l = (l >> 16) + (l & 65535);
946         l = l & 65535;
947         if (udp && !l)
948                 return (0xFFFF);
949         return (l);
950 }
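
/*
 * Illustrative sketch (editorial; hypothetical helper, not used by the
 * firewall itself): pfil_cksum_fixup() works on 16-bit chunks, so
 * rewriting a 32-bit address means fixing the checksum up twice, once
 * per half.  The address halves and the checksum are assumed to be in
 * network byte order, mirroring pf-style fixups.
 */
static __inline uint16_t
ipfw_example_cksum_fixup_addr(uint16_t cksum, in_addr_t oaddr,
    in_addr_t naddr, uint8_t udp)
{
	cksum = pfil_cksum_fixup(cksum, oaddr & 0xffff, naddr & 0xffff, udp);
	cksum = pfil_cksum_fixup(cksum, oaddr >> 16, naddr >> 16, udp);
	return (cksum);
}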
951
952 static __inline void
953 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
954     in_addr_t daddr, uint16_t dport, uint8_t proto)
955 {
956
957         key->proto = proto;
958         key->swap = 0;
959
960         if (saddr < daddr) {
961                 key->addr_u.addrs.addr1 = daddr;
962                 key->addr_u.addrs.addr2 = saddr;
963                 key->swap |= IPFW_KEY_SWAP_ADDRS;
964         } else {
965                 key->addr_u.addrs.addr1 = saddr;
966                 key->addr_u.addrs.addr2 = daddr;
967         }
968
969         if (sport < dport) {
970                 key->port_u.ports.port1 = dport;
971                 key->port_u.ports.port2 = sport;
972                 key->swap |= IPFW_KEY_SWAP_PORTS;
973         } else {
974                 key->port_u.ports.port1 = sport;
975                 key->port_u.ports.port2 = dport;
976         }
977
978         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
979                 key->swap |= IPFW_KEY_SWAP_PORTS;
980         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
981                 key->swap |= IPFW_KEY_SWAP_ADDRS;
982 }
983
984 static __inline void
985 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
986     in_addr_t *daddr, uint16_t *dport)
987 {
988
989         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
990                 *saddr = key->addr_u.addrs.addr2;
991                 *daddr = key->addr_u.addrs.addr1;
992         } else {
993                 *saddr = key->addr_u.addrs.addr1;
994                 *daddr = key->addr_u.addrs.addr2;
995         }
996
997         if (key->swap & IPFW_KEY_SWAP_PORTS) {
998                 *sport = key->port_u.ports.port2;
999                 *dport = key->port_u.ports.port1;
1000         } else {
1001                 *sport = key->port_u.ports.port1;
1002                 *dport = key->port_u.ports.port2;
1003         }
1004 }
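
/*
 * Illustrative sketch (editorial; hypothetical helper): ipfw_key_build()
 * canonicalizes the 4-tuple, so both directions of a flow produce the
 * same address/port words; the IPFW_KEY_SWAP_* bits are either identical
 * or differ by exactly IPFW_KEY_SWAP_ALL, which is what ipfw_state_cmp()
 * below accepts as equal.  That is why a single state matches both
 * forward and reverse packets.
 */
static __inline int
ipfw_example_key_bidir(in_addr_t saddr, uint16_t sport,
    in_addr_t daddr, uint16_t dport, uint8_t proto)
{
	struct ipfw_key fwd, rev;

	ipfw_key_build(&fwd, saddr, sport, daddr, dport, proto);
	ipfw_key_build(&rev, daddr, dport, saddr, sport, proto);
	return (fwd.addr_u.value == rev.addr_u.value &&
	    fwd.port_u.value == rev.port_u.value);	/* always true */
}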
1005
1006 static int
1007 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1008 {
1009
1010         if (s1->st_proto > s2->st_proto)
1011                 return (1);
1012         if (s1->st_proto < s2->st_proto)
1013                 return (-1);
1014
1015         if (s1->st_addrs > s2->st_addrs)
1016                 return (1);
1017         if (s1->st_addrs < s2->st_addrs)
1018                 return (-1);
1019
1020         if (s1->st_ports > s2->st_ports)
1021                 return (1);
1022         if (s1->st_ports < s2->st_ports)
1023                 return (-1);
1024
1025         if (s1->st_swap == s2->st_swap ||
1026             (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1027                 return (0);
1028
1029         if (s1->st_swap > s2->st_swap)
1030                 return (1);
1031         else
1032                 return (-1);
1033 }
1034
1035 static int
1036 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1037 {
1038
1039         if (t1->tc_proto > t2->tc_proto)
1040                 return (1);
1041         if (t1->tc_proto < t2->tc_proto)
1042                 return (-1);
1043
1044         if (t1->tc_addrs > t2->tc_addrs)
1045                 return (1);
1046         if (t1->tc_addrs < t2->tc_addrs)
1047                 return (-1);
1048
1049         if (t1->tc_ports > t2->tc_ports)
1050                 return (1);
1051         if (t1->tc_ports < t2->tc_ports)
1052                 return (-1);
1053
1054         if (t1->tc_ruleid > t2->tc_ruleid)
1055                 return (1);
1056         if (t1->tc_ruleid < t2->tc_ruleid)
1057                 return (-1);
1058
1059         return (0);
1060 }
1061
1062 static int
1063 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1064 {
1065
1066         if (t1->t_proto > t2->t_proto)
1067                 return (1);
1068         if (t1->t_proto < t2->t_proto)
1069                 return (-1);
1070
1071         if (t1->t_addrs > t2->t_addrs)
1072                 return (1);
1073         if (t1->t_addrs < t2->t_addrs)
1074                 return (-1);
1075
1076         if (t1->t_ports > t2->t_ports)
1077                 return (1);
1078         if (t1->t_ports < t2->t_ports)
1079                 return (-1);
1080
1081         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1082                 return (1);
1083         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1084                 return (-1);
1085
1086         return (0);
1087 }
1088
1089 static __inline struct ipfw_state *
1090 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1091 {
1092         struct ipfw_state *dup;
1093
1094         KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1095             ("state %p was linked", s));
1096         dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1097         if (dup == NULL) {
1098                 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1099                 s->st_flags |= IPFW_STATE_F_LINKED;
1100         }
1101         return (dup);
1102 }
1103
1104 static __inline void
1105 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1106 {
1107
1108         KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1109             ("state %p was not linked", s));
1110         RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1111         TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1112         s->st_flags &= ~IPFW_STATE_F_LINKED;
1113 }
1114
1115 static void
1116 ipfw_state_max_set(int state_max)
1117 {
1118
1119         ipfw_state_max = state_max;
1120         /* Allow 5% states over-allocation. */
1121         ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1122 }
1123
1124 static __inline int
1125 ipfw_state_cntcoll(void)
1126 {
1127         int cpu, state_cnt = 0;
1128
1129         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1130                 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1131         return (state_cnt);
1132 }
1133
1134 static __inline int
1135 ipfw_state_cntsync(void)
1136 {
1137         int state_cnt;
1138
1139         state_cnt = ipfw_state_cntcoll();
1140         ipfw_gd.ipfw_state_loosecnt = state_cnt;
1141         return (state_cnt);
1142 }
1143
1144 static __inline int
1145 ipfw_free_rule(struct ip_fw *rule)
1146 {
1147         KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1148         KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1149         rule->refcnt--;
1150         if (rule->refcnt == 0) {
1151                 if (rule->cross_rules != NULL)
1152                         kfree(rule->cross_rules, M_IPFW);
1153                 kfree(rule, M_IPFW);
1154                 return 1;
1155         }
1156         return 0;
1157 }
1158
1159 static void
1160 ipfw_unref_rule(void *priv)
1161 {
1162         ipfw_free_rule(priv);
1163 #ifdef KLD_MODULE
1164         KASSERT(ipfw_gd.ipfw_refcnt > 0,
1165             ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1166         atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1167 #endif
1168 }
1169
1170 static __inline void
1171 ipfw_ref_rule(struct ip_fw *rule)
1172 {
1173         KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1174 #ifdef KLD_MODULE
1175         atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1176 #endif
1177         rule->refcnt++;
1178 }
1179
1180 /*
1181  * This macro maps an ip pointer into a layer3 header pointer of type T
1182  */
1183 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1184
1185 static __inline int
1186 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1187 {
1188         int type = L3HDR(struct icmp,ip)->icmp_type;
1189         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1190         int idx = type / 32;
1191
1192         if (idx >= idx_max)
1193                 return (0);
1194         return (cmd->d[idx] & (1 << (type % 32)));
1195 }
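
/*
 * Editorial note: the ICMP types of an O_ICMPTYPE check are stored as a
 * bitmap in cmd->d[], so e.g. "icmptypes 0,8" (echo reply/request) is
 * expected to set bits 0 and 8 of d[0]; types past the end of the
 * instruction's bitmap simply fail to match.  The encoding is done by
 * the userland rule compiler and is not shown here.
 */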
1196
1197 static __inline int
1198 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1199 {
1200         int code = L3HDR(struct icmp,ip)->icmp_code;
1201         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1202         int idx = code / 32;
1203
1204         if (idx >= idx_max)
1205                 return (0);
1206         return (cmd->d[idx] & (1 << (code % 32)));
1207 }
1208
1209 #define TT      ((1 << ICMP_ECHO) | \
1210                  (1 << ICMP_ROUTERSOLICIT) | \
1211                  (1 << ICMP_TSTAMP) | \
1212                  (1 << ICMP_IREQ) | \
1213                  (1 << ICMP_MASKREQ))
1214
1215 static int
1216 is_icmp_query(struct ip *ip)
1217 {
1218         int type = L3HDR(struct icmp, ip)->icmp_type;
1219
1220         return (type < 32 && (TT & (1 << type)));
1221 }
1222
1223 #undef TT
1224
1225 /*
1226  * The following checks use two arrays of 8 or 16 bits to store the
1227  * bits that we want set or clear, respectively. They are in the
1228  * low and high half of cmd->arg1 or cmd->d[0].
1229  *
1230  * We scan options and store the bits we find set. We succeed if
1231  *
1232  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1233  *
1234  * The code is sometimes optimized not to store additional variables.
1235  */
1236 static int
1237 flags_match(ipfw_insn *cmd, uint8_t bits)
1238 {
1239         u_char want_clear;
1240         bits = ~bits;
1241
1242         if (((cmd->arg1 & 0xff) & bits) != 0)
1243                 return 0; /* some bits we want set were clear */
1244
1245         want_clear = (cmd->arg1 >> 8) & 0xff;
1246         if ((want_clear & bits) != want_clear)
1247                 return 0; /* some bits we want clear were set */
1248         return 1;
1249 }
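
/*
 * Worked example (editorial): for a "tcpflags syn,!ack" style check the
 * low byte of arg1 would carry TH_SYN (must be set) and the high byte
 * TH_ACK (must be clear), so a pure SYN segment matches while a SYN|ACK
 * does not.  ipopts_match() and tcpopts_match() below feed the same
 * scheme with the option bits they collect.
 */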
1250
1251 static int
1252 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1253 {
1254         int optlen, bits = 0;
1255         u_char *cp = (u_char *)(ip + 1);
1256         int x = (ip->ip_hl << 2) - sizeof(struct ip);
1257
1258         for (; x > 0; x -= optlen, cp += optlen) {
1259                 int opt = cp[IPOPT_OPTVAL];
1260
1261                 if (opt == IPOPT_EOL)
1262                         break;
1263
1264                 if (opt == IPOPT_NOP) {
1265                         optlen = 1;
1266                 } else {
1267                         optlen = cp[IPOPT_OLEN];
1268                         if (optlen <= 0 || optlen > x)
1269                                 return 0; /* invalid or truncated */
1270                 }
1271
1272                 switch (opt) {
1273                 case IPOPT_LSRR:
1274                         bits |= IP_FW_IPOPT_LSRR;
1275                         break;
1276
1277                 case IPOPT_SSRR:
1278                         bits |= IP_FW_IPOPT_SSRR;
1279                         break;
1280
1281                 case IPOPT_RR:
1282                         bits |= IP_FW_IPOPT_RR;
1283                         break;
1284
1285                 case IPOPT_TS:
1286                         bits |= IP_FW_IPOPT_TS;
1287                         break;
1288
1289                 default:
1290                         break;
1291                 }
1292         }
1293         return (flags_match(cmd, bits));
1294 }
1295
1296 static int
1297 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1298 {
1299         int optlen, bits = 0;
1300         struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1301         u_char *cp = (u_char *)(tcp + 1);
1302         int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1303
1304         for (; x > 0; x -= optlen, cp += optlen) {
1305                 int opt = cp[0];
1306
1307                 if (opt == TCPOPT_EOL)
1308                         break;
1309
1310                 if (opt == TCPOPT_NOP) {
1311                         optlen = 1;
1312                 } else {
1313                         optlen = cp[1];
1314                         if (optlen <= 0)
1315                                 break;
1316                 }
1317
1318                 switch (opt) {
1319                 case TCPOPT_MAXSEG:
1320                         bits |= IP_FW_TCPOPT_MSS;
1321                         break;
1322
1323                 case TCPOPT_WINDOW:
1324                         bits |= IP_FW_TCPOPT_WINDOW;
1325                         break;
1326
1327                 case TCPOPT_SACK_PERMITTED:
1328                 case TCPOPT_SACK:
1329                         bits |= IP_FW_TCPOPT_SACK;
1330                         break;
1331
1332                 case TCPOPT_TIMESTAMP:
1333                         bits |= IP_FW_TCPOPT_TS;
1334                         break;
1335
1336                 case TCPOPT_CC:
1337                 case TCPOPT_CCNEW:
1338                 case TCPOPT_CCECHO:
1339                         bits |= IP_FW_TCPOPT_CC;
1340                         break;
1341
1342                 default:
1343                         break;
1344                 }
1345         }
1346         return (flags_match(cmd, bits));
1347 }
1348
1349 static int
1350 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1351 {
1352         if (ifp == NULL)        /* no iface with this packet, match fails */
1353                 return 0;
1354
1355         /* Check by name or by IP address */
1356         if (cmd->name[0] != '\0') { /* match by name */
1357                 /* Check name */
1358                 if (cmd->p.glob) {
1359                         if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1360                                 return(1);
1361                 } else {
1362                         if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1363                                 return(1);
1364                 }
1365         } else {
1366                 struct ifaddr_container *ifac;
1367
1368                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1369                         struct ifaddr *ia = ifac->ifa;
1370
1371                         if (ia->ifa_addr == NULL)
1372                                 continue;
1373                         if (ia->ifa_addr->sa_family != AF_INET)
1374                                 continue;
1375                         if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1376                             (ia->ifa_addr))->sin_addr.s_addr)
1377                                 return(1);      /* match */
1378                 }
1379         }
1380         return(0);      /* no match, fail ... */
1381 }
1382
1383 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
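
/*
 * Editorial note: SNPARGS(buf, len) expands to the (pointer, size) pair
 * for appending at offset 'len' into 'buf', clamping the remaining size
 * to zero once the buffer is full, so the chained ksnprintf() calls
 * below cannot overflow it.
 */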
1384
1385 /*
1386  * We enter here when we have a rule with O_LOG.
1387  * XXX this function alone takes about 2Kbytes of code!
1388  */
1389 static void
1390 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1391     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1392 {
1393         char *action;
1394         int limit_reached = 0;
1395         char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1396
1397         fragment[0] = '\0';
1398         proto[0] = '\0';
1399
1400         if (f == NULL) {        /* bogus pkt */
1401                 if (verbose_limit != 0 &&
1402                     ctx->ipfw_norule_counter >= verbose_limit)
1403                         return;
1404                 ctx->ipfw_norule_counter++;
1405                 if (ctx->ipfw_norule_counter == verbose_limit)
1406                         limit_reached = verbose_limit;
1407                 action = "Refuse";
1408         } else {        /* O_LOG is the first action, find the real one */
1409                 ipfw_insn *cmd = ACTION_PTR(f);
1410                 ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1411
1412                 if (l->max_log != 0 && l->log_left == 0)
1413                         return;
1414                 l->log_left--;
1415                 if (l->log_left == 0)
1416                         limit_reached = l->max_log;
1417                 cmd += F_LEN(cmd);      /* point to first action */
1418                 if (cmd->opcode == O_PROB)
1419                         cmd += F_LEN(cmd);
1420
1421                 action = action2;
1422                 switch (cmd->opcode) {
1423                 case O_DENY:
1424                         action = "Deny";
1425                         break;
1426
1427                 case O_REJECT:
1428                         if (cmd->arg1 == ICMP_REJECT_RST) {
1429                                 action = "Reset";
1430                         } else if (cmd->arg1 == ICMP_UNREACH_HOST) {
1431                                 action = "Reject";
1432                         } else {
1433                                 ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1434                                           cmd->arg1);
1435                         }
1436                         break;
1437
1438                 case O_ACCEPT:
1439                         action = "Accept";
1440                         break;
1441
1442                 case O_COUNT:
1443                         action = "Count";
1444                         break;
1445
1446                 case O_DIVERT:
1447                         ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1448                         break;
1449
1450                 case O_TEE:
1451                         ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1452                         break;
1453
1454                 case O_SKIPTO:
1455                         ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1456                         break;
1457
1458                 case O_PIPE:
1459                         ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1460                         break;
1461
1462                 case O_QUEUE:
1463                         ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1464                         break;
1465
1466                 case O_FORWARD_IP:
1467                         {
1468                                 ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1469                                 int len;
1470
1471                                 len = ksnprintf(SNPARGS(action2, 0),
1472                                     "Forward to %s",
1473                                     kinet_ntoa(sa->sa.sin_addr, abuf));
1474                                 if (sa->sa.sin_port) {
1475                                         ksnprintf(SNPARGS(action2, len), ":%d",
1476                                                   sa->sa.sin_port);
1477                                 }
1478                         }
1479                         break;
1480
1481                 default:
1482                         action = "UNKNOWN";
1483                         break;
1484                 }
1485         }
1486
1487         if (hlen == 0) {        /* non-ip */
1488                 ksnprintf(SNPARGS(proto, 0), "MAC");
1489         } else {
1490                 struct ip *ip = mtod(m, struct ip *);
1491                 /* these three are all aliases to the same thing */
1492                 struct icmp *const icmp = L3HDR(struct icmp, ip);
1493                 struct tcphdr *const tcp = (struct tcphdr *)icmp;
1494                 struct udphdr *const udp = (struct udphdr *)icmp;
1495
1496                 int ip_off, offset, ip_len;
1497                 int len;
1498
1499                 if (eh != NULL) { /* layer 2 packets are as on the wire */
1500                         ip_off = ntohs(ip->ip_off);
1501                         ip_len = ntohs(ip->ip_len);
1502                 } else {
1503                         ip_off = ip->ip_off;
1504                         ip_len = ip->ip_len;
1505                 }
1506                 offset = ip_off & IP_OFFMASK;
1507                 switch (ip->ip_p) {
1508                 case IPPROTO_TCP:
1509                         len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1510                                         kinet_ntoa(ip->ip_src, abuf));
1511                         if (offset == 0) {
1512                                 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1513                                           ntohs(tcp->th_sport),
1514                                           kinet_ntoa(ip->ip_dst, abuf),
1515                                           ntohs(tcp->th_dport));
1516                         } else {
1517                                 ksnprintf(SNPARGS(proto, len), " %s",
1518                                           kinet_ntoa(ip->ip_dst, abuf));
1519                         }
1520                         break;
1521
1522                 case IPPROTO_UDP:
1523                         len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1524                                         kinet_ntoa(ip->ip_src, abuf));
1525                         if (offset == 0) {
1526                                 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1527                                           ntohs(udp->uh_sport),
1528                                           kinet_ntoa(ip->ip_dst, abuf),
1529                                           ntohs(udp->uh_dport));
1530                         } else {
1531                                 ksnprintf(SNPARGS(proto, len), " %s",
1532                                           kinet_ntoa(ip->ip_dst, abuf));
1533                         }
1534                         break;
1535
1536                 case IPPROTO_ICMP:
1537                         if (offset == 0) {
1538                                 len = ksnprintf(SNPARGS(proto, 0),
1539                                                 "ICMP:%u.%u ",
1540                                                 icmp->icmp_type,
1541                                                 icmp->icmp_code);
1542                         } else {
1543                                 len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1544                         }
1545                         len += ksnprintf(SNPARGS(proto, len), "%s",
1546                                          kinet_ntoa(ip->ip_src, abuf));
1547                         ksnprintf(SNPARGS(proto, len), " %s",
1548                                   kinet_ntoa(ip->ip_dst, abuf));
1549                         break;
1550
1551                 default:
1552                         len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1553                                         kinet_ntoa(ip->ip_src, abuf));
1554                         ksnprintf(SNPARGS(proto, len), " %s",
1555                                   kinet_ntoa(ip->ip_dst, abuf));
1556                         break;
1557                 }
1558
1559                 if (ip_off & (IP_MF | IP_OFFMASK)) {
1560                         ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1561                                   ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1562                                   offset << 3, (ip_off & IP_MF) ? "+" : "");
1563                 }
1564         }
1565
1566         if (oif || m->m_pkthdr.rcvif) {
1567                 log(LOG_SECURITY | LOG_INFO,
1568                     "ipfw: %d %s %s %s via %s%s\n",
1569                     f ? f->rulenum : -1,
1570                     action, proto, oif ? "out" : "in",
1571                     oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1572                     fragment);
1573         } else {
1574                 log(LOG_SECURITY | LOG_INFO,
1575                     "ipfw: %d %s %s [no if info]%s\n",
1576                     f ? f->rulenum : -1,
1577                     action, proto, fragment);
1578         }
1579
1580         if (limit_reached) {
1581                 log(LOG_SECURITY | LOG_NOTICE,
1582                     "ipfw: limit %d reached on entry %d\n",
1583                     limit_reached, f ? f->rulenum : -1);
1584         }
1585 }
1586
1587 #undef SNPARGS
1588
1589 static void
1590 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1591 {
1592         struct ip_fw *rule = slave_x->xlat_rule;
1593
1594         KKASSERT(rule->cpuid == mycpuid);
1595
1596         /* No more cross references; free this pair now. */
1597         kfree(x, M_IPFW);
1598         kfree(slave_x, M_IPFW);
1599
1600         /* See the comment in ipfw_ip_xlate_dispatch(). */
1601         rule->cross_refs--;
1602 }
1603
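/*
 * Per-cpu netisr handler: walk this cpu's xlat reap list and free the
 * xlat pairs whose cross references have drained; if any entries are
 * still referenced, retry later through the 2-tick callout below.
 */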
1604 static void
1605 ipfw_xlat_reap_dispatch(netmsg_t nm)
1606 {
1607         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1608         struct ipfw_state *s, *ns;
1609
1610         ASSERT_NETISR_NCPUS(mycpuid);
1611
1612         crit_enter();
1613         /* Reply ASAP. */
1614         netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1615         crit_exit();
1616
1617         /* TODO: limit scanning depth */
1618         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1619                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1620                 struct ipfw_xlat *slave_x = x->xlat_pair;
1621                 uint64_t crefs;
1622
1623                 crefs = slave_x->xlat_crefs + x->xlat_crefs;
1624                 if (crefs == 0) {
1625                         TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1626                         ipfw_xlat_reap(x, slave_x);
1627                 }
1628         }
1629         if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1630                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1631                     &ctx->ipfw_xlatreap_nm);
1632         }
1633 }
1634
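/*
 * Callout handler for the xlat reap retry above; it only bounces the
 * reap netmsg back to this cpu's netisr (when the message is free), so
 * all of the real work stays in netisr context.
 */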
1635 static void
1636 ipfw_xlat_reap_timeo(void *xnm)
1637 {
1638         struct netmsg_base *nm = xnm;
1639
1640         KKASSERT(mycpuid < netisr_ncpus);
1641
1642         crit_enter();
1643         if (nm->lmsg.ms_flags & MSGF_DONE)
1644                 netisr_sendmsg_oncpu(nm);
1645         crit_exit();
1646 }
1647
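/*
 * Runs on the cpu owning the slave xlat: unlink the slave and free the
 * pair once the combined cross-reference count of both halves drops to
 * zero; otherwise park the pair on the per-cpu reap list and poll it
 * via the 2-tick callout.
 */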
1648 static void
1649 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1650 {
1651         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1652         struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1653         struct ipfw_xlat *slave_x = x->xlat_pair;
1654         uint64_t crefs;
1655
1656         ASSERT_NETISR_NCPUS(mycpuid);
1657
1658         KKASSERT(slave_x != NULL);
1659         KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1660
1661         KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1662             ("master xlat is still linked"));
1663         if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1664                 ipfw_state_unlink(ctx, &slave_x->xlat_st);
1665
1666         /* See the comment in ipfw_ip_xlate_dispatch(). */
1667         slave_x->xlat_crefs--;
1668
1669         crefs = slave_x->xlat_crefs + x->xlat_crefs;
1670         if (crefs == 0) {
1671                 ipfw_xlat_reap(x, slave_x);
1672                 return;
1673         }
1674
1675         if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1676                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1677                     &ctx->ipfw_xlatreap_nm);
1678         }
1679
1680         /*
1681          * This pair is still referenced; defer its destruction.
1682          * YYY reuse st_link.
1683          */
1684         TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1685 }
1686
1687 static __inline void
1688 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1689 {
1690
1691         x->xlat_invalid = 1;
1692         x->xlat_pair->xlat_invalid = 1;
1693 }
1694
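/*
 * Delete a state owned by this cpu.  Plain states are freed in place;
 * xlat states are paired, and when the slave half lives on another cpu
 * the pair is only marked invalid here and handed to that cpu through
 * ipfw_xlat_free_dispatch() for the actual teardown.
 */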
1695 static void
1696 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1697 {
1698         struct ipfw_xlat *x, *slave_x;
1699         struct netmsg_base *nm;
1700
1701         KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1702             IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1703         KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1704             ("delete slave xlat"));
1705
1706         KASSERT(ctx->ipfw_state_cnt > 0,
1707             ("invalid state count %d", ctx->ipfw_state_cnt));
1708         ctx->ipfw_state_cnt--;
1709         if (ctx->ipfw_state_loosecnt > 0)
1710                 ctx->ipfw_state_loosecnt--;
1711
1712         /*
1713          * Unhook this state.
1714          */
1715         if (s->st_track != NULL) {
1716                 struct ipfw_track *t = s->st_track;
1717
1718                 KASSERT(!LIST_EMPTY(&t->t_state_list),
1719                     ("track state list is empty"));
1720                 LIST_REMOVE(s, st_trklink);
1721
1722                 KASSERT(*t->t_count > 0,
1723                     ("invalid track count %d", *t->t_count));
1724                 atomic_subtract_int(t->t_count, 1);
1725         }
1726         ipfw_state_unlink(ctx, s);
1727
1728         /*
1729          * Free this state.  Xlat requires special processing,
1730          * since xlats are paired states and the two halves may
1731          * live on different cpus.
1732          */
1733
1734         if (!IPFW_ISXLAT(s->st_type)) {
1735                 /* Not xlat; free now. */
1736                 kfree(s, M_IPFW);
1737                 /* Done! */
1738                 return;
1739         }
1740         x = (struct ipfw_xlat *)s;
1741
1742         if (x->xlat_pair == NULL) {
1743                 /* Not setup yet; free now. */
1744                 kfree(x, M_IPFW);
1745                 /* Done! */
1746                 return;
1747         }
1748         slave_x = x->xlat_pair;
1749         KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1750
1751         if (x->xlat_pcpu == mycpuid) {
1752                 /*
1753                  * Paired states are on the same cpu; delete this
1754                  * pair now.
1755                  */
1756                 KKASSERT(x->xlat_crefs == 0);
1757                 KKASSERT(slave_x->xlat_crefs == 0);
1758                 if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1759                         ipfw_state_unlink(ctx, &slave_x->xlat_st);
1760                 kfree(x, M_IPFW);
1761                 kfree(slave_x, M_IPFW);
1762                 return;
1763         }
1764
1765         /*
1766          * Free the paired states on the cpu owning the slave xlat.
1767          */
1768
1769         /* 
1770          * Mark the state pair invalid; completely deleting them
1771          * may take some time.
1772          */
1773         ipfw_xlat_invalidate(x);
1774
1775         nm = &x->xlat_freenm;
1776         netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1777             ipfw_xlat_free_dispatch);
1778         nm->lmsg.u.ms_resultp = x;
1779
1780         /* See the comment in ipfw_xlate_redispatch(). */
1781         x->xlat_rule->cross_refs++;
1782         x->xlat_crefs++;
1783
1784         netisr_sendmsg(nm, x->xlat_pcpu);
1785 }
1786
1787 static void
1788 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1789 {
1790
1791         if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1792                 KKASSERT(IPFW_ISXLAT(s->st_type));
1793                 ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1794                 ipfw_state_unlink(ctx, s);
1795                 return;
1796         }
1797         ipfw_state_del(ctx, s);
1798 }
1799
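/*
 * Aggressively reclaim dead or closed states because we are short of
 * state entries; 'reap_max' is raised to at least ipfw_state_reap_min.
 * If an expire walk is already in progress, continue it from its
 * anchor instead of starting a new one.
 */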
1800 static int
1801 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1802 {
1803         struct ipfw_state *s, *anchor;
1804         int expired;
1805
1806         if (reap_max < ipfw_state_reap_min)
1807                 reap_max = ipfw_state_reap_min;
1808
1809         if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1810                 /*
1811                  * Kick start state expiring.  Ignore the scan limit;
1812                  * we are short of states.
1813                  */
1814                 ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1815                 expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1816                 ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1817                 return (expired);
1818         }
1819
1820         /*
1821          * States are being expired.
1822          */
1823
1824         if (ctx->ipfw_state_cnt == 0)
1825                 return (0);
1826
1827         expired = 0;
1828         anchor = &ctx->ipfw_stateexp_anch;
1829         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1830                 /*
1831                  * Ignore scan limit; we are short of states.
1832                  */
1833
1834                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1835                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1836
1837                 if (IPFW_STATE_SCANSKIP(s))
1838                         continue;
1839
1840                 if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1841                         ipfw_state_del(ctx, s);
1842                         if (++expired >= reap_max)
1843                                 break;
1844                         if ((expired & 0xff) == 0 && 
1845                             ipfw_state_cntcoll() + ipfw_state_headroom <=
1846                             ipfw_state_max)
1847                                 break;
1848                 }
1849         }
1850         /*
1851          * NOTE:
1852          * Leave the anchor on the list, even if the end of the list has
1853          * been reached.  ipfw_state_expire_more_dispatch() will handle
1854          * the removal.
1855          */
1856         return (expired);
1857 }
1858
1859 static void
1860 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1861 {
1862         struct ipfw_state *s, *sn;
1863
1864         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1865                 if (IPFW_STATE_SCANSKIP(s))
1866                         continue;
1867                 if (rule != NULL && s->st_rule != rule)
1868                         continue;
1869                 ipfw_state_del(ctx, s);
1870         }
1871 }
1872
1873 static void
1874 ipfw_state_expire_done(struct ipfw_context *ctx)
1875 {
1876
1877         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1878             ("stateexp is not in progress"));
1879         ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1880         callout_reset(&ctx->ipfw_stateto_ch, hz,
1881             ipfw_state_expire_ipifunc, NULL);
1882 }
1883
1884 static void
1885 ipfw_state_expire_more(struct ipfw_context *ctx)
1886 {
1887         struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1888
1889         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1890             ("stateexp is not in progress"));
1891         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1892             ("stateexp more did not finish"));
1893         netisr_sendmsg_oncpu(nm);
1894 }
1895
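/*
 * Expire states by walking the per-cpu state list behind a movable
 * anchor: the anchor is re-inserted after each state we look at, so
 * the walk can be suspended (ipfw_state_expire_more()) and resumed
 * later without rescanning from the head of the list.
 */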
1896 static int
1897 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1898     int scan_max, int expire_max)
1899 {
1900         struct ipfw_state *s;
1901         int scanned = 0, expired = 0;
1902
1903         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1904             ("stateexp is not in progress"));
1905
1906         while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1907                 if (scanned++ >= scan_max) {
1908                         ipfw_state_expire_more(ctx);
1909                         return (expired);
1910                 }
1911
1912                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1913                 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1914
1915                 if (IPFW_STATE_SCANSKIP(s))
1916                         continue;
1917
1918                 if (IPFW_STATE_ISDEAD(s) ||
1919                     ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1920                      IPFW_STATE_TCPCLOSED(s))) {
1921                         ipfw_state_del(ctx, s);
1922                         if (++expired >= expire_max) {
1923                                 ipfw_state_expire_more(ctx);
1924                                 return (expired);
1925                         }
1926                         if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1927                             (expired & 0xff) == 0 &&
1928                             ipfw_state_cntcoll() + ipfw_state_headroom <=
1929                             ipfw_state_max) {
1930                                 ipfw_state_expire_more(ctx);
1931                                 return (expired);
1932                         }
1933                 }
1934         }
1935         TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1936         ipfw_state_expire_done(ctx);
1937         return (expired);
1938 }
1939
1940 static void
1941 ipfw_state_expire_more_dispatch(netmsg_t nm)
1942 {
1943         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1944         struct ipfw_state *anchor;
1945
1946         ASSERT_NETISR_NCPUS(mycpuid);
1947         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1948             ("stateexp is not in progress"));
1949
1950         /* Reply ASAP */
1951         netisr_replymsg(&nm->base, 0);
1952
1953         anchor = &ctx->ipfw_stateexp_anch;
1954         if (ctx->ipfw_state_cnt == 0) {
1955                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1956                 ipfw_state_expire_done(ctx);
1957                 return;
1958         }
1959         ipfw_state_expire_loop(ctx, anchor,
1960             ipfw_state_scan_max, ipfw_state_expire_max);
1961 }
1962
1963 static int
1964 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1965 {
1966         struct ipfw_state *anchor;
1967
1968         KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1969             ("stateexp is in progress"));
1970         ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1971
1972         if (ctx->ipfw_state_cnt == 0) {
1973                 ipfw_state_expire_done(ctx);
1974                 return (0);
1975         }
1976
1977         /*
1978          * Do not expire more than once per second; it is useless.
1979          */
1980         if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1981             ctx->ipfw_state_lastexp == time_uptime) {
1982                 ipfw_state_expire_done(ctx);
1983                 return (0);
1984         }
1985         ctx->ipfw_state_lastexp = time_uptime;
1986
1987         anchor = &ctx->ipfw_stateexp_anch;
1988         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1989         return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1990 }
1991
1992 static void
1993 ipfw_state_expire_dispatch(netmsg_t nm)
1994 {
1995         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1996
1997         ASSERT_NETISR_NCPUS(mycpuid);
1998
1999         /* Reply ASAP */
2000         crit_enter();
2001         netisr_replymsg(&nm->base, 0);
2002         crit_exit();
2003
2004         if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2005                 /* Running; done. */
2006                 return;
2007         }
2008         ipfw_state_expire_start(ctx,
2009             ipfw_state_scan_max, ipfw_state_expire_max);
2010 }
2011
2012 static void
2013 ipfw_state_expire_ipifunc(void *dummy __unused)
2014 {
2015         struct netmsg_base *msg;
2016
2017         KKASSERT(mycpuid < netisr_ncpus);
2018         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2019
2020         crit_enter();
2021         if (msg->lmsg.ms_flags & MSGF_DONE)
2022                 netisr_sendmsg_oncpu(msg);
2023         crit_exit();
2024 }
2025
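/*
 * Track the highest sequence and ack numbers seen in each direction
 * and report out-of-sequence updates by returning FALSE (RSTs are
 * always accepted).  Acks covering the peer's FIN are recorded so that
 * ipfw_state_update() can pick the shorter FIN lifetime.
 */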
2026 static boolean_t
2027 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2028 {
2029         uint32_t seq = ntohl(tcp->th_seq);
2030         uint32_t ack = ntohl(tcp->th_ack);
2031
2032         if (tcp->th_flags & TH_RST)
2033                 return (TRUE);
2034
2035         if (dir == MATCH_FORWARD) {
2036                 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2037                         s->st_flags |= IPFW_STATE_F_SEQFWD;
2038                         s->st_seq_fwd = seq;
2039                 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2040                         s->st_seq_fwd = seq;
2041                 } else {
2042                         /* Out-of-sequence; done. */
2043                         return (FALSE);
2044                 }
2045                 if (tcp->th_flags & TH_ACK) {
2046                         if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2047                                 s->st_flags |= IPFW_STATE_F_ACKFWD;
2048                                 s->st_ack_fwd = ack;
2049                         } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2050                                 s->st_ack_fwd = ack;
2051                         } else {
2052                                 /* Out-of-sequence; done. */
2053                                 return (FALSE);
2054                         }
2055
2056                         if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2057                             (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2058                                 s->st_state |= (TH_ACK << 8);
2059                 }
2060         } else {
2061                 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2062                         s->st_flags |= IPFW_STATE_F_SEQREV;
2063                         s->st_seq_rev = seq;
2064                 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2065                         s->st_seq_rev = seq;
2066                 } else {
2067                         /* Out-of-sequence; done. */
2068                         return (FALSE);
2069                 }
2070                 if (tcp->th_flags & TH_ACK) {
2071                         if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2072                                 s->st_flags |= IPFW_STATE_F_ACKREV;
2073                                 s->st_ack_rev = ack;
2074                         } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2075                                 s->st_ack_rev = ack;
2076                         } else {
2077                                 /* Out-of-sequence; done. */
2078                                 return (FALSE);
2079                         }
2080
2081                         if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2082                             s->st_ack_rev == s->st_seq_fwd + 1)
2083                                 s->st_state |= TH_ACK;
2084                 }
2085         }
2086         return (TRUE);
2087 }
2088
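/*
 * Refresh the state's expiry according to the protocol: TCP states
 * move through a small SYN/established/FIN/RST lifetime table driven
 * by the flags accumulated in st_state, while UDP and other protocols
 * simply get dyn_udp_lifetime or dyn_short_lifetime.
 */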
2089 static void
2090 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2091     const struct tcphdr *tcp, struct ipfw_state *s)
2092 {
2093
2094         if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2095                 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2096
2097                 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2098                         return;
2099
2100                 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2101                 switch (s->st_state & IPFW_STATE_TCPSTATES) {
2102                 case TH_SYN:                            /* opening */
2103                         s->st_expire = time_uptime + dyn_syn_lifetime;
2104                         break;
2105
2106                 case BOTH_SYN:                  /* move to established */
2107                 case BOTH_SYN | TH_FIN:         /* one side tries to close */
2108                 case BOTH_SYN | (TH_FIN << 8):
2109                         s->st_expire = time_uptime + dyn_ack_lifetime;
2110                         break;
2111
2112                 case BOTH_SYN | BOTH_FIN:       /* both sides closed */
2113                         if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2114                                 /* And both FINs were ACKed. */
2115                                 s->st_expire = time_uptime + dyn_fin_lifetime;
2116                         } else {
2117                                 s->st_expire = time_uptime +
2118                                     dyn_finwait_lifetime;
2119                         }
2120                         break;
2121
2122                 default:
2123 #if 0
2124                         /*
2125                          * reset or some invalid combination, but can also
2126                          * occur if we use keep-state the wrong way.
2127                          */
2128                         if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2129                                 kprintf("invalid state: 0x%x\n", s->st_state);
2130 #endif
2131                         s->st_expire = time_uptime + dyn_rst_lifetime;
2132                         break;
2133                 }
2134         } else if (pkt->proto == IPPROTO_UDP) {
2135                 s->st_expire = time_uptime + dyn_udp_lifetime;
2136         } else {
2137                 /* other protocols */
2138                 s->st_expire = time_uptime + dyn_short_lifetime;
2139         }
2140 }
2141
2142 /*
2143  * Look up a state by the packet's flow id and return the match direction.
2144  */
2145 static struct ipfw_state *
2146 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2147     int *match_direction, const struct tcphdr *tcp)
2148 {
2149         struct ipfw_state *key, *s;
2150         int dir = MATCH_NONE;
2151
2152         key = &ctx->ipfw_state_tmpkey;
2153         ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2154             pkt->dst_ip, pkt->dst_port, pkt->proto);
2155         s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2156         if (s == NULL)
2157                 goto done; /* not found. */
2158         if (IPFW_STATE_ISDEAD(s)) {
2159                 ipfw_state_remove(ctx, s);
2160                 s = NULL;
2161                 goto done;
2162         }
2163         if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2164                 /* TCP port recycling is too fast. */
2165                 ctx->ipfw_sts_tcprecycled++;
2166                 ipfw_state_remove(ctx, s);
2167                 s = NULL;
2168                 goto done;
2169         }
2170
2171         if (s->st_swap == key->st_swap) {
2172                 dir = MATCH_FORWARD;
2173         } else {
2174                 KASSERT((s->st_swap & key->st_swap) == 0,
2175                     ("found mismatch state"));
2176                 dir = MATCH_REVERSE;
2177         }
2178
2179         /* Update this state. */
2180         ipfw_state_update(pkt, dir, tcp, s);
2181
2182         if (s->st_track != NULL) {
2183                 /* This track has been used. */
2184                 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2185         }
2186 done:
2187         if (match_direction)
2188                 *match_direction = dir;
2189         return (s);
2190 }
2191
2192 static struct ipfw_state *
2193 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2194     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2195 {
2196         struct ipfw_state *s;
2197         size_t sz;
2198
2199         KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2200             ("invalid state type %u", type));
2201
2202         sz = sizeof(struct ipfw_state);
2203         if (IPFW_ISXLAT(type))
2204                 sz = sizeof(struct ipfw_xlat);
2205
2206         s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2207         if (s == NULL) {
2208                 ctx->ipfw_sts_nomem++;
2209                 return (NULL);
2210         }
2211
2212         ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2213             id->dst_ip, id->dst_port, id->proto);
2214
2215         s->st_rule = rule;
2216         s->st_type = type;
2217         if (IPFW_ISXLAT(type)) {
2218                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2219
2220                 x->xlat_dir = MATCH_NONE;
2221                 x->xlat_pcpu = -1;
2222         }
2223
2224         /*
2225          * Update this state:
2226          * Set st_expire and st_state.
2227          */
2228         ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2229
2230         return (s);
2231 }
2232
2233 static struct ipfw_state *
2234 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2235     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2236     const struct tcphdr *tcp)
2237 {
2238         struct ipfw_state *s, *dup;
2239
2240         s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2241         if (s == NULL)
2242                 return (NULL);
2243
2244         ctx->ipfw_state_cnt++;
2245         ctx->ipfw_state_loosecnt++;
2246         if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2247                 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2248                 ctx->ipfw_state_loosecnt = 0;
2249         }
2250
2251         dup = ipfw_state_link(ctx, s);
2252         if (dup != NULL)
2253                 panic("ipfw: %u state exists %p", type, dup);
2254
2255         if (t != NULL) {
2256                 /* Keep the track referenced. */
2257                 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2258                 s->st_track = t;
2259         }
2260         return (s);
2261 }
2262
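/*
 * Free a per-cpu track that no longer has states attached.  The shared
 * trkcnt is reference counted across cpus (fdrop() style, see below);
 * dropping the last reference also removes it from the global trkcnt
 * tree.
 */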
2263 static boolean_t
2264 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2265 {
2266         struct ipfw_trkcnt *trk;
2267         boolean_t trk_freed = FALSE;
2268
2269         KASSERT(t->t_count != NULL, ("track anchor"));
2270         KASSERT(LIST_EMPTY(&t->t_state_list),
2271             ("invalid track is still referenced"));
2272
2273         trk = t->t_trkcnt;
2274         KASSERT(trk != NULL, ("track has no trkcnt"));
2275
2276         RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2277         TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2278         kfree(t, M_IPFW);
2279
2280         /*
2281          * fdrop() style reference counting.
2282          * See kern/kern_descrip.c fdrop().
2283          */
2284         for (;;) {
2285                 int refs = trk->tc_refs;
2286
2287                 cpu_ccfence();
2288                 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2289                 if (refs == 1) {
2290                         IPFW_TRKCNT_TOKGET;
2291                         if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2292                                 KASSERT(trk->tc_count == 0,
2293                                     ("%d states reference this trkcnt",
2294                                      trk->tc_count));
2295                                 RB_REMOVE(ipfw_trkcnt_tree,
2296                                     &ipfw_gd.ipfw_trkcnt_tree, trk);
2297
2298                                 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2299                                     ("invalid trkcnt cnt %d",
2300                                      ipfw_gd.ipfw_trkcnt_cnt));
2301                                 ipfw_gd.ipfw_trkcnt_cnt--;
2302                                 IPFW_TRKCNT_TOKREL;
2303
2304                                 if (ctx->ipfw_trkcnt_spare == NULL)
2305                                         ctx->ipfw_trkcnt_spare = trk;
2306                                 else
2307                                         kfree(trk, M_IPFW);
2308                                 trk_freed = TRUE;
2309                                 break; /* done! */
2310                         }
2311                         IPFW_TRKCNT_TOKREL;
2312                         /* retry */
2313                 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2314                         break; /* done! */
2315                 }
2316                 /* retry */
2317         }
2318         return (trk_freed);
2319 }
2320
2321 static void
2322 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2323 {
2324         struct ipfw_track *t, *tn;
2325
2326         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2327                 if (t->t_count == NULL) /* anchor */
2328                         continue;
2329                 if (rule != NULL && t->t_rule != rule)
2330                         continue;
2331                 ipfw_track_free(ctx, t);
2332         }
2333 }
2334
2335 static boolean_t
2336 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2337     boolean_t reap)
2338 {
2339         struct ipfw_state *s, *sn;
2340         boolean_t ret = FALSE;
2341
2342         KASSERT(t->t_count != NULL, ("track anchor"));
2343
2344         if (LIST_EMPTY(&t->t_state_list))
2345                 return (FALSE);
2346
2347         /*
2348          * Do not expire more than once per second; it is useless.
2349          */
2350         if (t->t_lastexp == time_uptime)
2351                 return (FALSE);
2352         t->t_lastexp = time_uptime;
2353
2354         LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2355                 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2356                         KASSERT(s->st_track == t,
2357                             ("state track %p does not match %p",
2358                              s->st_track, t));
2359                         ipfw_state_del(ctx, s);
2360                         ret = TRUE;
2361                 }
2362         }
2363         return (ret);
2364 }
2365
2366 static __inline struct ipfw_trkcnt *
2367 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2368 {
2369         struct ipfw_trkcnt *trk;
2370
2371         if (ctx->ipfw_trkcnt_spare != NULL) {
2372                 trk = ctx->ipfw_trkcnt_spare;
2373                 ctx->ipfw_trkcnt_spare = NULL;
2374         } else {
2375                 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2376                     M_INTWAIT | M_NULLOK);
2377         }
2378         return (trk);
2379 }
2380
2381 static void
2382 ipfw_track_expire_done(struct ipfw_context *ctx)
2383 {
2384
2385         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2386             ("trackexp is not in progress"));
2387         ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2388         callout_reset(&ctx->ipfw_trackto_ch, hz,
2389             ipfw_track_expire_ipifunc, NULL);
2390 }
2391
2392 static void
2393 ipfw_track_expire_more(struct ipfw_context *ctx)
2394 {
2395         struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2396
2397         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2398             ("trackexp is not in progress"));
2399         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2400             ("trackexp more did not finish"));
2401         netisr_sendmsg_oncpu(nm);
2402 }
2403
2404 static int
2405 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2406     int scan_max, int expire_max)
2407 {
2408         struct ipfw_track *t;
2409         int scanned = 0, expired = 0;
2410         boolean_t reap = FALSE;
2411
2412         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2413             ("trackexp is not in progress"));
2414
2415         if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2416                 reap = TRUE;
2417
2418         while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2419                 if (scanned++ >= scan_max) {
2420                         ipfw_track_expire_more(ctx);
2421                         return (expired);
2422                 }
2423
2424                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2425                 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2426
2427                 if (t->t_count == NULL) /* anchor */
2428                         continue;
2429
2430                 ipfw_track_state_expire(ctx, t, reap);
2431                 if (!LIST_EMPTY(&t->t_state_list)) {
2432                         /* There are states referencing this track. */
2433                         continue;
2434                 }
2435
2436                 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2437                         /* Expired. */
2438                         if (ipfw_track_free(ctx, t)) {
2439                                 if (++expired >= expire_max) {
2440                                         ipfw_track_expire_more(ctx);
2441                                         return (expired);
2442                                 }
2443                         }
2444                 }
2445         }
2446         TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2447         ipfw_track_expire_done(ctx);
2448         return (expired);
2449 }
2450
2451 static int
2452 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2453 {
2454         struct ipfw_track *anchor;
2455
2456         KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2457             ("trackexp is in progress"));
2458         ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2459
2460         if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2461                 ipfw_track_expire_done(ctx);
2462                 return (0);
2463         }
2464
2465         /*
2466          * Do not expire more than once per second; it is useless.
2467          */
2468         if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2469             ctx->ipfw_track_lastexp == time_uptime) {
2470                 ipfw_track_expire_done(ctx);
2471                 return (0);
2472         }
2473         ctx->ipfw_track_lastexp = time_uptime;
2474
2475         anchor = &ctx->ipfw_trackexp_anch;
2476         TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2477         return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2478 }
2479
2480 static void
2481 ipfw_track_expire_more_dispatch(netmsg_t nm)
2482 {
2483         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2484         struct ipfw_track *anchor;
2485
2486         ASSERT_NETISR_NCPUS(mycpuid);
2487         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2488             ("trackexp is not in progress"));
2489
2490         /* Reply ASAP */
2491         netisr_replymsg(&nm->base, 0);
2492
2493         anchor = &ctx->ipfw_trackexp_anch;
2494         if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2495                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2496                 ipfw_track_expire_done(ctx);
2497                 return;
2498         }
2499         ipfw_track_expire_loop(ctx, anchor,
2500             ipfw_track_scan_max, ipfw_track_expire_max);
2501 }
2502
2503 static void
2504 ipfw_track_expire_dispatch(netmsg_t nm)
2505 {
2506         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2507
2508         ASSERT_NETISR_NCPUS(mycpuid);
2509
2510         /* Reply ASAP */
2511         crit_enter();
2512         netisr_replymsg(&nm->base, 0);
2513         crit_exit();
2514
2515         if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2516                 /* Running; done. */
2517                 return;
2518         }
2519         ipfw_track_expire_start(ctx,
2520             ipfw_track_scan_max, ipfw_track_expire_max);
2521 }
2522
2523 static void
2524 ipfw_track_expire_ipifunc(void *dummy __unused)
2525 {
2526         struct netmsg_base *msg;
2527
2528         KKASSERT(mycpuid < netisr_ncpus);
2529         msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2530
2531         crit_enter();
2532         if (msg->lmsg.ms_flags & MSGF_DONE)
2533                 netisr_sendmsg_oncpu(msg);
2534         crit_exit();
2535 }
2536
2537 static int
2538 ipfw_track_reap(struct ipfw_context *ctx)
2539 {
2540         struct ipfw_track *t, *anchor;
2541         int expired;
2542
2543         if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2544                 /*
2545                  * Kick start track expiring.  Ignore the scan limit;
2546                  * we are short of tracks.
2547                  */
2548                 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2549                 expired = ipfw_track_expire_start(ctx, INT_MAX,
2550                     ipfw_track_reap_max);
2551                 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2552                 return (expired);
2553         }
2554
2555         /*
2556          * Tracks are being expired.
2557          */
2558
2559         if (RB_EMPTY(&ctx->ipfw_track_tree))
2560                 return (0);
2561
2562         expired = 0;
2563         anchor = &ctx->ipfw_trackexp_anch;
2564         while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2565                 /*
2566                  * Ignore scan limit; we are short of tracks.
2567                  */
2568
2569                 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2570                 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2571
2572                 if (t->t_count == NULL) /* anchor */
2573                         continue;
2574
2575                 ipfw_track_state_expire(ctx, t, TRUE);
2576                 if (!LIST_EMPTY(&t->t_state_list)) {
2577                         /* There are states referencing this track. */
2578                         continue;
2579                 }
2580
2581                 if (ipfw_track_free(ctx, t)) {
2582                         if (++expired >= ipfw_track_reap_max) {
2583                                 ipfw_track_expire_more(ctx);
2584                                 break;
2585                         }
2586                 }
2587         }
2588         /*
2589          * NOTE:
2590          * Leave the anchor on the list, even if the end of the list has
2591          * been reached.  ipfw_track_expire_more_dispatch() will handle
2592          * the removal.
2593          */
2594         return (expired);
2595 }
2596
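/*
 * Find or create the per-cpu track used by an O_LIMIT rule.  The track
 * key is built from the fields selected by limit_mask, and the shared
 * trkcnt (one per rule/key across all cpus) carries the connection
 * count that the limit is enforced against.
 */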
2597 static struct ipfw_track *
2598 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2599     uint16_t limit_mask, struct ip_fw *rule)
2600 {
2601         struct ipfw_track *key, *t, *dup;
2602         struct ipfw_trkcnt *trk, *ret;
2603         boolean_t do_expire = FALSE;
2604
2605         KASSERT(rule->track_ruleid != 0,
2606             ("rule %u has no track ruleid", rule->rulenum));
2607
2608         key = &ctx->ipfw_track_tmpkey;
2609         key->t_proto = id->proto;
2610         key->t_addrs = 0;
2611         key->t_ports = 0;
2612         key->t_rule = rule;
2613         if (limit_mask & DYN_SRC_ADDR)
2614                 key->t_saddr = id->src_ip;
2615         if (limit_mask & DYN_DST_ADDR)
2616                 key->t_daddr = id->dst_ip;
2617         if (limit_mask & DYN_SRC_PORT)
2618                 key->t_sport = id->src_port;
2619         if (limit_mask & DYN_DST_PORT)
2620                 key->t_dport = id->dst_port;
2621
2622         t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2623         if (t != NULL)
2624                 goto done;
2625
2626         t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2627         if (t == NULL) {
2628                 ctx->ipfw_tks_nomem++;
2629                 return (NULL);
2630         }
2631
2632         t->t_key = key->t_key;
2633         t->t_rule = rule;
2634         t->t_lastexp = 0;
2635         LIST_INIT(&t->t_state_list);
2636
2637         if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2638                 time_t globexp, uptime;
2639
2640                 trk = NULL;
2641                 do_expire = TRUE;
2642
2643                 /*
2644                  * Do not expire globally more than once per second;
2645                  * it is useless.
2646                  */
2647                 uptime = time_uptime;
2648                 globexp = ipfw_gd.ipfw_track_globexp;
2649                 if (globexp != uptime &&
2650                     atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2651                     globexp, uptime)) {
2652                         int cpu;
2653
2654                         /* Expire tracks on other CPUs. */
2655                         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2656                                 if (cpu == mycpuid)
2657                                         continue;
2658                                 lwkt_send_ipiq(globaldata_find(cpu),
2659                                     ipfw_track_expire_ipifunc, NULL);
2660                         }
2661                 }
2662         } else {
2663                 trk = ipfw_trkcnt_alloc(ctx);
2664         }
2665         if (trk == NULL) {
2666                 struct ipfw_trkcnt *tkey;
2667
2668                 tkey = &ctx->ipfw_trkcnt_tmpkey;
2669                 key = NULL; /* tkey overlaps key */
2670
2671                 tkey->tc_key = t->t_key;
2672                 tkey->tc_ruleid = rule->track_ruleid;
2673
2674                 IPFW_TRKCNT_TOKGET;
2675                 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2676                     tkey);
2677                 if (trk == NULL) {
2678                         IPFW_TRKCNT_TOKREL;
2679                         if (do_expire) {
2680                                 ctx->ipfw_tks_reap++;
2681                                 if (ipfw_track_reap(ctx) > 0) {
2682                                         if (ipfw_gd.ipfw_trkcnt_cnt <
2683                                             ipfw_track_max) {
2684                                                 trk = ipfw_trkcnt_alloc(ctx);
2685                                                 if (trk != NULL)
2686                                                         goto install;
2687                                                 ctx->ipfw_tks_cntnomem++;
2688                                         } else {
2689                                                 ctx->ipfw_tks_overflow++;
2690                                         }
2691                                 } else {
2692                                         ctx->ipfw_tks_reapfailed++;
2693                                         ctx->ipfw_tks_overflow++;
2694                                 }
2695                         } else {
2696                                 ctx->ipfw_tks_cntnomem++;
2697                         }
2698                         kfree(t, M_IPFW);
2699                         return (NULL);
2700                 }
2701                 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2702                     ("invalid trkcnt refs %d", trk->tc_refs));
2703                 atomic_add_int(&trk->tc_refs, 1);
2704                 IPFW_TRKCNT_TOKREL;
2705         } else {
2706 install:
2707                 trk->tc_key = t->t_key;
2708                 trk->tc_ruleid = rule->track_ruleid;
2709                 trk->tc_refs = 0;
2710                 trk->tc_count = 0;
2711                 trk->tc_expire = 0;
2712                 trk->tc_rulenum = rule->rulenum;
2713
2714                 IPFW_TRKCNT_TOKGET;
2715                 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2716                     trk);
2717                 if (ret != NULL) {
2718                         KASSERT(ret->tc_refs > 0 &&
2719                             ret->tc_refs < netisr_ncpus,
2720                             ("invalid trkcnt refs %d", ret->tc_refs));
2721                         KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2722                             ("trkcnt spare was installed"));
2723                         ctx->ipfw_trkcnt_spare = trk;
2724                         trk = ret;
2725                 } else {
2726                         ipfw_gd.ipfw_trkcnt_cnt++;
2727                 }
2728                 atomic_add_int(&trk->tc_refs, 1);
2729                 IPFW_TRKCNT_TOKREL;
2730         }
2731         t->t_count = &trk->tc_count;
2732         t->t_trkcnt = trk;
2733
2734         dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2735         if (dup != NULL)
2736                 panic("ipfw: track exists");
2737         TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2738 done:
2739         t->t_expire = time_uptime + dyn_short_lifetime;
2740         return (t);
2741 }
2742
2743 /*
2744  * Install a state for the rule type in cmd->o.opcode.
2745  *
2746  * Returns NULL if the state is not installed because of errors or
2747  * because state limitations are enforced.
2748  */
2749 static struct ipfw_state *
2750 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2751     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2752 {
2753         struct ipfw_state *s;
2754         struct ipfw_track *t;
2755         int count, diff;
2756
2757         if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2758             (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2759                 boolean_t overflow = TRUE;
2760
2761                 ctx->ipfw_sts_reap++;
2762                 if (ipfw_state_reap(ctx, diff) == 0)
2763                         ctx->ipfw_sts_reapfailed++;
2764                 if (ipfw_state_cntsync() < ipfw_state_max)
2765                         overflow = FALSE;
2766
2767                 if (overflow) {
2768                         time_t globexp, uptime;
2769                         int cpu;
2770
2771                         /*
2772                          * Do not expire globally more than once per
2773                          * second; it is useless.
2774                          */
2775                         uptime = time_uptime;
2776                         globexp = ipfw_gd.ipfw_state_globexp;
2777                         if (globexp == uptime ||
2778                             !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2779                             globexp, uptime)) {
2780                                 ctx->ipfw_sts_overflow++;
2781                                 return (NULL);
2782                         }
2783
2784                         /* Expire states on other CPUs. */
2785                         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2786                                 if (cpu == mycpuid)
2787                                         continue;
2788                                 lwkt_send_ipiq(globaldata_find(cpu),
2789                                     ipfw_state_expire_ipifunc, NULL);
2790                         }
2791                         ctx->ipfw_sts_overflow++;
2792                         return (NULL);
2793                 }
2794         }
2795
2796         switch (cmd->o.opcode) {
2797         case O_KEEP_STATE: /* bidir rule */
2798         case O_REDIRECT:
2799                 s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2800                     tcp);
2801                 if (s == NULL)
2802                         return (NULL);
2803                 break;
2804
2805         case O_LIMIT: /* limit number of sessions */
2806                 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2807                 if (t == NULL)
2808                         return (NULL);
2809
2810                 if (*t->t_count >= cmd->conn_limit) {
2811                         if (!ipfw_track_state_expire(ctx, t, TRUE))
2812                                 return (NULL);
2813                 }
2814                 for (;;) {
2815                         count = *t->t_count;
2816                         if (count >= cmd->conn_limit)
2817                                 return (NULL);
2818                         if (atomic_cmpset_int(t->t_count, count, count + 1))
2819                                 break;
2820                 }
2821
2822                 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2823                 if (s == NULL) {
2824                         /* Undo damage. */
2825                         atomic_subtract_int(t->t_count, 1);
2826                         return (NULL);
2827                 }
2828                 break;
2829
2830         default:
2831                 panic("unknown state type %u\n", cmd->o.opcode);
2832         }
2833
2834         if (s->st_type == O_REDIRECT) {
2835                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2836                 ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2837
2838                 x->xlat_addr = r->addr.s_addr;
2839                 x->xlat_port = r->port;
2840                 x->xlat_ifp = args->m->m_pkthdr.rcvif;
2841                 x->xlat_dir = MATCH_FORWARD;
2842                 KKASSERT(x->xlat_ifp != NULL);
2843         }
2844         return (s);
2845 }
2846
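/*
 * Look up 'in' in the radix tree backing table 'tableid'; return 1 and
 * update the entry's use counters on a match, 0 otherwise.  For example
 * (hypothetical table contents), a table holding 10.0.0.0/24 would
 * match a lookup of 10.0.0.1.
 */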
2847 static int
2848 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2849     const struct in_addr *in)
2850 {
2851         struct radix_node_head *rnh;
2852         struct sockaddr_in sin;
2853         struct ipfw_tblent *te;
2854
2855         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2856         rnh = ctx->ipfw_tables[tableid];
2857         if (rnh == NULL)
2858                 return (0); /* no match */
2859
2860         memset(&sin, 0, sizeof(sin));
2861         sin.sin_family = AF_INET;
2862         sin.sin_len = sizeof(sin);
2863         sin.sin_addr = *in;
2864
2865         te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2866         if (te == NULL)
2867                 return (0); /* no match */
2868
2869         te->te_use++;
2870         te->te_lastuse = time_second;
2871         return (1); /* match */
2872 }
2873
2874 /*
2875  * Transmit a TCP packet, containing either a RST or a keepalive.
2876  * When flags & TH_RST, we are sending a RST packet because a
2877  * "reset" action matched the packet.  Otherwise we are sending a
2878  * keepalive and flags & TH_SYN selects the direction (see below).
2879  *
2880  * Only {src,dst}_{ip,port} of "id" are used.
2881  */
2882 static void
2883 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2884 {
2885         struct mbuf *m;
2886         struct ip *ip;
2887         struct tcphdr *tcp;
2888         struct route sro;       /* fake route */
2889
2890         MGETHDR(m, M_NOWAIT, MT_HEADER);
2891         if (m == NULL)
2892                 return;
2893         m->m_pkthdr.rcvif = NULL;
2894         m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2895         m->m_data += max_linkhdr;
2896
2897         ip = mtod(m, struct ip *);
2898         bzero(ip, m->m_len);
2899         tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2900         ip->ip_p = IPPROTO_TCP;
2901         tcp->th_off = 5;
2902
2903         /*
2904          * Assume we are sending a RST (or a keepalive in the reverse
2905          * direction), so swap the source and destination addresses and ports.
2906          */
2907         ip->ip_src.s_addr = htonl(id->dst_ip);
2908         ip->ip_dst.s_addr = htonl(id->src_ip);
2909         tcp->th_sport = htons(id->dst_port);
2910         tcp->th_dport = htons(id->src_port);
2911         if (flags & TH_RST) {   /* we are sending a RST */
2912                 if (flags & TH_ACK) {
2913                         tcp->th_seq = htonl(ack);
2914                         tcp->th_ack = htonl(0);
2915                         tcp->th_flags = TH_RST;
2916                 } else {
2917                         if (flags & TH_SYN)
2918                                 seq++;
2919                         tcp->th_seq = htonl(0);
2920                         tcp->th_ack = htonl(seq);
2921                         tcp->th_flags = TH_RST | TH_ACK;
2922                 }
2923         } else {
2924                 /*
2925                  * We are sending a keepalive. flags & TH_SYN determines
2926                  * the direction, forward if set, reverse if clear.
2927                  * NOTE: seq and ack are always assumed to be correct
2928                  * as set by the caller. This may be confusing...
2929                  */
2930                 if (flags & TH_SYN) {
2931                         /*
2932                          * Forward direction: undo the swap done above.
2933                          */
2934                         ip->ip_dst.s_addr = htonl(id->dst_ip);
2935                         ip->ip_src.s_addr = htonl(id->src_ip);
2936                         tcp->th_dport = htons(id->dst_port);
2937                         tcp->th_sport = htons(id->src_port);
2938                 }
2939                 tcp->th_seq = htonl(seq);
2940                 tcp->th_ack = htonl(ack);
2941                 tcp->th_flags = TH_ACK;
2942         }
2943
2944         /*
2945          * set ip_len to the payload size so we can compute
2946          * the tcp checksum on the pseudoheader
2947          * XXX check this, could save a couple of words ?
2948          */
2949         ip->ip_len = htons(sizeof(struct tcphdr));
2950         tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2951
2952         /*
2953          * now fill fields left out earlier
2954          */
2955         ip->ip_ttl = ip_defttl;
2956         ip->ip_len = m->m_pkthdr.len;
2957
2958         bzero(&sro, sizeof(sro));
2959         ip_rtaddr(ip->ip_dst, &sro);
2960
2961         m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2962         ip_output(m, NULL, &sro, 0, NULL, NULL);
2963         if (sro.ro_rt)
2964                 RTFREE(sro.ro_rt);
2965 }
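
/*
 * Illustrative sketch (not compiled in): the two ways send_pkt() above is
 * meant to be driven.  A "reset" answers an offending segment with TH_RST
 * set (send_reject() below does exactly this); a keepalive probe passes no
 * TH_RST and uses TH_SYN to pick the forward direction, with seq/ack
 * already correct for that direction as noted in the comment above.  The
 * helper name is hypothetical.
 */
#if 0
static void
example_keepalive_probe(const struct ipfw_flow_id *id, uint32_t seq,
    uint32_t ack, int forward)
{
	/* seq/ack must already be correct for the chosen direction. */
	send_pkt(id, seq, ack, forward ? TH_SYN : 0);
}
#endif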
2966
2967 /*
2968  * Send a reject message, consuming the mbuf passed as an argument.
2969  */
2970 static void
2971 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2972 {
2973         if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2974                 /* We need the IP header in host order for icmp_error(). */
2975                 if (args->eh != NULL) {
2976                         struct ip *ip = mtod(args->m, struct ip *);
2977
2978                         ip->ip_len = ntohs(ip->ip_len);
2979                         ip->ip_off = ntohs(ip->ip_off);
2980                 }
2981                 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2982         } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2983                 struct tcphdr *const tcp =
2984                     L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2985
2986                 if ((tcp->th_flags & TH_RST) == 0) {
2987                         send_pkt(&args->f_id, ntohl(tcp->th_seq),
2988                                  ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2989                 }
2990                 m_freem(args->m);
2991         } else {
2992                 m_freem(args->m);
2993         }
2994         args->m = NULL;
2995 }
2996
2997 /*
2998  * Given an ip_fw *, lookup_next_rule will return a pointer
2999  * to the next rule, which can be either the jump
3000  * target (for skipto instructions) or the next one in the list (in
3001  * all other cases including a missing jump target).
3002  * The result is also written in the "next_rule" field of the rule.
3003  * Backward jumps are not allowed, so start looking from the next
3004  * rule...
3005  *
3006  * This never returns NULL -- in case we do not have an exact match,
3007  * the next rule is returned. When the ruleset is changed,
3008  * pointers are flushed so we are always correct.
3009  */
3010 static struct ip_fw *
3011 lookup_next_rule(struct ip_fw *me)
3012 {
3013         struct ip_fw *rule = NULL;
3014         ipfw_insn *cmd;
3015
3016         /* look for action, in case it is a skipto */
3017         cmd = ACTION_PTR(me);
3018         if (cmd->opcode == O_LOG)
3019                 cmd += F_LEN(cmd);
3020         if (cmd->opcode == O_SKIPTO) {
3021                 for (rule = me->next; rule; rule = rule->next) {
3022                         if (rule->rulenum >= cmd->arg1)
3023                                 break;
3024                 }
3025         }
3026         if (rule == NULL)                       /* failure or not a skipto */
3027                 rule = me->next;
3028         me->next_rule = rule;
3029         return rule;
3030 }
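
/*
 * Illustrative sketch (not compiled in): the intended use of the next_rule
 * cache filled in by lookup_next_rule().  This mirrors what ipfw_chk() does
 * further below when it resumes processing for a tagged packet; the helper
 * name is hypothetical.
 */
#if 0
static struct ip_fw *
example_resume_rule(struct ip_fw *rule)
{
	struct ip_fw *next = rule->next_rule;

	if (next == NULL)	/* not resolved yet, or flushed */
		next = lookup_next_rule(rule);	/* resolve and cache */
	return (next);
}
#endif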
3031
3032 static int
3033 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3034                 enum ipfw_opcodes opcode, uid_t uid)
3035 {
3036         struct in_addr src_ip, dst_ip;
3037         struct inpcbinfo *pi;
3038         boolean_t wildcard;
3039         struct inpcb *pcb;
3040
3041         if (fid->proto == IPPROTO_TCP) {
3042                 wildcard = FALSE;
3043                 pi = &tcbinfo[mycpuid];
3044         } else if (fid->proto == IPPROTO_UDP) {
3045                 wildcard = TRUE;
3046                 pi = &udbinfo[mycpuid];
3047         } else {
3048                 return 0;
3049         }
3050
3051         /*
3052          * Values in 'fid' are in host byte order
3053          */
3054         dst_ip.s_addr = htonl(fid->dst_ip);
3055         src_ip.s_addr = htonl(fid->src_ip);
3056         if (oif) {
3057                 pcb = in_pcblookup_hash(pi,
3058                         dst_ip, htons(fid->dst_port),
3059                         src_ip, htons(fid->src_port),
3060                         wildcard, oif);
3061         } else {
3062                 pcb = in_pcblookup_hash(pi,
3063                         src_ip, htons(fid->src_port),
3064                         dst_ip, htons(fid->dst_port),
3065                         wildcard, NULL);
3066         }
3067         if (pcb == NULL || pcb->inp_socket == NULL)
3068                 return 0;
3069
3070         if (opcode == O_UID) {
3071 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3072                 return !socheckuid(pcb->inp_socket, uid);
3073 #undef socheckuid
3074         } else  {
3075                 return groupmember(uid, pcb->inp_socket->so_cred);
3076         }
3077 }
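
/*
 * Note (descriptive only): ipfw_match_uid() resolves the flow to its local
 * inpcb on the current cpu and compares the owning socket's credentials.
 * TCP lookups require an exact 4-tuple match (wildcard FALSE); UDP lookups
 * allow wildcard (unbound/unconnected) sockets.  For outgoing packets the
 * local endpoint is the source address/port, for incoming packets it is
 * the destination, hence the swapped lookup arguments above.
 */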
3078
3079 static int
3080 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3081 {
3082
3083         if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3084                 struct ifaddr_container *ifac;
3085                 struct ifnet *ifp;
3086
3087                 ifp = ifunit_netisr(cmd->ifname);
3088                 if (ifp == NULL)
3089                         return (0);
3090
3091                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3092                         struct ifaddr *ia = ifac->ifa;
3093
3094                         if (ia->ifa_addr == NULL)
3095                                 continue;
3096                         if (ia->ifa_addr->sa_family != AF_INET)
3097                                 continue;
3098
3099                         cmd->mask.s_addr = INADDR_ANY;
3100                         if (cmd->o.arg1 & IPFW_IFIP_NET) {
3101                                 cmd->mask = ((struct sockaddr_in *)
3102                                     ia->ifa_netmask)->sin_addr;
3103                         }
3104                         if (cmd->mask.s_addr == INADDR_ANY)
3105                                 cmd->mask.s_addr = INADDR_BROADCAST;
3106
3107                         cmd->addr =
3108                             ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3109                         cmd->addr.s_addr &= cmd->mask.s_addr;
3110
3111                         cmd->o.arg1 |= IPFW_IFIP_VALID;
3112                         break;
3113                 }
3114                 if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3115                         return (0);
3116         }
3117         return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3118 }
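
/*
 * Worked example for ipfw_match_ifip() above (illustrative values): if the
 * named interface's first IPv4 address is 192.0.2.10/24 and IPFW_IFIP_NET
 * is set in arg1, the cached pair becomes mask 255.255.255.0 and addr
 * 192.0.2.0, so any 192.0.2.x address matches.  Without IPFW_IFIP_NET the
 * mask defaults to 255.255.255.255 and only 192.0.2.10 itself matches.
 */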
3119
3120 static void
3121 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3122     struct in_addr *old_addr, uint16_t *old_port)
3123 {
3124         struct ip *ip = mtod(m, struct ip *);
3125         struct in_addr *addr;
3126         uint16_t *port, *csum, dlen = 0;
3127         uint8_t udp = 0;
3128         boolean_t pseudo = FALSE;
3129
3130         if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3131                 addr = &ip->ip_src;
3132                 switch (ip->ip_p) {
3133                 case IPPROTO_TCP:
3134                         port = &L3HDR(struct tcphdr, ip)->th_sport;
3135                         csum = &L3HDR(struct tcphdr, ip)->th_sum;
3136                         break;
3137                 case IPPROTO_UDP:
3138                         port = &L3HDR(struct udphdr, ip)->uh_sport;
3139                         csum = &L3HDR(struct udphdr, ip)->uh_sum;
3140                         udp = 1;
3141                         break;
3142                 default:
3143                         panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3144                 }
3145         } else {
3146                 addr = &ip->ip_dst;
3147                 switch (ip->ip_p) {
3148                 case IPPROTO_TCP:
3149                         port = &L3HDR(struct tcphdr, ip)->th_dport;
3150                         csum = &L3HDR(struct tcphdr, ip)->th_sum;
3151                         break;
3152                 case IPPROTO_UDP:
3153                         port = &L3HDR(struct udphdr, ip)->uh_dport;
3154                         csum = &L3HDR(struct udphdr, ip)->uh_sum;
3155                         udp = 1;
3156                         break;
3157                 default:
3158                         panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3159                 }
3160         }
3161         if (old_addr != NULL)
3162                 *old_addr = *addr;
3163         if (old_port != NULL) {
3164                 if (x->xlat_port != 0)
3165                         *old_port = *port;
3166                 else
3167                         *old_port = 0;
3168         }
3169
3170         if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3171                 if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3172                         dlen = ip->ip_len - (ip->ip_hl << 2);
3173                 pseudo = TRUE;
3174         }
3175
3176         if (!pseudo) {
3177                 const uint16_t *oaddr, *naddr;
3178
3179                 oaddr = (const uint16_t *)&addr->s_addr;
3180                 naddr = (const uint16_t *)&x->xlat_addr;
3181
3182                 ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3183                     oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3184                 *csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3185                     oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3186         }
3187         addr->s_addr = x->xlat_addr;
3188
3189         if (x->xlat_port != 0) {
3190                 if (!pseudo) {
3191                         *csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3192                             udp);
3193                 }
3194                 *port = x->xlat_port;
3195         }
3196
3197         if (pseudo) {
3198                 *csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3199                     htons(dlen + ip->ip_p));
3200         }
3201 }
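
/*
 * Note on the checksum handling in ipfw_xlate() above (descriptive only):
 * when the L4 checksum is still to be computed by the stack or hardware
 * (CSUM_TCP/CSUM_UDP/CSUM_TSO set), only the pseudo-header seed is rebuilt
 * with in_pseudo() after the address/port rewrite.  Otherwise ip_sum and
 * the existing L4 checksum are adjusted incrementally with
 * pfil_cksum_fixup(), one 16-bit word at a time; e.g. rewriting
 * 10.0.0.5 -> 192.0.2.5 folds the word differences 0x0a00 -> 0xc000 and
 * 0x0005 -> 0x0205 into both sums.
 */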
3202
3203 static void
3204 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3205 {
3206         struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3207         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3208         struct mbuf *m = nm->m;
3209         struct ipfw_xlat *x = nm->arg1;
3210         struct ip_fw *rule = x->xlat_rule;
3211
3212         ASSERT_NETISR_NCPUS(mycpuid);
3213         KASSERT(rule->cpuid == mycpuid,
3214             ("rule does not belong to cpu%d", mycpuid));
3215         KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3216             ("mbuf does not have ipfw continue rule"));
3217
3218         KASSERT(ctx->ipfw_cont_rule == NULL,
3219             ("pending ipfw continue rule"));
3220         KASSERT(ctx->ipfw_cont_xlat == NULL,
3221             ("pending ipfw continue xlat"));
3222         ctx->ipfw_cont_rule = rule;
3223         ctx->ipfw_cont_xlat = x;
3224
3225         if (nm->arg2 == 0)
3226                 ip_input(m);
3227         else
3228                 ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3229
3230         /* May not have been cleared, if ipfw was unloaded/disabled. */
3231         ctx->ipfw_cont_rule = NULL;
3232         ctx->ipfw_cont_xlat = NULL;
3233
3234         /*
3235          * This state is no longer used; decrement its xlat_crefs,
3236          * so this state can be deleted.
3237          */
3238         x->xlat_crefs--;
3239         /*
3240          * This rule is no longer used; decrement its cross_refs,
3241          * so this rule can be deleted.
3242          *
3243          * NOTE:
3244          * Decrement cross_refs as the last step of this function,
3245          * so that the module can be unloaded safely.
3246          */
3247         rule->cross_refs--;
3248 }
3249
3250 static void
3251 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3252     uint32_t flags)
3253 {
3254         struct netmsg_genpkt *nm;
3255
3256         KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3257             x->xlat_pcpu, cpuid));
3258
3259         /*
3260          * Bump cross_refs to prevent this rule and its siblings
3261                  * from being deleted while this mbuf is in flight.  The
3262                  * cross_refs of the sibling rule on the target cpu will
3263                  * be decremented once this mbuf has been filtered on the
3264                  * target cpu.
3265          */
3266         x->xlat_rule->cross_refs++;
3267         /*
3268          * Bump xlat_crefs to prevent this state and its paired
3269                  * state from being deleted while this mbuf is in flight.
3270                  * The xlat_crefs of the paired state on the target cpu
3271                  * will be decremented once this mbuf has been filtered
3272                  * on the target cpu.
3273          */
3274         x->xlat_crefs++;
3275
3276         m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3277         if (flags & IPFW_XLATE_INSERT)
3278                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3279         if (flags & IPFW_XLATE_FORWARD)
3280                 m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3281
3282         if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3283                 struct ip *ip = mtod(m, struct ip *);
3284
3285                 /*
3286                  * NOTE:
3287                  * ip_input() expects ip_len/ip_off are in network
3288                  * byte order.
3289                  */
3290                 ip->ip_len = htons(ip->ip_len);
3291                 ip->ip_off = htons(ip->ip_off);
3292         }
3293
3294         nm = &m->m_hdr.mh_genmsg;
3295         netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3296             ipfw_ip_xlate_dispatch);
3297         nm->m = m;
3298         nm->arg1 = x->xlat_pair;
3299         nm->arg2 = 0;
3300         if (flags & IPFW_XLATE_OUTPUT)
3301                 nm->arg2 = 1;
3302         netisr_sendmsg(&nm->base, cpuid);
3303 }
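
/*
 * Note (descriptive only): the increments above are balanced on the target
 * cpu; ipfw_ip_xlate_dispatch() decrements the xlat_crefs of the paired
 * state and the cross_refs of its sibling rule once the redispatched mbuf
 * has been run through the filter there.
 */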
3304
3305 static struct mbuf *
3306 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3307     struct ip_fw_local *local, struct ip **ip0)
3308 {
3309         struct ip *ip = mtod(m, struct ip *);
3310         struct tcphdr *tcp;
3311         struct udphdr *udp;
3312
3313         /*
3314          * Collect parameters into local variables for faster matching.
3315          */
3316         if (hlen == 0) {        /* do not grab addresses for non-ip pkts */
3317                 local->proto = args->f_id.proto = 0;    /* mark f_id invalid */
3318                 goto done;
3319         }
3320
3321         local->proto = args->f_id.proto = ip->ip_p;
3322         local->src_ip = ip->ip_src;
3323         local->dst_ip = ip->ip_dst;
3324         if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3325                 local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3326                 local->ip_len = ntohs(ip->ip_len);
3327         } else {
3328                 local->offset = ip->ip_off & IP_OFFMASK;
3329                 local->ip_len = ip->ip_len;
3330         }
3331
3332 #define PULLUP_TO(len)                                  \
3333 do {                                                    \
3334         if (m->m_len < (len)) {                         \
3335                 args->m = m = m_pullup(m, (len));       \
3336                 if (m == NULL) {                        \
3337                         ip = NULL;                      \
3338                         goto done;                      \
3339                 }                                       \
3340                 ip = mtod(m, struct ip *);              \
3341         }                                               \
3342 } while (0)
3343
3344         if (local->offset == 0) {
3345                 switch (local->proto) {
3346                 case IPPROTO_TCP:
3347                         PULLUP_TO(hlen + sizeof(struct tcphdr));
3348                         local->tcp = tcp = L3HDR(struct tcphdr, ip);
3349                         local->dst_port = tcp->th_dport;
3350                         local->src_port = tcp->th_sport;
3351                         args->f_id.flags = tcp->th_flags;
3352                         break;
3353
3354                 case IPPROTO_UDP:
3355                         PULLUP_TO(hlen + sizeof(struct udphdr));
3356                         udp = L3HDR(struct udphdr, ip);
3357                         local->dst_port = udp->uh_dport;
3358                         local->src_port = udp->uh_sport;
3359                         break;
3360
3361                 case IPPROTO_ICMP:
3362                         PULLUP_TO(hlen + 4);    /* type, code and checksum. */
3363                         args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3364                         break;
3365
3366                 default:
3367                         break;
3368                 }
3369         }
3370
3371 #undef PULLUP_TO
3372
3373         args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3374         args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3375         args->f_id.src_port = local->src_port = ntohs(local->src_port);
3376         args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3377 done:
3378         *ip0 = ip;
3379         return (m);
3380 }
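
/*
 * Worked example for ipfw_setup_local() above (illustrative values): for a
 * TCP SYN from 198.51.100.2:12345 to 192.0.2.1:80 arriving on the layer-3
 * path, lc.proto becomes IPPROTO_TCP, lc.offset 0, lc.src_port 12345 and
 * lc.dst_port 80 (host byte order), args->f_id.flags carries TH_SYN, and
 * args->f_id.src_ip/dst_ip hold the two addresses in host byte order.
 */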
3381
3382 static struct mbuf *
3383 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3384     struct ip_fw_local *local, struct ip **ip0)
3385 {
3386         struct ip *ip = mtod(m, struct ip *);
3387
3388         ip->ip_len = htons(ip->ip_len);
3389         ip->ip_off = htons(ip->ip_off);
3390
3391         m->m_flags &= ~M_HASH;
3392         ip_hashfn(&m, 0);
3393         args->m = m;
3394         if (m == NULL) {
3395                 *ip0 = NULL;
3396                 return (NULL);
3397         }
3398         KASSERT(m->m_flags & M_HASH, ("no hash"));
3399
3400         /* 'm' might be changed by ip_hashfn(). */
3401         ip = mtod(m, struct ip *);
3402         ip->ip_len = ntohs(ip->ip_len);
3403         ip->ip_off = ntohs(ip->ip_off);
3404
3405         return (ipfw_setup_local(m, hlen, args, local, ip0));
3406 }
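
/*
 * Note (descriptive only): ipfw_rehashm() recomputes the mbuf's flow hash,
 * e.g. after the fields that feed the hash may have been rewritten.  The
 * temporary htons()/ntohs() dance exists because ip_hashfn() expects
 * ip_len/ip_off in network byte order, while ipfw keeps them in host byte
 * order at this point.
 */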
3407
3408 /*
3409  * The main check routine for the firewall.
3410  *
3411  * All arguments are in args so we can modify them and return them
3412  * back to the caller.
3413  *
3414  * Parameters:
3415  *
3416  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
3417  *              Starts with the IP header.
3418  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
3419  *      args->oif       Outgoing interface, or NULL if packet is incoming.
3420  *              The incoming interface is in the mbuf. (in)
3421  *
3422  *      args->rule      Pointer to the last matching rule (in/out)
3423  *      args->f_id      Addresses grabbed from the packet (out)
3424  *
3425  * Return value:
3426  *
3427  *      If the packet was denied/rejected and has been dropped, args->m is
3428  *      equal to NULL upon return.
3429  *
3430  *      IP_FW_DENY      the packet must be dropped.
3431  *      IP_FW_PASS      The packet is to be accepted and routed normally.
3432  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
3433  *      IP_FW_TEE       Tee the packet to port (args->cookie)
3434  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
3435  *      IP_FW_CONTINUE  Continue processing on another cpu.
3436  */
3437 static int
3438 ipfw_chk(struct ip_fw_args *args)
3439 {
3440         /*
3441          * Local variables hold state during the processing of a packet.
3442          *
3443          * IMPORTANT NOTE: to speed up the processing of rules, there
3444          * are some assumptions about the values of the variables, which
3445          * are documented here. Should you change them, please check
3446          * the implementation of the various instructions to make sure
3447          * that they still work.
3448          *
3449          * args->eh     The MAC header. It is non-NULL for a layer-2
3450          *      packet and NULL for a layer-3 packet.
3451          *
3452          * m | args->m  Pointer to the mbuf, as received from the caller.
3453          *      It may change if ipfw_chk() does an m_pullup, or if it
3454          *      consumes the packet because it calls send_reject().
3455          *      XXX This has to change, so that ipfw_chk() never modifies
3456          *      or consumes the buffer.
3457          * ip   is simply an alias for the IP header inside m, and it is
3458          *      kept in sync with it (the packet is supposed to start with
3459          *      the IP header).
3460          */
3461         struct mbuf *m = args->m;
3462         struct ip *ip = mtod(m, struct ip *);
3463
3464         /*
3465          * oif | args->oif      If NULL, ipfw_chk has been called on the
3466          *      inbound path (ether_input, ip_input).
3467          *      If non-NULL, ipfw_chk has been called on the outbound path
3468          *      (ether_output, ip_output).
3469          */
3470         struct ifnet *oif = args->oif;
3471
3472         struct ip_fw *f = NULL;         /* matching rule */
3473         int retval = IP_FW_PASS;
3474         struct m_tag *mtag;
3475         struct divert_info *divinfo;
3476         struct ipfw_state *s;
3477
3478         /*
3479          * hlen The length of the IPv4 header.
3480          *      hlen >0 means we have an IPv4 packet.
3481          */
3482         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
3483
3484         struct ip_fw_local lc;
3485
3486         /*
3487          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3488          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
3489          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3490          */
3491         int dyn_dir = MATCH_UNKNOWN;
3492         struct ip_fw *dyn_f = NULL;
3493         int cpuid = mycpuid;
3494         struct ipfw_context *ctx;
3495
3496         ASSERT_NETISR_NCPUS(cpuid);
3497         ctx = ipfw_ctx[cpuid];
3498
3499         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3500                 return IP_FW_PASS;      /* accept */
3501
3502         if (args->eh == NULL ||         /* layer 3 packet */
3503             (m->m_pkthdr.len >= sizeof(struct ip) &&
3504              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3505                 hlen = ip->ip_hl << 2;
3506
3507         memset(&lc, 0, sizeof(lc));
3508
3509         m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3510         if (m == NULL)
3511                 goto pullup_failed;
3512
3513         if (args->rule) {
3514                 /*
3515                  * Packet has already been tagged. Look for the next rule
3516                  * to restart processing.
3517                  *
3518                  * If fw_one_pass != 0 then just accept it.
3519                  * XXX this should not be checked here, but rather be
3520                  * optimized out in the caller.
3521                  */
3522                 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3523                         return IP_FW_PASS;
3524                 args->flags &= ~IP_FWARG_F_CONT;
3525
3526                 /* This rule is being/has been flushed */
3527                 if (ipfw_flushing)
3528                         return IP_FW_DENY;
3529
3530                 KASSERT(args->rule->cpuid == cpuid,
3531                         ("rule used on cpu%d", cpuid));
3532
3533                 /* This rule was deleted */
3534                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3535                         return IP_FW_DENY;
3536
3537                 if (args->xlat != NULL) {
3538                         struct ipfw_xlat *x = args->xlat;
3539
3540                         /* This xlat is being deleted. */
3541                         if (x->xlat_invalid)
3542                                 return IP_FW_DENY;
3543
3544                         f = args->rule;
3545
3546                         dyn_f = f;
3547                         dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3548                             MATCH_FORWARD : MATCH_REVERSE;
3549
3550                         if (args->flags & IP_FWARG_F_XLATINS) {
3551                                 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3552                                     ("not slave %u state", x->xlat_type));
3553                                 s = ipfw_state_link(ctx, &x->xlat_st);
3554                                 if (s != NULL) {
3555                                         ctx->ipfw_xlate_conflicts++;
3556                                         if (IPFW_STATE_ISDEAD(s)) {
3557                                                 ipfw_state_remove(ctx, s);
3558                                                 s = ipfw_state_link(ctx,
3559                                                     &x->xlat_st);
3560                                         }
3561                                         if (s != NULL) {
3562                                                 if (bootverbose) {
3563                                                         kprintf("ipfw: "
3564                                                         "slave %u state "
3565                                                         "conflicts %u state\n",
3566                                                         x->xlat_type,
3567                                                         s->st_type);
3568                                                 }
3569                                                 ipfw_xlat_invalidate(x);
3570                                                 return IP_FW_DENY;
3571                                         }
3572                                         ctx->ipfw_xlate_cresolved++;
3573                                 }
3574                         } else {
3575                                 ipfw_state_update(&args->f_id, dyn_dir,
3576                                     lc.tcp, &x->xlat_st);
3577                         }
3578                 } else {
3579                         /* TODO: setup dyn_f, dyn_dir */
3580
3581                         f = args->rule->next_rule;
3582                         if (f == NULL)
3583                                 f = lookup_next_rule(args->rule);
3584                 }
3585         } else {
3586                 /*
3587                  * Find the starting rule. It can be either the first
3588                  * one, or the one after divert_rule if asked so.
3589                  */
3590                 int skipto;
3591
3592                 KKASSERT((args->flags &
3593                     (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3594                 KKASSERT(args->xlat == NULL);
3595
3596                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3597                 if (mtag != NULL) {
3598                         divinfo = m_tag_data(mtag);
3599                         skipto = divinfo->skipto;
3600                 } else {
3601                         skipto = 0;
3602                 }
3603
3604                 f = ctx->ipfw_layer3_chain;
3605                 if (args->eh == NULL && skipto != 0) {
3606                         /* No skipto during rule flushing */
3607                         if (ipfw_flushing)
3608                                 return IP_FW_DENY;
3609
3610                         if (skipto >= IPFW_DEFAULT_RULE)
3611                                 return IP_FW_DENY; /* invalid */
3612
3613                         while (f && f->rulenum <= skipto)
3614                                 f = f->next;
3615                         if (f == NULL)  /* drop packet */
3616                                 return IP_FW_DENY;
3617                 } else if (ipfw_flushing) {
3618                         /* Rules are being flushed; skip to default rule */
3619                         f = ctx->ipfw_default_rule;
3620                 }
3621         }
3622         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3623                 m_tag_delete(m, mtag);
3624
3625         /*
3626          * Now scan the rules, and parse microinstructions for each rule.
3627          */
3628         for (; f; f = f->next) {
3629                 int l, cmdlen;
3630                 ipfw_insn *cmd;
3631                 int skip_or; /* skip rest of OR block */
3632
3633 again:
3634                 if (ctx->ipfw_set_disable & (1 << f->set)) {
3635                         args->xlat = NULL;
3636                         continue;
3637                 }
3638
3639                 if (args->xlat != NULL) {
3640                         args->xlat = NULL;
3641                         l = f->cmd_len - f->act_ofs;
3642                         cmd = ACTION_PTR(f);
3643                 } else {
3644                         l = f->cmd_len;
3645                         cmd = f->cmd;
3646                 }
3647
3648                 skip_or = 0;
3649                 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3650                         int match;
3651
3652                         /*
3653                          * check_body is a jump target used when we find a
3654                          * CHECK_STATE, and need to jump to the body of
3655                          * the target rule.
3656                          */
3657 check_body:
3658                         cmdlen = F_LEN(cmd);
3659                         /*
3660                          * An OR block (insn_1 || .. || insn_n) has the
3661                          * F_OR bit set in all but the last instruction.
3662                          * The first match will set "skip_or", and cause
3663                          * the following instructions to be skipped until
3664                          * past the one with the F_OR bit clear.
3665                          */
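                        /*
                         * For example (illustrative, assuming ipfw(8)'s
                         * or-block syntax), a rule body written as
                         * "{ src-ip 192.0.2.1 or src-ip 192.0.2.2 }"
                         * compiles to two O_IP_SRC instructions, the first
                         * with F_OR set and the last with it clear.
                         */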
3666                         if (skip_or) {          /* skip this instruction */
3667                                 if ((cmd->len & F_OR) == 0)
3668                                         skip_or = 0;    /* next one is good */
3669                                 continue;
3670                         }
3671                         match = 0; /* set to 1 if we succeed */
3672
3673                         switch (cmd->opcode) {
3674                         /*
3675                          * The first set of opcodes compares the packet's
3676                          * fields with some pattern, setting 'match' if a
3677                          * match is found. At the end of the loop there is
3678                          * logic to deal with F_NOT and F_OR flags associated
3679                          * with the opcode.
3680                          */
3681                         case O_NOP:
3682                                 match = 1;
3683                                 break;
3684
3685                         case O_FORWARD_MAC:
3686                                 kprintf("ipfw: opcode %d unimplemented\n",
3687                                         cmd->opcode);
3688                                 break;
3689
3690                         case O_GID:
3691                         case O_UID:
3692                                 /*
3693                                  * We only check offset == 0 && proto != 0,
3694                                  * as this ensures that we have an IPv4
3695                                  * packet with the ports info.
3696                                  */
3697                                 if (lc.offset!=0)
3698                                         break;
3699
3700                                 match = ipfw_match_uid(&args->f_id, oif,
3701                                         cmd->opcode,
3702                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3703                                 break;
3704
3705                         case O_RECV:
3706                                 match = iface_match(m->m_pkthdr.rcvif,
3707                                     (ipfw_insn_if *)cmd);
3708                                 break;
3709
3710                         case O_XMIT:
3711                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3712                                 break;
3713
3714                         case O_VIA:
3715                                 match = iface_match(oif ? oif :
3716                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3717                                 break;
3718
3719                         case O_MACADDR2:
3720                                 if (args->eh != NULL) { /* have MAC header */
3721                                         uint32_t *want = (uint32_t *)
3722                                                 ((ipfw_insn_mac *)cmd)->addr;
3723                                         uint32_t *mask = (uint32_t *)
3724                                                 ((ipfw_insn_mac *)cmd)->mask;
3725                                         uint32_t *hdr = (uint32_t *)args->eh;
3726
3727                                         match =
3728                                         (want[0] == (hdr[0] & mask[0]) &&
3729                                          want[1] == (hdr[1] & mask[1]) &&
3730                                          want[2] == (hdr[2] & mask[2]));
3731                                 }
3732                                 break;
3733
3734                         case O_MAC_TYPE:
3735                                 if (args->eh != NULL) {
3736                                         uint16_t t =
3737                                             ntohs(args->eh->ether_type);
3738                                         uint16_t *p =
3739                                             ((ipfw_insn_u16 *)cmd)->ports;
3740                                         int i;
3741
3742                                         /* Special vlan handling */
3743                                         if (m->m_flags & M_VLANTAG)
3744                                                 t = ETHERTYPE_VLAN;
3745
3746                                         for (i = cmdlen - 1; !match && i > 0;
3747                                              i--, p += 2) {
3748                                                 match =
3749                                                 (t >= p[0] && t <= p[1]);
3750                                         }
3751                                 }
3752                                 break;
3753
3754                         case O_FRAG:
3755                                 match = (hlen > 0 && lc.offset != 0);
3756                                 break;
3757
3758                         case O_IPFRAG:
3759                                 if (hlen > 0) {
3760                                         uint16_t off;
3761
3762                                         if (args->eh != NULL)
3763                                                 off = ntohs(ip->ip_off);
3764                                         else
3765                                                 off = ip->ip_off;
3766                                         if (off & (IP_MF | IP_OFFMASK))
3767                                                 match = 1;
3768                                 }
3769                                 break;
3770
3771                         case O_IN:      /* "out" is "not in" */
3772                                 match = (oif == NULL);
3773                                 break;
3774
3775                         case O_LAYER2:
3776                                 match = (args->eh != NULL);
3777                                 break;
3778
3779                         case O_PROTO:
3780                                 /*
3781                                  * We do not allow an arg of 0 so the
3782                                  * check of "proto" only suffices.
3783                                  */
3784                                 match = (lc.proto == cmd->arg1);
3785                                 break;
3786
3787                         case O_IP_SRC:
3788                                 match = (hlen > 0 &&
3789                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3790                                     lc.src_ip.s_addr);
3791                                 break;
3792
3793                         case O_IP_SRC_MASK:
3794                                 match = (hlen > 0 &&
3795                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3796                                      (lc.src_ip.s_addr &
3797                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3798                                 break;
3799
3800                         case O_IP_SRC_ME:
3801                                 if (hlen > 0) {
3802                                         struct ifnet *tif;
3803
3804                                         tif = INADDR_TO_IFP(&lc.src_ip);
3805                                         match = (tif != NULL);
3806                                 }
3807                                 break;
3808
3809                         case O_IP_SRC_TABLE:
3810                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3811                                     &lc.src_ip);
3812                                 break;
3813
3814                         case O_IP_SRC_IFIP:
3815                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3816                                     &lc.src_ip);
3817                                 break;
3818
3819                         case O_IP_DST_SET:
3820                         case O_IP_SRC_SET:
3821                                 if (hlen > 0) {
3822                                         uint32_t *d = (uint32_t *)(cmd + 1);
3823                                         uint32_t addr =
3824                                             cmd->opcode == O_IP_DST_SET ?
3825                                                 args->f_id.dst_ip :
3826                                                 args->f_id.src_ip;
3827
3828                                         if (addr < d[0])
3829                                                 break;
3830                                         addr -= d[0]; /* subtract base */
3831                                         match =
3832                                         (addr < cmd->arg1) &&
3833                                          (d[1 + (addr >> 5)] &
3834                                           (1 << (addr & 0x1f)));
3835                                 }
3836                                 break;
3837
3838                         case O_IP_DST:
3839                                 match = (hlen > 0 &&
3840                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3841                                     lc.dst_ip.s_addr);
3842                                 break;
3843
3844                         case O_IP_DST_MASK:
3845                                 match = (hlen > 0) &&
3846                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3847                                      (lc.dst_ip.s_addr &
3848                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3849                                 break;
3850
3851                         case O_IP_DST_ME:
3852                                 if (hlen > 0) {
3853                                         struct ifnet *tif;
3854
3855                                         tif = INADDR_TO_IFP(&lc.dst_ip);
3856                                         match = (tif != NULL);
3857                                 }
3858                                 break;
3859
3860                         case O_IP_DST_TABLE:
3861                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3862                                     &lc.dst_ip);
3863                                 break;
3864
3865                         case O_IP_DST_IFIP:
3866                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3867                                     &lc.dst_ip);
3868                                 break;
3869
3870                         case O_IP_SRCPORT:
3871                         case O_IP_DSTPORT:
3872                                 /*
3873                                  * offset == 0 && proto != 0 is enough
3874                                  * to guarantee that we have an IPv4
3875                                  * packet with port info.
3876                                  */
3877                                 if ((lc.proto==IPPROTO_UDP ||
3878                                      lc.proto==IPPROTO_TCP)
3879                                     && lc.offset == 0) {
3880                                         uint16_t x =
3881                                             (cmd->opcode == O_IP_SRCPORT) ?
3882                                                 lc.src_port : lc.dst_port;
3883                                         uint16_t *p =
3884                                             ((ipfw_insn_u16 *)cmd)->ports;
3885                                         int i;
3886
3887                                         for (i = cmdlen - 1; !match && i > 0;
3888                                              i--, p += 2) {
3889                                                 match =
3890                                                 (x >= p[0] && x <= p[1]);
3891                                         }
3892                                 }
3893                                 break;
3894
3895                         case O_ICMPCODE:
3896                                 match = (lc.offset == 0 &&
3897                                     lc.proto==IPPROTO_ICMP &&
3898                                     icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3899                                 break;
3900
3901                         case O_ICMPTYPE:
3902                                 match = (lc.offset == 0 &&
3903                                     lc.proto==IPPROTO_ICMP &&
3904                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3905                                 break;
3906
3907                         case O_IPOPT:
3908                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3909                                 break;
3910
3911                         case O_IPVER:
3912                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3913                                 break;
3914
3915                         case O_IPTTL:
3916                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3917                                 break;
3918