ipfw: ICMP_MAXTYPE could be any value; don't assume it's < 32
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27
28 /*
29  * Implement IP packet firewall (new version)
30  */
31
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79
80 #include <net/ipfw/ip_fw2.h>
81
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85         if (fw_debug > 0) \
86                 kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)       ((void)0)
90 #endif
91
92 /*
93  * Description about per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * Following graph shows how operation on per-CPU rule list is
99  * performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the current rule's duplication
138  *    to the other CPUs could safely be read-only accessed through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of relocating the rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
153
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states we do not create anymore.  This is done to
170  * avoid consuming too much memory, but also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks we do not create anymore.  This is done to
203  * avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers, track counter RB tree is
206  * shared between CPUs, track RB tree is per-cpu.  States generated by
207  * 'limit' option are linked to the track in addition to the per-cpu
208  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
243
/* Bounds and default for the rule-number auto-increment step (sysctl tunable). */
244 #define IPFW_AUTOINC_STEP_MIN   1
245 #define IPFW_AUTOINC_STEP_MAX   1000
246 #define IPFW_AUTOINC_STEP_DEF   100
247
/* Default max # of lookup tables (overridable via net.inet.ip.fw.table_max). */
248 #define IPFW_TABLE_MAX_DEF      64
249
250 #define IPFW_DEFAULT_RULE       65535   /* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET        31      /* set number for the default rule */
252
/* State match direction/result codes. */
253 #define MATCH_REVERSE           0
254 #define MATCH_FORWARD           1
255 #define MATCH_NONE              2
256 #define MATCH_UNKNOWN           3
257
/* Wrap-safe time comparison: true if a <= b modulo wraparound. */
258 #define TIME_LEQ(a, b)          ((a) - (b) <= 0)
259
/*
 * TCP flags tracked in st_state.  Forward-direction flags live in the
 * low byte, reverse-direction flags in the next byte (the << 8 forms).
 */
260 #define IPFW_STATE_TCPFLAGS     (TH_SYN | TH_FIN | TH_RST)
261 #define IPFW_STATE_TCPSTATES    (IPFW_STATE_TCPFLAGS |  \
262                                  (IPFW_STATE_TCPFLAGS << 8))
263
/* "Seen in both directions" masks for the dual-byte st_state encoding. */
264 #define BOTH_SYN                (TH_SYN | (TH_SYN << 8))
265 #define BOTH_FIN                (TH_FIN | (TH_FIN << 8))
266 #define BOTH_RST                (TH_RST | (TH_RST << 8))
267 /* TH_ACK here means FIN was ACKed. */
268 #define BOTH_FINACK             (TH_ACK | (TH_ACK << 8))
269
/* A TCP state is closed once either side RSTs or both FINs were ACKed. */
270 #define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP &&        \
271                                  (((s)->st_state & BOTH_RST) ||         \
272                                   ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
273
/* Anchor (placeholder) entries in state/track lists reuse the O_NOP opcode. */
274 #define O_ANCHOR                O_NOP
275
/* Translation (redirect) states; such an ipfw_state is really an ipfw_xlat. */
276 #define IPFW_ISXLAT(type)       ((type) == O_REDIRECT)
277 #define IPFW_XLAT_INVALID(s)    (IPFW_ISXLAT((s)->st_type) &&   \
278                                  ((struct ipfw_xlat *)(s))->xlat_invalid)
279
/* Per-mbuf markers used by the translation code. */
280 #define IPFW_MBUF_XLATINS       FW_MBUF_PRIVATE1
281 #define IPFW_MBUF_XLATFWD       FW_MBUF_PRIVATE2
282
/* Flags for the xlate operations. */
283 #define IPFW_XLATE_INSERT       0x0001
284 #define IPFW_XLATE_FORWARD      0x0002
285 #define IPFW_XLATE_OUTPUT       0x0004
286
/*
 * Netmsg carrying a rule-add request as it is forwarded from netisr0
 * to the other netisrs (see the per-CPU rule duplication description
 * above); prev_rule/next_rule are rewritten to the ->sibling copies
 * before each forward.
 */
287 struct netmsg_ipfw {
288         struct netmsg_base      base;
289         const struct ipfw_ioc_rule *ioc_rule;
290         struct ip_fw            *next_rule;
291         struct ip_fw            *prev_rule;
292         struct ip_fw            *sibling;       /* previous CPU's copy */
293         uint32_t                rule_flags;
294         struct ip_fw            **cross_rules;
295 };
296
/* Netmsg parameters for rule/set deletion and set-move operations. */
297 struct netmsg_del {
298         struct netmsg_base      base;
299         struct ip_fw            *start_rule;
300         struct ip_fw            *prev_rule;
301         uint16_t                rulenum;
302         uint8_t                 from_set;
303         uint8_t                 to_set;
304 };
305
/*
 * Netmsg parameters for zeroing rule counters; log_only presumably
 * restricts the operation to log counters -- confirm at call sites.
 */
306 struct netmsg_zent {
307         struct netmsg_base      base;
308         struct ip_fw            *start_rule;
309         uint16_t                rulenum;
310         uint16_t                log_only;
311 };
312
/* Netmsg cursor for copying states out to a userland ioc_state buffer. */
313 struct netmsg_cpstate {
314         struct netmsg_base      base;
315         struct ipfw_ioc_state   *ioc_state;     /* next output slot */
316         int                     state_cntmax;   /* buffer capacity */
317         int                     state_cnt;      /* # copied so far */
318 };
319
/* Netmsg parameters for adding a table entry on each CPU. */
320 struct netmsg_tblent {
321         struct netmsg_base      base;
322         struct sockaddr         *key;
323         struct sockaddr         *netmask;
324         struct ipfw_tblent      *sibling;       /* previous CPU's copy */
325         int                     tableid;
326 };
327
/* Netmsg parameters for flushing (and optionally destroying) a table. */
328 struct netmsg_tblflush {
329         struct netmsg_base      base;
330         int                     tableid;
331         int                     destroy;
332 };
333
/* Netmsg parameters for expiring table entries older than 'expire'. */
334 struct netmsg_tblexp {
335         struct netmsg_base      base;
336         time_t                  expire;
337         int                     tableid;
338         int                     cnt;
339         int                     expcnt;         /* # of entries expired */
340         struct radix_node_head  *rnh;
341 };
342
/* Cursor for copying table entries to a userland ioc_tblent array. */
343 struct ipfw_table_cp {
344         struct ipfw_ioc_tblent  *te;
345         int                     te_idx;
346         int                     te_cnt;
347 };
348
/*
 * Per-packet scratch data: header fields extracted once from the packet
 * so rule matching does not have to re-parse the mbuf.
 */
349 struct ip_fw_local {
350         /*
351          * offset       The offset of a fragment. offset != 0 means that
352          *      we have a fragment at this offset of an IPv4 packet.
353          *      offset == 0 means that (if this is an IPv4 packet)
354          *      this is the first or only fragment.
355          */
356         u_short                 offset;
357
358         /*
359          * Local copies of addresses. They are only valid if we have
360          * an IP packet.
361          *
362          * proto        The protocol. Set to 0 for non-ip packets,
363          *      or to the protocol read from the packet otherwise.
364          *      proto != 0 means that we have an IPv4 packet.
365          *
366          * src_port, dst_port   port numbers, in HOST format. Only
367          *      valid for TCP and UDP packets.
368          *
369          * src_ip, dst_ip       ip addresses, in NETWORK format.
370          *      Only valid for IPv4 packets.
371          */
372         uint8_t                 proto;
373         uint16_t                src_port;       /* NOTE: host format    */
374         uint16_t                dst_port;       /* NOTE: host format    */
375         struct in_addr          src_ip;         /* NOTE: network format */
376         struct in_addr          dst_ip;         /* NOTE: network format */
377         uint16_t                ip_len;         /* IP total length, host format */
378         struct tcphdr           *tcp;           /* TCP header, if any */
379 };
380
/* Address pair of a state/track key. */
381 struct ipfw_addrs {
382         uint32_t                addr1;  /* host byte order */
383         uint32_t                addr2;  /* host byte order */
384 };
385
/* Port pair of a state/track key. */
386 struct ipfw_ports {
387         uint16_t                port1;  /* host byte order */
388         uint16_t                port2;  /* host byte order */
389 };
390
/*
 * Lookup key for states and tracks.  The unions let the address and
 * port pairs be compared as single 64-/32-bit integers; 'swap' records
 * which halves were exchanged to bring the key into canonical order.
 */
391 struct ipfw_key {
392         union {
393                 struct ipfw_addrs addrs;
394                 uint64_t        value;
395         } addr_u;
396         union {
397                 struct ipfw_ports ports;
398                 uint32_t        value;
399         } port_u;
400         uint8_t                 proto;
401         uint8_t                 swap;   /* IPFW_KEY_SWAP_ */
402         uint16_t                rsvd2;
403 };
404
405 #define IPFW_KEY_SWAP_ADDRS     0x1
406 #define IPFW_KEY_SWAP_PORTS     0x2
407 #define IPFW_KEY_SWAP_ALL       (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
408
/*
 * Track counter: the CPU-shared half of the two-layer track structure
 * (see the description above).  Holds the session count enforced by
 * the 'limit' option; protected by ipfw_gd.ipfw_trkcnt_token.
 */
409 struct ipfw_trkcnt {
410         RB_ENTRY(ipfw_trkcnt)   tc_rblink;
411         struct ipfw_key         tc_key;
412         uintptr_t               tc_ruleid;
413         int                     tc_refs;        /* # of tracks linked */
414         int                     tc_count;       /* shared session count */
415         time_t                  tc_expire;      /* userland get-only */
416         uint16_t                tc_rulenum;     /* userland get-only */
417 } __cachealign;
418
/* Field accessors through tc_key. */
419 #define tc_addrs                tc_key.addr_u.value
420 #define tc_ports                tc_key.port_u.value
421 #define tc_proto                tc_key.proto
422 #define tc_saddr                tc_key.addr_u.addrs.addr1
423 #define tc_daddr                tc_key.addr_u.addrs.addr2
424 #define tc_sport                tc_key.port_u.ports.port1
425 #define tc_dport                tc_key.port_u.ports.port2
426
427 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
428
429 struct ipfw_state;
430
/*
 * Per-cpu track ('limit' option): points at the shared counter via
 * t_count/t_trkcnt and anchors the states it limits on t_state_list.
 */
431 struct ipfw_track {
432         RB_ENTRY(ipfw_track)    t_rblink;
433         struct ipfw_key         t_key;
434         struct ip_fw            *t_rule;
435         time_t                  t_lastexp;
436         LIST_HEAD(, ipfw_state) t_state_list;
437         time_t                  t_expire;
438         volatile int            *t_count;       /* -> t_trkcnt->tc_count */
439         struct ipfw_trkcnt      *t_trkcnt;
440         TAILQ_ENTRY(ipfw_track) t_link;
441 };
442
/* Field accessors through t_key. */
443 #define t_addrs                 t_key.addr_u.value
444 #define t_ports                 t_key.port_u.value
445 #define t_proto                 t_key.proto
446 #define t_saddr                 t_key.addr_u.addrs.addr1
447 #define t_daddr                 t_key.addr_u.addrs.addr2
448 #define t_sport                 t_key.port_u.ports.port1
449 #define t_dport                 t_key.port_u.ports.port2
450
451 RB_HEAD(ipfw_track_tree, ipfw_track);
452 TAILQ_HEAD(ipfw_track_list, ipfw_track);
453
/*
 * Per-cpu dynamic state, created by the keep-state/limit/redirect
 * options.  Keyed by ipfw_key; lives in the per-cpu state RB tree
 * and on the per-cpu state list.
 */
454 struct ipfw_state {
455         RB_ENTRY(ipfw_state)    st_rblink;
456         struct ipfw_key         st_key;
457
458         time_t                  st_expire;      /* expire time */
459         struct ip_fw            *st_rule;       /* parent rule on this CPU */
460
461         uint64_t                st_pcnt;        /* packets */
462         uint64_t                st_bcnt;        /* bytes */
463
464         /*
465          * st_state:
466          * State of this rule, typically a combination of TCP flags.
467          *
468          * st_ack_fwd/st_ack_rev:
469          * Most recent ACKs in forward and reverse direction.  They
470          * are used to generate keepalives.
471          */
472         uint32_t                st_state;
473         uint32_t                st_ack_fwd;     /* host byte order */
474         uint32_t                st_seq_fwd;     /* host byte order */
475         uint32_t                st_ack_rev;     /* host byte order */
476         uint32_t                st_seq_rev;     /* host byte order */
477
478         uint16_t                st_flags;       /* IPFW_STATE_F_ */
479         uint16_t                st_type;        /* KEEP_STATE/LIMIT/RDR */
480         struct ipfw_track       *st_track;      /* owning track, if 'limit' */
481
482         LIST_ENTRY(ipfw_state)  st_trklink;     /* on st_track->t_state_list */
483         TAILQ_ENTRY(ipfw_state) st_link;
484 };
485
/* Field accessors through st_key. */
486 #define st_addrs                st_key.addr_u.value
487 #define st_ports                st_key.port_u.value
488 #define st_proto                st_key.proto
489 #define st_swap                 st_key.swap
490
/* st_flags bits: which seq/ack fields are valid, plus xlat markers. */
491 #define IPFW_STATE_F_ACKFWD     0x0001
492 #define IPFW_STATE_F_SEQFWD     0x0002
493 #define IPFW_STATE_F_ACKREV     0x0004
494 #define IPFW_STATE_F_SEQREV     0x0008
495 #define IPFW_STATE_F_XLATSRC    0x0010
496 #define IPFW_STATE_F_XLATSLAVE  0x0020
497 #define IPFW_STATE_F_LINKED     0x0040
498
/* Skip during list scans: anchor placeholders and xlat slave states. */
499 #define IPFW_STATE_SCANSKIP(s)  ((s)->st_type == O_ANCHOR ||    \
500                                  ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
501
502 /* Expired or being deleted. */
503 #define IPFW_STATE_ISDEAD(s)    (TIME_LEQ((s)->st_expire, time_uptime) || \
504                                  IPFW_XLAT_INVALID((s)))
505
506 TAILQ_HEAD(ipfw_state_list, ipfw_state);
507 RB_HEAD(ipfw_state_tree, ipfw_state);
508
/*
 * Translation (redirect) state.  Embeds an ipfw_state as its first
 * field so IPFW_XLAT_INVALID() and friends can cast ipfw_state * to
 * ipfw_xlat * when st_type is O_REDIRECT.
 */
509 struct ipfw_xlat {
510         struct ipfw_state       xlat_st;        /* MUST be the first field */
511         uint32_t                xlat_addr;      /* network byte order */
512         uint16_t                xlat_port;      /* network byte order */
513         uint16_t                xlat_dir;       /* MATCH_ */
514         struct ifnet            *xlat_ifp;      /* matching ifnet */
515         struct ipfw_xlat        *xlat_pair;     /* paired state */
516         int                     xlat_pcpu;      /* paired cpu */
517         volatile int            xlat_invalid;   /* invalid, but not dtor yet */
518         volatile uint64_t       xlat_crefs;     /* cross references */
519         struct netmsg_base      xlat_freenm;    /* for remote free */
520 };
521
/* Accessors into the embedded ipfw_state. */
522 #define xlat_type               xlat_st.st_type
523 #define xlat_flags              xlat_st.st_flags
524 #define xlat_rule               xlat_st.st_rule
525 #define xlat_bcnt               xlat_st.st_bcnt
526 #define xlat_pcnt               xlat_st.st_pcnt
527
/* Lookup table entry; lives in a per-cpu radix tree. */
528 struct ipfw_tblent {
529         struct radix_node       te_nodes[2];
530         struct sockaddr_in      te_key;
531         u_long                  te_use;         /* use counter */
532         time_t                  te_lastuse;
533         struct ipfw_tblent      *te_sibling;    /* next CPU's copy */
534         volatile int            te_expired;
535 };
536
/*
 * Per-cpu firewall context; one instance per CPU (see ipfw_ctx[]).
 * Holds this CPU's copy of the rule chain, its state/track trees,
 * the expiration/keepalive machinery and statistics.
 */
537 struct ipfw_context {
538         struct ip_fw            *ipfw_layer3_chain;     /* rules for layer3 */
539         struct ip_fw            *ipfw_default_rule;     /* default rule */
540         uint64_t                ipfw_norule_counter;    /* ipfw_log(NULL) stat*/
541
542         /*
543          * ipfw_set_disable contains one bit per set value (0..31).
544          * If the bit is set, all rules with the corresponding set
545          * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
546          * default rule and CANNOT be disabled.
547          */
548         uint32_t                ipfw_set_disable;
549
550         uint8_t                 ipfw_flags;     /* IPFW_FLAG_ */
551
        /*
         * NOTE(review): presumably the rule/xlat at which a suspended
         * packet walk continues -- confirm against ipfw_chk().
         */
552         struct ip_fw            *ipfw_cont_rule;
553         struct ipfw_xlat        *ipfw_cont_xlat;
554
555         struct ipfw_state_tree  ipfw_state_tree;
556         struct ipfw_state_list  ipfw_state_list;
557         int                     ipfw_state_loosecnt;
558         int                     ipfw_state_cnt;
559
        /* Scratch key for RB-tree lookups (see *_tmpkey macros below). */
560         union {
561                 struct ipfw_state state;
562                 struct ipfw_track track;
563                 struct ipfw_trkcnt trkcnt;
564         } ipfw_tmpkey;
565
566         struct ipfw_track_tree  ipfw_track_tree;
567         struct ipfw_track_list  ipfw_track_list;
568         struct ipfw_trkcnt      *ipfw_trkcnt_spare;
569
        /* State expiration: callout, netmsgs and anchor list entry. */
570         struct callout          ipfw_stateto_ch;
571         time_t                  ipfw_state_lastexp;
572         struct netmsg_base      ipfw_stateexp_nm;
573         struct netmsg_base      ipfw_stateexp_more;
574         struct ipfw_state       ipfw_stateexp_anch;
575
        /* Track expiration. */
576         struct callout          ipfw_trackto_ch;
577         time_t                  ipfw_track_lastexp;
578         struct netmsg_base      ipfw_trackexp_nm;
579         struct netmsg_base      ipfw_trackexp_more;
580         struct ipfw_track       ipfw_trackexp_anch;
581
        /* TCP keepalive generation. */
582         struct callout          ipfw_keepalive_ch;
583         struct netmsg_base      ipfw_keepalive_nm;
584         struct netmsg_base      ipfw_keepalive_more;
585         struct ipfw_state       ipfw_keepalive_anch;
586
        /* Deferred reaping of translation states. */
587         struct callout          ipfw_xlatreap_ch;
588         struct netmsg_base      ipfw_xlatreap_nm;
589         struct ipfw_state_list  ipfw_xlatreap;
590
591         /*
592          * Statistics
593          */
594         u_long                  ipfw_sts_reap;
595         u_long                  ipfw_sts_reapfailed;
596         u_long                  ipfw_sts_overflow;
597         u_long                  ipfw_sts_nomem;
598         u_long                  ipfw_sts_tcprecycled;
599
600         u_long                  ipfw_tks_nomem;
601         u_long                  ipfw_tks_reap;
602         u_long                  ipfw_tks_reapfailed;
603         u_long                  ipfw_tks_overflow;
604         u_long                  ipfw_tks_cntnomem;
605
606         u_long                  ipfw_frags;
607         u_long                  ipfw_defraged;
608         u_long                  ipfw_defrag_remote;
609
610         u_long                  ipfw_xlated;
611         u_long                  ipfw_xlate_split;
612         u_long                  ipfw_xlate_conflicts;
613         u_long                  ipfw_xlate_cresolved;
614
615         /* Last field */
616         struct radix_node_head  *ipfw_tables[];
617 };
618
/* ipfw_flags bits. */
619 #define IPFW_FLAG_KEEPALIVE     0x01
620 #define IPFW_FLAG_STATEEXP      0x02
621 #define IPFW_FLAG_TRACKEXP      0x04
622 #define IPFW_FLAG_STATEREAP     0x08
623 #define IPFW_FLAG_TRACKREAP     0x10
624
/* Typed views of the ipfw_tmpkey scratch union. */
625 #define ipfw_state_tmpkey       ipfw_tmpkey.state
626 #define ipfw_track_tmpkey       ipfw_tmpkey.track
627 #define ipfw_trkcnt_tmpkey      ipfw_tmpkey.trkcnt
628
/* CPU-shared global data (single instance: ipfw_gd). */
629 struct ipfw_global {
630         int                     ipfw_state_loosecnt;    /* cache aligned */
631         time_t                  ipfw_state_globexp __cachealign;
632
        /* Shared track counter tree, serialized by the token. */
633         struct lwkt_token       ipfw_trkcnt_token __cachealign;
634         struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
635         int                     ipfw_trkcnt_cnt;
636         time_t                  ipfw_track_globexp;
637
638         /* Accessed in netisr0. */
639         struct ip_fw            *ipfw_crossref_free __cachealign;
640         struct callout          ipfw_crossref_ch;
641         struct netmsg_base      ipfw_crossref_nm;
642
643 #ifdef KLD_MODULE
644         /*
645          * The module cannot be unloaded if there are references to
646          * certain rules of ipfw(4), e.g. by dummynet(4).
647          */
648         int                     ipfw_refcnt __cachealign;
649 #endif
650 } __cachealign;
651
/* Per-cpu context pointers, indexed by CPU id. */
652 static struct ipfw_context      *ipfw_ctx[MAXCPU];
653
654 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
655
656 /*
657  * Following two global variables are accessed and updated only
658  * in netisr0.
659  */
660 static uint32_t static_count;   /* # of static rules */
661 static uint32_t static_ioc_len; /* bytes of static rules */
662
663 /*
664  * If 1, then ipfw static rules are being flushed,
665  * ipfw_chk() will skip to the default rule.
666  */
667 static int ipfw_flushing;
668
669 static int fw_verbose;          /* log rule matches */
670 static int verbose_limit;       /* cap on logged matches */
671
672 static int fw_debug;            /* gates DPRINTF() output */
673 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
674
675 static int      ipfw_table_max = IPFW_TABLE_MAX_DEF;
676
677 static int      ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
678 static int      ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
679
/* Loader tunable: # of lookup tables, fixed at boot. */
680 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
681
/* Basic net.inet.ip.fw.* sysctl tree and knobs. */
682 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
683 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
684     "Firewall statistics");
685
686 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
687     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
688 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
689     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
690     "Rule number autoincrement step");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
692     &fw_one_pass, 0,
693     "Only do a single pass through ipfw when using dummynet(4)");
694 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
695     &fw_debug, 0, "Enable printing of debug ip_fw statements");
696 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
697     &fw_verbose, 0, "Log matches to ipfw rules");
698 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
699     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged")
700 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
701     &ipfw_table_max, 0, "Max # of tables");
702
/* Sysctl handler prototypes (defined later in this file). */
703 static int      ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
704 static int      ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
705 static int      ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
706 static int      ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
707 static int      ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
708 static int      ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
709
710 /*
711  * Timeouts for various events in handling states.
712  *
713  * NOTE:
714  * 1 == 0~1 second.
715  * 2 == 1~2 second(s).
716  *
717  * We use 2 seconds for FIN lifetime, so that the states will not be
718  * ripped prematurely.
719  */
720 static uint32_t dyn_ack_lifetime = 300;
721 static uint32_t dyn_syn_lifetime = 20;
722 static uint32_t dyn_finwait_lifetime = 20;
723 static uint32_t dyn_fin_lifetime = 2;
724 static uint32_t dyn_rst_lifetime = 2;
725 static uint32_t dyn_udp_lifetime = 10;
726 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
727
728 /*
729  * Keepalives are sent if dyn_keepalive is set. They are sent every
730  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
731  * seconds of lifetime of a rule.
732  */
733 static uint32_t dyn_keepalive_interval = 20;
734 static uint32_t dyn_keepalive_period = 5;
735 static uint32_t dyn_keepalive = 1;      /* do send keepalives */
736
737 static struct ipfw_global       ipfw_gd;
738 static int      ipfw_state_loosecnt_updthr;     /* loose counter threshold */
739 static int      ipfw_state_max = 4096;  /* max # of states */
740 static int      ipfw_track_max = 4096;  /* max # of tracks */
741
/* Per-iteration limits for the scan/expire/reap/keepalive loops. */
742 static int      ipfw_state_headroom;    /* setup at module load time */
743 static int      ipfw_state_reap_min = 8;
744 static int      ipfw_state_expire_max = 32;
745 static int      ipfw_state_scan_max = 256;
746 static int      ipfw_keepalive_max = 8;
747 static int      ipfw_track_reap_max = 4;
748 static int      ipfw_track_expire_max = 16;
749 static int      ipfw_track_scan_max = 128;
750
751 static eventhandler_tag ipfw_ifaddr_event;
752
/*
 * Compat: the dyn_* names predate the state/track split and report the
 * combined totals through ipfw_sysctl_dyncnt/ipfw_sysctl_dynmax.
 */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
    "Number of states and tracks");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
    "Max number of states and tracks");

/* State/track limits and lifetimes. */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
    CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
    "Number of states");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
    "Max number of states");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
    &ipfw_state_headroom, 0, "headroom for state reap");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
    &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
    &ipfw_track_max, 0, "Max number of tracks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
    &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
/*
 * Scan/expire/reap batch sizes; all share ipfw_sysctl_scancnt as the
 * handler, each bound to one of the tunables declared above.
 */
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
    "I", "# of states to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
    "I", "# of states to reap for state shortage");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to scan for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to expire for each expire iteration");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
    CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
    "I", "# of tracks to reap for track shortage");

/*
 * Statistics: each node hands ipfw_sysctl_stat the byte offset of the
 * corresponding counter field inside struct ipfw_context.
 */
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
    "LU", "# of state reaps due to states shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
    "LU", "# of state reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
    "LU", "# of state overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
    "LU", "# of state allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
    "LU", "# of state deleted due to fast TCP port recycling");

SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
    "LU", "# of track allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
    "LU", "# of track reap due to tracks shortage");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
    "LU", "# of track reap failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
    "LU", "# of track overflow");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
    "LU", "# of track counter allocation failure");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
    "LU", "# of IP fragements defraged");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
    "LU", "# of IP packets after defrag dispatched to remote cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
    "LU", "# address/port translations");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
    "LU", "# address/port translations split between different cpus");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts on remote cpu");
SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
    __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
    "LU", "# address/port translations conflicts resolved on remote cpu");
882
/* RB-tree comparators; definitions follow below. */
static int              ipfw_state_cmp(struct ipfw_state *,
                            struct ipfw_state *);
static int              ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
                            struct ipfw_trkcnt *);
static int              ipfw_track_cmp(struct ipfw_track *,
                            struct ipfw_track *);

/* Per-cpu state tree. */
RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);

/* Global track-counter tree (shared; protected by the trkcnt token below). */
RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);

/* Per-cpu track tree. */
RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);

/* Forward declarations for functions defined later in this file. */
static int              ipfw_chk(struct ip_fw_args *);
static void             ipfw_track_expire_ipifunc(void *);
static void             ipfw_state_expire_ipifunc(void *);
static void             ipfw_keepalive(void *);
static int              ipfw_state_expire_start(struct ipfw_context *,
                            int, int);
static void             ipfw_crossref_timeo(void *);
static void             ipfw_state_remove(struct ipfw_context *,
                            struct ipfw_state *);
static void             ipfw_xlat_reap_timeo(void *);
static void             ipfw_defrag_redispatch(struct mbuf *, int,
                            struct ip_fw *);

/* LWKT token serializing access to the global track-counter tree. */
#define IPFW_TRKCNT_TOKGET      lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKREL      lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
#define IPFW_TRKCNT_TOKINIT     \
        lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
916
/*
 * Copy 'src' into 'dst', ANDing each payload byte with the corresponding
 * byte of 'netmask'.  The first two bytes (sa_len and sa_family) are
 * copied verbatim.  Masking stops at the shorter of the two sa_len
 * values; any remaining bytes up to src's sa_len are zeroed.
 */
static void
sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
    const struct sockaddr *netmask)
{
	const unsigned char *sp = (const unsigned char *)src;
	unsigned char *dp = (unsigned char *)dst;
	const unsigned char *mp = (const unsigned char *)netmask;
	unsigned char *mask_end = dp + *mp;	/* bounded by netmask sa_len */
	unsigned char *src_end = dp + *sp;	/* bounded by source sa_len */

	/* sa_len and sa_family pass through unmasked. */
	*dp++ = *sp++;
	*dp++ = *sp++;
	mp += 2;

	if (mask_end > src_end)
		mask_end = src_end;
	while (dp < mask_end)
		*dp++ = *sp++ & *mp++;
	while (dp < src_end)
		*dp++ = 0;
}
936
/*
 * Incrementally adjust a one's-complement Internet checksum after a
 * 16-bit field changed from 'old' to 'new' (RFC 1624 style update).
 * For UDP ('udp' != 0) the special encodings are preserved: an input
 * checksum of 0 means "no checksum" and is left alone, and a computed
 * zero result is represented as 0xFFFF.
 */
static __inline uint16_t
pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
	uint32_t sum;

	if (udp && cksum == 0)
		return (0x0000);

	sum = (uint32_t)cksum + old - new;
	/* Fold the borrow/carry back into the low 16 bits. */
	sum = (sum >> 16) + (sum & 65535);
	sum &= 65535;

	if (udp && sum == 0)
		return (0xFFFF);
	return (sum);
}
951
952 static __inline void
953 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
954     in_addr_t daddr, uint16_t dport, uint8_t proto)
955 {
956
957         key->proto = proto;
958         key->swap = 0;
959
960         if (saddr < daddr) {
961                 key->addr_u.addrs.addr1 = daddr;
962                 key->addr_u.addrs.addr2 = saddr;
963                 key->swap |= IPFW_KEY_SWAP_ADDRS;
964         } else {
965                 key->addr_u.addrs.addr1 = saddr;
966                 key->addr_u.addrs.addr2 = daddr;
967         }
968
969         if (sport < dport) {
970                 key->port_u.ports.port1 = dport;
971                 key->port_u.ports.port2 = sport;
972                 key->swap |= IPFW_KEY_SWAP_PORTS;
973         } else {
974                 key->port_u.ports.port1 = sport;
975                 key->port_u.ports.port2 = dport;
976         }
977
978         if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
979                 key->swap |= IPFW_KEY_SWAP_PORTS;
980         if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
981                 key->swap |= IPFW_KEY_SWAP_ADDRS;
982 }
983
984 static __inline void
985 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
986     in_addr_t *daddr, uint16_t *dport)
987 {
988
989         if (key->swap & IPFW_KEY_SWAP_ADDRS) {
990                 *saddr = key->addr_u.addrs.addr2;
991                 *daddr = key->addr_u.addrs.addr1;
992         } else {
993                 *saddr = key->addr_u.addrs.addr1;
994                 *daddr = key->addr_u.addrs.addr2;
995         }
996
997         if (key->swap & IPFW_KEY_SWAP_PORTS) {
998                 *sport = key->port_u.ports.port2;
999                 *dport = key->port_u.ports.port1;
1000         } else {
1001                 *sport = key->port_u.ports.port1;
1002                 *dport = key->port_u.ports.port2;
1003         }
1004 }
1005
1006 static int
1007 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1008 {
1009
1010         if (s1->st_proto > s2->st_proto)
1011                 return (1);
1012         if (s1->st_proto < s2->st_proto)
1013                 return (-1);
1014
1015         if (s1->st_addrs > s2->st_addrs)
1016                 return (1);
1017         if (s1->st_addrs < s2->st_addrs)
1018                 return (-1);
1019
1020         if (s1->st_ports > s2->st_ports)
1021                 return (1);
1022         if (s1->st_ports < s2->st_ports)
1023                 return (-1);
1024
1025         if (s1->st_swap == s2->st_swap ||
1026             (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1027                 return (0);
1028
1029         if (s1->st_swap > s2->st_swap)
1030                 return (1);
1031         else
1032                 return (-1);
1033 }
1034
1035 static int
1036 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1037 {
1038
1039         if (t1->tc_proto > t2->tc_proto)
1040                 return (1);
1041         if (t1->tc_proto < t2->tc_proto)
1042                 return (-1);
1043
1044         if (t1->tc_addrs > t2->tc_addrs)
1045                 return (1);
1046         if (t1->tc_addrs < t2->tc_addrs)
1047                 return (-1);
1048
1049         if (t1->tc_ports > t2->tc_ports)
1050                 return (1);
1051         if (t1->tc_ports < t2->tc_ports)
1052                 return (-1);
1053
1054         if (t1->tc_ruleid > t2->tc_ruleid)
1055                 return (1);
1056         if (t1->tc_ruleid < t2->tc_ruleid)
1057                 return (-1);
1058
1059         return (0);
1060 }
1061
1062 static int
1063 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1064 {
1065
1066         if (t1->t_proto > t2->t_proto)
1067                 return (1);
1068         if (t1->t_proto < t2->t_proto)
1069                 return (-1);
1070
1071         if (t1->t_addrs > t2->t_addrs)
1072                 return (1);
1073         if (t1->t_addrs < t2->t_addrs)
1074                 return (-1);
1075
1076         if (t1->t_ports > t2->t_ports)
1077                 return (1);
1078         if (t1->t_ports < t2->t_ports)
1079                 return (-1);
1080
1081         if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1082                 return (1);
1083         if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1084                 return (-1);
1085
1086         return (0);
1087 }
1088
/*
 * Insert state 's' into the per-cpu state RB-tree and append it to the
 * state list, marking it LINKED.  Returns NULL on success, or the
 * already-linked duplicate if a state with an equal key exists (in
 * which case 's' is left untouched and unlinked).
 */
static __inline struct ipfw_state *
ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
{
        struct ipfw_state *dup;

        KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
            ("state %p was linked", s));
        /* RB_INSERT returns the colliding node, if any, without inserting. */
        dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
        if (dup == NULL) {
                TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
                s->st_flags |= IPFW_STATE_F_LINKED;
        }
        return (dup);
}
1103
/*
 * Remove a linked state from the per-cpu RB-tree and state list and
 * clear its LINKED flag.  The state itself is not freed here.
 */
static __inline void
ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
{

        KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
            ("state %p was not linked", s));
        RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
        TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
        s->st_flags &= ~IPFW_STATE_F_LINKED;
}
1114
/*
 * Set the global state limit and derive the per-cpu loose-counter
 * update threshold from it.
 */
static void
ipfw_state_max_set(int state_max)
{

        ipfw_state_max = state_max;
        /* Allow 5% states over-allocation. */
        ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}
1123
/*
 * Sum the per-cpu state counts.  The counters are read without any
 * visible synchronization, so the result is approximate.
 */
static __inline int
ipfw_state_cntcoll(void)
{
        int cpu, state_cnt = 0;

        for (cpu = 0; cpu < netisr_ncpus; ++cpu)
                state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
        return (state_cnt);
}
1133
/*
 * Collect the total state count and publish it as the global loose
 * count; returns the collected total.
 */
static __inline int
ipfw_state_cntsync(void)
{
        int state_cnt;

        state_cnt = ipfw_state_cntcoll();
        ipfw_gd.ipfw_state_loosecnt = state_cnt;
        return (state_cnt);
}
1143
/*
 * Drop one reference on 'rule'; when the count reaches zero free the
 * rule and its cross-rule array.  Must be called on the rule's owner
 * cpu.  Returns 1 if the rule was freed, 0 if references remain.
 */
static __inline int
ipfw_free_rule(struct ip_fw *rule)
{
        KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
        KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
        rule->refcnt--;
        if (rule->refcnt == 0) {
                if (rule->cross_rules != NULL)
                        kfree(rule->cross_rules, M_IPFW);
                kfree(rule, M_IPFW);
                return 1;
        }
        return 0;
}
1158
/*
 * Release a rule reference taken by ipfw_ref_rule(); for KLD builds
 * also drop the module-wide reference that pins the module in memory.
 */
static void
ipfw_unref_rule(void *priv)
{
        ipfw_free_rule(priv);
#ifdef KLD_MODULE
        KASSERT(ipfw_gd.ipfw_refcnt > 0,
            ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
        atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}
1169
/*
 * Take a reference on 'rule' (owner cpu only); for KLD builds also take
 * a module-wide reference so the module cannot be unloaded while the
 * rule is referenced.
 */
static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
        KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
        atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
        rule->refcnt++;
}
1179
1180 /*
1181  * This macro maps an ip pointer into a layer3 header pointer of type T
1182  */
1183 #define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1184
1185 static __inline int
1186 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1187 {
1188         int type = L3HDR(struct icmp,ip)->icmp_type;
1189         int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1190         int idx = type / 32;
1191
1192         if (idx >= idx_max)
1193                 return (0);
1194         return (cmd->d[idx] & (1 << (type % 32)));
1195 }
1196
/* Bitmask of ICMP types that are queries (request half of a pair). */
#define TT      ((1 << ICMP_ECHO) | \
                 (1 << ICMP_ROUTERSOLICIT) | \
                 (1 << ICMP_TSTAMP) | \
                 (1 << ICMP_IREQ) | \
                 (1 << ICMP_MASKREQ))

/*
 * Return non-zero if the packet is an ICMP query.  TT only covers
 * types 0..31, hence the explicit bound on 'type' before shifting.
 */
static int
is_icmp_query(struct ip *ip)
{
        int type = L3HDR(struct icmp, ip)->icmp_type;

        return (type < 32 && (TT & (1 << type)));
}

#undef TT
1212
1213 /*
1214  * The following checks use two arrays of 8 or 16 bits to store the
1215  * bits that we want set or clear, respectively. They are in the
1216  * low and high half of cmd->arg1 or cmd->d[0].
1217  *
1218  * We scan options and store the bits we find set. We succeed if
1219  *
1220  *      (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1221  *
1222  * The code is sometimes optimized not to store additional variables.
1223  */
1224 static int
1225 flags_match(ipfw_insn *cmd, uint8_t bits)
1226 {
1227         u_char want_clear;
1228         bits = ~bits;
1229
1230         if (((cmd->arg1 & 0xff) & bits) != 0)
1231                 return 0; /* some bits we want set were clear */
1232
1233         want_clear = (cmd->arg1 >> 8) & 0xff;
1234         if ((want_clear & bits) != want_clear)
1235                 return 0; /* some bits we want clear were set */
1236         return 1;
1237 }
1238
/*
 * Walk the IP options area, collect the options seen into a bitmask of
 * IP_FW_IPOPT_* bits, and match it against the want-set/want-clear
 * halves of cmd->arg1 (see flags_match).  A malformed or truncated
 * option length aborts the walk and fails the match outright.
 */
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
        int optlen, bits = 0;
        u_char *cp = (u_char *)(ip + 1);
        /* Bytes of options: header length minus the fixed IP header. */
        int x = (ip->ip_hl << 2) - sizeof(struct ip);

        for (; x > 0; x -= optlen, cp += optlen) {
                int opt = cp[IPOPT_OPTVAL];

                if (opt == IPOPT_EOL)
                        break;

                if (opt == IPOPT_NOP) {
                        optlen = 1;
                } else {
                        optlen = cp[IPOPT_OLEN];
                        if (optlen <= 0 || optlen > x)
                                return 0; /* invalid or truncated */
                }

                switch (opt) {
                case IPOPT_LSRR:
                        bits |= IP_FW_IPOPT_LSRR;
                        break;

                case IPOPT_SSRR:
                        bits |= IP_FW_IPOPT_SSRR;
                        break;

                case IPOPT_RR:
                        bits |= IP_FW_IPOPT_RR;
                        break;

                case IPOPT_TS:
                        bits |= IP_FW_IPOPT_TS;
                        break;

                default:
                        break;
                }
        }
        return (flags_match(cmd, bits));
}
1283
/*
 * Walk the TCP options area, collect the options seen into a bitmask
 * of IP_FW_TCPOPT_* bits, and match it against cmd->arg1 via
 * flags_match.  An option length that overruns the remaining header
 * drives 'x' negative so the loop exits before reading out of bounds.
 */
static int
tcpopts_match(struct ip *ip, ipfw_insn *cmd)
{
        int optlen, bits = 0;
        struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
        u_char *cp = (u_char *)(tcp + 1);
        /* Bytes of options: data offset minus the fixed TCP header. */
        int x = (tcp->th_off << 2) - sizeof(struct tcphdr);

        for (; x > 0; x -= optlen, cp += optlen) {
                int opt = cp[0];

                if (opt == TCPOPT_EOL)
                        break;

                if (opt == TCPOPT_NOP) {
                        optlen = 1;
                } else {
                        optlen = cp[1];
                        if (optlen <= 0)
                                break;
                }

                switch (opt) {
                case TCPOPT_MAXSEG:
                        bits |= IP_FW_TCPOPT_MSS;
                        break;

                case TCPOPT_WINDOW:
                        bits |= IP_FW_TCPOPT_WINDOW;
                        break;

                case TCPOPT_SACK_PERMITTED:
                case TCPOPT_SACK:
                        bits |= IP_FW_TCPOPT_SACK;
                        break;

                case TCPOPT_TIMESTAMP:
                        bits |= IP_FW_TCPOPT_TS;
                        break;

                case TCPOPT_CC:
                case TCPOPT_CCNEW:
                case TCPOPT_CCECHO:
                        bits |= IP_FW_TCPOPT_CC;
                        break;

                default:
                        break;
                }
        }
        return (flags_match(cmd, bits));
}
1336
/*
 * Match an interface against an interface instruction: either by name
 * (glob pattern via kfnmatch, or exact strncmp) or, when no name is
 * given, by comparing cmd's IPv4 address against every AF_INET address
 * configured on the interface.  Returns 1 on match, 0 otherwise
 * (including when no interface is associated with the packet).
 */
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
        if (ifp == NULL)        /* no iface with this packet, match fails */
                return 0;

        /* Check by name or by IP address */
        if (cmd->name[0] != '\0') { /* match by name */
                /* Check name */
                if (cmd->p.glob) {
                        if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
                                return(1);
                } else {
                        if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
                                return(1);
                }
        } else {
                struct ifaddr_container *ifac;

                /* Scan this cpu's view of the interface address list. */
                TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
                        struct ifaddr *ia = ifac->ifa;

                        if (ia->ifa_addr == NULL)
                                continue;
                        if (ia->ifa_addr->sa_family != AF_INET)
                                continue;
                        if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
                            (ia->ifa_addr))->sin_addr.s_addr)
                                return(1);      /* match */
                }
        }
        return(0);      /* no match, fail ... */
}
1370
/*
 * Helper for incremental ksnprintf() into a fixed buffer: expands to
 * the (pointer, remaining-space) argument pair at offset 'len',
 * clamping the remaining space to 0 once the buffer is full.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1372
1373 /*
1374  * We enter here when we have a rule with O_LOG.
1375  * XXX this function alone takes about 2Kbytes of code!
1376  */
/*
 * Log one packet hit for rule 'f' (or a bogus packet when f == NULL).
 * 'hlen' is the IP header length (0 for non-IP), 'eh' is non-NULL for
 * layer-2 packets (whose ip_off/ip_len are still in network order),
 * and 'oif' is the outgoing interface or NULL on input.  Logging is
 * rate-limited per rule via the O_LOG instruction's counter, or via
 * the context's norule counter for bogus packets.
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
    struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
        char *action;
        int limit_reached = 0;
        char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

        fragment[0] = '\0';
        proto[0] = '\0';

        if (f == NULL) {        /* bogus pkt */
                if (verbose_limit != 0 &&
                    ctx->ipfw_norule_counter >= verbose_limit)
                        return;
                ctx->ipfw_norule_counter++;
                if (ctx->ipfw_norule_counter == verbose_limit)
                        limit_reached = verbose_limit;
                action = "Refuse";
        } else {        /* O_LOG is the first action, find the real one */
                ipfw_insn *cmd = ACTION_PTR(f);
                ipfw_insn_log *l = (ipfw_insn_log *)cmd;

                /* Per-rule rate limit: stop once log_left is exhausted. */
                if (l->max_log != 0 && l->log_left == 0)
                        return;
                l->log_left--;
                if (l->log_left == 0)
                        limit_reached = l->max_log;
                cmd += F_LEN(cmd);      /* point to first action */
                if (cmd->opcode == O_PROB)
                        cmd += F_LEN(cmd);

                /* Render the real action into action2 (or a literal). */
                action = action2;
                switch (cmd->opcode) {
                case O_DENY:
                        action = "Deny";
                        break;

                case O_REJECT:
                        if (cmd->arg1==ICMP_REJECT_RST) {
                                action = "Reset";
                        } else if (cmd->arg1==ICMP_UNREACH_HOST) {
                                action = "Reject";
                        } else {
                                ksnprintf(SNPARGS(action2, 0), "Unreach %d",
                                          cmd->arg1);
                        }
                        break;

                case O_ACCEPT:
                        action = "Accept";
                        break;

                case O_COUNT:
                        action = "Count";
                        break;

                case O_DIVERT:
                        ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
                        break;

                case O_TEE:
                        ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
                        break;

                case O_SKIPTO:
                        ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
                        break;

                case O_PIPE:
                        ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
                        break;

                case O_QUEUE:
                        ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
                        break;

                case O_FORWARD_IP:
                        {
                                ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
                                int len;

                                len = ksnprintf(SNPARGS(action2, 0),
                                    "Forward to %s",
                                    kinet_ntoa(sa->sa.sin_addr, abuf));
                                if (sa->sa.sin_port) {
                                        ksnprintf(SNPARGS(action2, len), ":%d",
                                                  sa->sa.sin_port);
                                }
                        }
                        break;

                default:
                        action = "UNKNOWN";
                        break;
                }
        }

        /* Render protocol, addresses and ports into 'proto'. */
        if (hlen == 0) {        /* non-ip */
                ksnprintf(SNPARGS(proto, 0), "MAC");
        } else {
                struct ip *ip = mtod(m, struct ip *);
                /* these three are all aliases to the same thing */
                struct icmp *const icmp = L3HDR(struct icmp, ip);
                struct tcphdr *const tcp = (struct tcphdr *)icmp;
                struct udphdr *const udp = (struct udphdr *)icmp;

                int ip_off, offset, ip_len;
                int len;

                if (eh != NULL) { /* layer 2 packets are as on the wire */
                        ip_off = ntohs(ip->ip_off);
                        ip_len = ntohs(ip->ip_len);
                } else {
                        ip_off = ip->ip_off;
                        ip_len = ip->ip_len;
                }
                offset = ip_off & IP_OFFMASK;
                switch (ip->ip_p) {
                case IPPROTO_TCP:
                        len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
                                        kinet_ntoa(ip->ip_src, abuf));
                        /* Ports are only present in the first fragment. */
                        if (offset == 0) {
                                ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
                                          ntohs(tcp->th_sport),
                                          kinet_ntoa(ip->ip_dst, abuf),
                                          ntohs(tcp->th_dport));
                        } else {
                                ksnprintf(SNPARGS(proto, len), " %s",
                                          kinet_ntoa(ip->ip_dst, abuf));
                        }
                        break;

                case IPPROTO_UDP:
                        len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
                                        kinet_ntoa(ip->ip_src, abuf));
                        if (offset == 0) {
                                ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
                                          ntohs(udp->uh_sport),
                                          kinet_ntoa(ip->ip_dst, abuf),
                                          ntohs(udp->uh_dport));
                        } else {
                                ksnprintf(SNPARGS(proto, len), " %s",
                                          kinet_ntoa(ip->ip_dst, abuf));
                        }
                        break;

                case IPPROTO_ICMP:
                        if (offset == 0) {
                                len = ksnprintf(SNPARGS(proto, 0),
                                                "ICMP:%u.%u ",
                                                icmp->icmp_type,
                                                icmp->icmp_code);
                        } else {
                                len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
                        }
                        len += ksnprintf(SNPARGS(proto, len), "%s",
                                         kinet_ntoa(ip->ip_src, abuf));
                        ksnprintf(SNPARGS(proto, len), " %s",
                                  kinet_ntoa(ip->ip_dst, abuf));
                        break;

                default:
                        len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
                                        kinet_ntoa(ip->ip_src, abuf));
                        ksnprintf(SNPARGS(proto, len), " %s",
                                  kinet_ntoa(ip->ip_dst, abuf));
                        break;
                }

                if (ip_off & (IP_MF | IP_OFFMASK)) {
                        ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
                                  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
                                  offset << 3, (ip_off & IP_MF) ? "+" : "");
                }
        }

        if (oif || m->m_pkthdr.rcvif) {
                log(LOG_SECURITY | LOG_INFO,
                    "ipfw: %d %s %s %s via %s%s\n",
                    f ? f->rulenum : -1,
                    action, proto, oif ? "out" : "in",
                    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
                    fragment);
        } else {
                log(LOG_SECURITY | LOG_INFO,
                    "ipfw: %d %s %s [no if info]%s\n",
                    f ? f->rulenum : -1,
                    action, proto, fragment);
        }

        /* Announce that the rate limit was just reached. */
        if (limit_reached) {
                log(LOG_SECURITY | LOG_NOTICE,
                    "ipfw: limit %d reached on entry %d\n",
                    limit_reached, f ? f->rulenum : -1);
        }
}
1574
1575 #undef SNPARGS
1576
1577 static void
1578 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1579 {
1580         struct ip_fw *rule = slave_x->xlat_rule;
1581
1582         KKASSERT(rule->cpuid == mycpuid);
1583
1584         /* No more cross references; free this pair now. */
1585         kfree(x, M_IPFW);
1586         kfree(slave_x, M_IPFW);
1587
1588         /* See the comment in ipfw_ip_xlate_dispatch(). */
1589         rule->cross_refs--;
1590 }
1591
1592 static void
1593 ipfw_xlat_reap_dispatch(netmsg_t nm)
1594 {
1595         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1596         struct ipfw_state *s, *ns;
1597
1598         ASSERT_NETISR_NCPUS(mycpuid);
1599
1600         crit_enter();
1601         /* Reply ASAP. */
1602         netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1603         crit_exit();
1604
1605         /* TODO: limit scanning depth */
1606         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1607                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1608                 struct ipfw_xlat *slave_x = x->xlat_pair;
1609                 uint64_t crefs;
1610
1611                 crefs = slave_x->xlat_crefs + x->xlat_crefs;
1612                 if (crefs == 0) {
1613                         TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1614                         ipfw_xlat_reap(x, slave_x);
1615                 }
1616         }
1617         if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1618                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1619                     &ctx->ipfw_xlatreap_nm);
1620         }
1621 }
1622
1623 static void
1624 ipfw_xlat_reap_timeo(void *xnm)
1625 {
1626         struct netmsg_base *nm = xnm;
1627
1628         KKASSERT(mycpuid < netisr_ncpus);
1629
1630         crit_enter();
1631         if (nm->lmsg.ms_flags & MSGF_DONE)
1632                 netisr_sendmsg_oncpu(nm);
1633         crit_exit();
1634 }
1635
1636 static void
1637 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1638 {
1639         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1640         struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1641         struct ipfw_xlat *slave_x = x->xlat_pair;
1642         uint64_t crefs;
1643
1644         ASSERT_NETISR_NCPUS(mycpuid);
1645
1646         KKASSERT(slave_x != NULL);
1647         KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1648
1649         KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1650             ("master xlat is still linked"));
1651         if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1652                 ipfw_state_unlink(ctx, &slave_x->xlat_st);
1653
1654         /* See the comment in ipfw_ip_xlate_dispatch(). */
1655         slave_x->xlat_crefs--;
1656
1657         crefs = slave_x->xlat_crefs + x->xlat_crefs;
1658         if (crefs == 0) {
1659                 ipfw_xlat_reap(x, slave_x);
1660                 return;
1661         }
1662
1663         if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1664                 callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1665                     &ctx->ipfw_xlatreap_nm);
1666         }
1667
1668         /*
1669          * This pair is still referenced; defer its destruction.
1670          * YYY reuse st_link.
1671          */
1672         TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1673 }
1674
1675 static __inline void
1676 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1677 {
1678
1679         x->xlat_invalid = 1;
1680         x->xlat_pair->xlat_invalid = 1;
1681 }
1682
/*
 * Delete the state 's' from the per-cpu context 'ctx': adjust the
 * counters, detach it from its track (if any), unlink and free it.
 * Xlat states are paired, possibly across cpus, and need special
 * teardown.
 */
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{
	struct ipfw_xlat *x, *slave_x;
	struct netmsg_base *nm;

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
	/* Slave xlats are torn down through their master; see below. */
	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
	    ("delete slave xlat"));

	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));
	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;

	/*
	 * Unhook this state.
	 */
	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		/* Drop this state's reference on the track counter. */
		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		atomic_subtract_int(t->t_count, 1);
	}
	ipfw_state_unlink(ctx, s);

	/*
	 * Free this state.  Xlat requires special processing,
	 * since xlat are paired state and they could be on
	 * different cpus.
	 */

	if (!IPFW_ISXLAT(s->st_type)) {
		/* Not xlat; free now. */
		kfree(s, M_IPFW);
		/* Done! */
		return;
	}
	x = (struct ipfw_xlat *)s;

	if (x->xlat_pair == NULL) {
		/* Not setup yet; free now. */
		kfree(x, M_IPFW);
		/* Done! */
		return;
	}
	slave_x = x->xlat_pair;
	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);

	if (x->xlat_pcpu == mycpuid) {
		/*
		 * Paired states are on the same cpu; delete this
		 * pair now.
		 */
		KKASSERT(x->xlat_crefs == 0);
		KKASSERT(slave_x->xlat_crefs == 0);
		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
			ipfw_state_unlink(ctx, &slave_x->xlat_st);
		kfree(x, M_IPFW);
		kfree(slave_x, M_IPFW);
		return;
	}

	/*
	 * Free the paired states on the cpu owning the slave xlat.
	 */

	/* 
	 * Mark the state pair invalid; completely deleting them
	 * may take some time.
	 */
	ipfw_xlat_invalidate(x);

	/* Dispatch the actual destruction to the slave's cpu. */
	nm = &x->xlat_freenm;
	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
	    ipfw_xlat_free_dispatch);
	nm->lmsg.u.ms_resultp = x;

	/* See the comment in ipfw_xlate_redispatch(). */
	x->xlat_rule->cross_refs++;
	x->xlat_crefs++;

	netisr_sendmsg(nm, x->xlat_pcpu);
}
1774
1775 static void
1776 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1777 {
1778
1779         if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1780                 KKASSERT(IPFW_ISXLAT(s->st_type));
1781                 ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1782                 ipfw_state_unlink(ctx, s);
1783                 return;
1784         }
1785         ipfw_state_del(ctx, s);
1786 }
1787
/*
 * Reap up to 'reap_max' states; called when we are running short of
 * states.  Returns the number of states expired.
 */
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		/* Move the anchor past 's' to record our position. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		/* Closed TCP states are reaped too, we are short. */
		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			/* Stop early once there is enough headroom. */
			if ((expired & 0xff) == 0 && 
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
1846
1847 static void
1848 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1849 {
1850         struct ipfw_state *s, *sn;
1851
1852         TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1853                 if (IPFW_STATE_SCANSKIP(s))
1854                         continue;
1855                 if (rule != NULL && s->st_rule != rule)
1856                         continue;
1857                 ipfw_state_del(ctx, s);
1858         }
1859 }
1860
1861 static void
1862 ipfw_state_expire_done(struct ipfw_context *ctx)
1863 {
1864
1865         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1866             ("stateexp is not in progress"));
1867         ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1868         callout_reset(&ctx->ipfw_stateto_ch, hz,
1869             ipfw_state_expire_ipifunc, NULL);
1870 }
1871
1872 static void
1873 ipfw_state_expire_more(struct ipfw_context *ctx)
1874 {
1875         struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1876
1877         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1878             ("stateexp is not in progress"));
1879         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1880             ("stateexp more did not finish"));
1881         netisr_sendmsg_oncpu(nm);
1882 }
1883
/*
 * Expire states following the 'anchor', scanning at most 'scan_max'
 * states and deleting at most 'expire_max' of them; continuation is
 * scheduled through ipfw_state_expire_more() whenever a limit is hit.
 * Returns the number of states expired during this run.
 */
static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan limit reached; continue later. */
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		/* Move the anchor past 's' to record our position. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (IPFW_STATE_SCANSKIP(s))
			continue;

		/* When reaping, closed TCP states are expired too. */
		if (IPFW_STATE_ISDEAD(s) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				/* Expire limit reached; continue later. */
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			/* While reaping, stop once headroom is restored. */
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	/* End of the list; this expiration run is complete. */
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}
1927
1928 static void
1929 ipfw_state_expire_more_dispatch(netmsg_t nm)
1930 {
1931         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1932         struct ipfw_state *anchor;
1933
1934         ASSERT_NETISR_NCPUS(mycpuid);
1935         KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1936             ("statexp is not in progress"));
1937
1938         /* Reply ASAP */
1939         netisr_replymsg(&nm->base, 0);
1940
1941         anchor = &ctx->ipfw_stateexp_anch;
1942         if (ctx->ipfw_state_cnt == 0) {
1943                 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1944                 ipfw_state_expire_done(ctx);
1945                 return;
1946         }
1947         ipfw_state_expire_loop(ctx, anchor,
1948             ipfw_state_scan_max, ipfw_state_expire_max);
1949 }
1950
1951 static int
1952 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1953 {
1954         struct ipfw_state *anchor;
1955
1956         KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1957             ("stateexp is in progress"));
1958         ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1959
1960         if (ctx->ipfw_state_cnt == 0) {
1961                 ipfw_state_expire_done(ctx);
1962                 return (0);
1963         }
1964
1965         /*
1966          * Do not expire more than once per second, it is useless.
1967          */
1968         if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1969             ctx->ipfw_state_lastexp == time_uptime) {
1970                 ipfw_state_expire_done(ctx);
1971                 return (0);
1972         }
1973         ctx->ipfw_state_lastexp = time_uptime;
1974
1975         anchor = &ctx->ipfw_stateexp_anch;
1976         TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1977         return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1978 }
1979
1980 static void
1981 ipfw_state_expire_dispatch(netmsg_t nm)
1982 {
1983         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1984
1985         ASSERT_NETISR_NCPUS(mycpuid);
1986
1987         /* Reply ASAP */
1988         crit_enter();
1989         netisr_replymsg(&nm->base, 0);
1990         crit_exit();
1991
1992         if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1993                 /* Running; done. */
1994                 return;
1995         }
1996         ipfw_state_expire_start(ctx,
1997             ipfw_state_scan_max, ipfw_state_expire_max);
1998 }
1999
2000 static void
2001 ipfw_state_expire_ipifunc(void *dummy __unused)
2002 {
2003         struct netmsg_base *msg;
2004
2005         KKASSERT(mycpuid < netisr_ncpus);
2006         msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2007
2008         crit_enter();
2009         if (msg->lmsg.ms_flags & MSGF_DONE)
2010                 netisr_sendmsg_oncpu(msg);
2011         crit_exit();
2012 }
2013
/*
 * Track the TCP sequence/ack numbers of state 's' for the segment
 * 'tcp' seen in direction 'dir'.  Returns FALSE when the segment's
 * seq/ack moved backward (out-of-sequence), telling the caller not
 * to update the state's flag bits; returns TRUE otherwise.  RST
 * segments are always accepted.
 */
static boolean_t
ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
{
	uint32_t seq = ntohl(tcp->th_seq);
	uint32_t ack = ntohl(tcp->th_ack);

	if (tcp->th_flags & TH_RST)
		return (TRUE);

	if (dir == MATCH_FORWARD) {
		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
			/* First forward segment; record its seq. */
			s->st_flags |= IPFW_STATE_F_SEQFWD;
			s->st_seq_fwd = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
			s->st_seq_fwd = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
				/* First forward ack; record it. */
				s->st_flags |= IPFW_STATE_F_ACKFWD;
				s->st_ack_fwd = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
				s->st_ack_fwd = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/*
			 * If the reverse side sent a FIN and this ack
			 * covers it, record that its FIN was acked.
			 */
			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
				s->st_state |= (TH_ACK << 8);
		}
	} else {
		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
			/* First reverse segment; record its seq. */
			s->st_flags |= IPFW_STATE_F_SEQREV;
			s->st_seq_rev = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
			s->st_seq_rev = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
				/* First reverse ack; record it. */
				s->st_flags |= IPFW_STATE_F_ACKREV;
				s->st_ack_rev= ack;
			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
				s->st_ack_rev = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/*
			 * If the forward side sent a FIN and this ack
			 * covers it, record that its FIN was acked.
			 */
			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
			    s->st_ack_rev == s->st_seq_fwd + 1)
				s->st_state |= TH_ACK;
		}
	}
	return (TRUE);
}
2076
2077 static void
2078 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2079     const struct tcphdr *tcp, struct ipfw_state *s)
2080 {
2081
2082         if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2083                 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2084
2085                 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2086                         return;
2087
2088                 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2089                 switch (s->st_state & IPFW_STATE_TCPSTATES) {
2090                 case TH_SYN:                            /* opening */
2091                         s->st_expire = time_uptime + dyn_syn_lifetime;
2092                         break;
2093
2094                 case BOTH_SYN:                  /* move to established */
2095                 case BOTH_SYN | TH_FIN:         /* one side tries to close */
2096                 case BOTH_SYN | (TH_FIN << 8):
2097                         s->st_expire = time_uptime + dyn_ack_lifetime;
2098                         break;
2099
2100                 case BOTH_SYN | BOTH_FIN:       /* both sides closed */
2101                         if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2102                                 /* And both FINs were ACKed. */
2103                                 s->st_expire = time_uptime + dyn_fin_lifetime;
2104                         } else {
2105                                 s->st_expire = time_uptime +
2106                                     dyn_finwait_lifetime;
2107                         }
2108                         break;
2109
2110                 default:
2111 #if 0
2112                         /*
2113                          * reset or some invalid combination, but can also
2114                          * occur if we use keep-state the wrong way.
2115                          */
2116                         if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2117                                 kprintf("invalid state: 0x%x\n", s->st_state);
2118 #endif
2119                         s->st_expire = time_uptime + dyn_rst_lifetime;
2120                         break;
2121                 }
2122         } else if (pkt->proto == IPPROTO_UDP) {
2123                 s->st_expire = time_uptime + dyn_udp_lifetime;
2124         } else {
2125                 /* other protocols */
2126                 s->st_expire = time_uptime + dyn_short_lifetime;
2127         }
2128 }
2129
/*
 * Lookup a state matching the flow id 'pkt'; dead or stale states
 * found along the way are removed.  On return '*match_direction'
 * (if non-NULL) is MATCH_FORWARD, MATCH_REVERSE, or MATCH_NONE when
 * no state was found.  A found state is also updated (expiration,
 * TCP tracking) and its track, if any, is kept alive.
 */
static struct ipfw_state *
ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
    int *match_direction, const struct tcphdr *tcp)
{
	struct ipfw_state *key, *s;
	int dir = MATCH_NONE;

	/* Build the lookup key from the packet's flow id. */
	key = &ctx->ipfw_state_tmpkey;
	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
	    pkt->dst_ip, pkt->dst_port, pkt->proto);
	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
	if (s == NULL)
		goto done; /* not found. */
	if (IPFW_STATE_ISDEAD(s)) {
		/* Expired; remove it and report a miss. */
		ipfw_state_remove(ctx, s);
		s = NULL;
		goto done;
	}
	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
		/* TCP ports recycling is too fast. */
		ctx->ipfw_sts_tcprecycled++;
		ipfw_state_remove(ctx, s);
		s = NULL;
		goto done;
	}

	/*
	 * The key swap bits tell whether the state was created from
	 * this packet's direction or from the reverse one.
	 */
	if (s->st_swap == key->st_swap) {
		dir = MATCH_FORWARD;
	} else {
		KASSERT((s->st_swap & key->st_swap) == 0,
		    ("found mismatch state"));
		dir = MATCH_REVERSE;
	}

	/* Update this state. */
	ipfw_state_update(pkt, dir, tcp, s);

	if (s->st_track != NULL) {
		/* This track has been used. */
		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
	}
done:
	if (match_direction)
		*match_direction = dir;
	return (s);
}
2179
2180 static struct ipfw_state *
2181 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2182     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2183 {
2184         struct ipfw_state *s;
2185         size_t sz;
2186
2187         KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2188             ("invalid state type %u", type));
2189
2190         sz = sizeof(struct ipfw_state);
2191         if (IPFW_ISXLAT(type))
2192                 sz = sizeof(struct ipfw_xlat);
2193
2194         s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2195         if (s == NULL) {
2196                 ctx->ipfw_sts_nomem++;
2197                 return (NULL);
2198         }
2199
2200         ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2201             id->dst_ip, id->dst_port, id->proto);
2202
2203         s->st_rule = rule;
2204         s->st_type = type;
2205         if (IPFW_ISXLAT(type)) {
2206                 struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2207
2208                 x->xlat_dir = MATCH_NONE;
2209                 x->xlat_pcpu = -1;
2210         }
2211
2212         /*
2213          * Update this state:
2214          * Set st_expire and st_state.
2215          */
2216         ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2217
2218         return (s);
2219 }
2220
2221 static struct ipfw_state *
2222 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2223     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2224     const struct tcphdr *tcp)
2225 {
2226         struct ipfw_state *s, *dup;
2227
2228         s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2229         if (s == NULL)
2230                 return (NULL);
2231
2232         ctx->ipfw_state_cnt++;
2233         ctx->ipfw_state_loosecnt++;
2234         if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2235                 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2236                 ctx->ipfw_state_loosecnt = 0;
2237         }
2238
2239         dup = ipfw_state_link(ctx, s);
2240         if (dup != NULL)
2241                 panic("ipfw: %u state exists %p", type, dup);
2242
2243         if (t != NULL) {
2244                 /* Keep the track referenced. */
2245                 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2246                 s->st_track = t;
2247         }
2248         return (s);
2249 }
2250
/*
 * Remove track 't' from the per-cpu context 'ctx' and free it; also
 * drop its reference on the shared track counter, destroying the
 * counter when the last reference goes away.  Returns TRUE if the
 * shared counter was released as well.
 */
static boolean_t
ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
{
	struct ipfw_trkcnt *trk;
	boolean_t trk_freed = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));
	KASSERT(LIST_EMPTY(&t->t_state_list),
	    ("invalid track is still referenced"));

	trk = t->t_trkcnt;
	KASSERT(trk != NULL, ("track has no trkcnt"));

	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
	kfree(t, M_IPFW);

	/*
	 * fdrop() style reference counting.
	 * See kern/kern_descrip.c fdrop().
	 */
	for (;;) {
		int refs = trk->tc_refs;

		cpu_ccfence();
		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
		if (refs == 1) {
			/* Possibly the last reference; take the token. */
			IPFW_TRKCNT_TOKGET;
			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
				KASSERT(trk->tc_count == 0,
				    ("%d states reference this trkcnt",
				     trk->tc_count));
				RB_REMOVE(ipfw_trkcnt_tree,
				    &ipfw_gd.ipfw_trkcnt_tree, trk);

				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
				    ("invalid trkcnt cnt %d",
				     ipfw_gd.ipfw_trkcnt_cnt));
				ipfw_gd.ipfw_trkcnt_cnt--;
				IPFW_TRKCNT_TOKREL;

				/* Cache the counter as the per-cpu spare. */
				if (ctx->ipfw_trkcnt_spare == NULL)
					ctx->ipfw_trkcnt_spare = trk;
				else
					kfree(trk, M_IPFW);
				trk_freed = TRUE;
				break; /* done! */
			}
			IPFW_TRKCNT_TOKREL;
			/* retry */
		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
			break; /* done! */
		}
		/* retry */
	}
	return (trk_freed);
}
2308
2309 static void
2310 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2311 {
2312         struct ipfw_track *t, *tn;
2313
2314         TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2315                 if (t->t_count == NULL) /* anchor */
2316                         continue;
2317                 if (rule != NULL && t->t_rule != rule)
2318                         continue;
2319                 ipfw_track_free(ctx, t);
2320         }
2321 }
2322
2323 static boolean_t
2324 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2325     boolean_t reap)
2326 {
2327         struct ipfw_state *s, *sn;
2328         boolean_t ret = FALSE;
2329
2330         KASSERT(t->t_count != NULL, ("track anchor"));
2331
2332         if (LIST_EMPTY(&t->t_state_list))
2333                 return (FALSE);
2334
2335         /*
2336          * Do not expire more than once per second, it is useless.
2337          */
2338         if (t->t_lastexp == time_uptime)
2339                 return (FALSE);
2340         t->t_lastexp = time_uptime;
2341
2342         LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2343                 if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2344                         KASSERT(s->st_track == t,
2345                             ("state track %p does not match %p",
2346                              s->st_track, t));
2347                         ipfw_state_del(ctx, s);
2348                         ret = TRUE;
2349                 }
2350         }
2351         return (ret);
2352 }
2353
2354 static __inline struct ipfw_trkcnt *
2355 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2356 {
2357         struct ipfw_trkcnt *trk;
2358
2359         if (ctx->ipfw_trkcnt_spare != NULL) {
2360                 trk = ctx->ipfw_trkcnt_spare;
2361                 ctx->ipfw_trkcnt_spare = NULL;
2362         } else {
2363                 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2364                     M_INTWAIT | M_NULLOK);
2365         }
2366         return (trk);
2367 }
2368
2369 static void
2370 ipfw_track_expire_done(struct ipfw_context *ctx)
2371 {
2372
2373         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2374             ("trackexp is not in progress"));
2375         ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2376         callout_reset(&ctx->ipfw_trackto_ch, hz,
2377             ipfw_track_expire_ipifunc, NULL);
2378 }
2379
2380 static void
2381 ipfw_track_expire_more(struct ipfw_context *ctx)
2382 {
2383         struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2384
2385         KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2386             ("trackexp is not in progress"));
2387         KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2388             ("trackexp more did not finish"));
2389         netisr_sendmsg_oncpu(nm);
2390 }
2391
/*
 * Expire tracks following the 'anchor', scanning at most 'scan_max'
 * tracks and freeing at most 'expire_max' of them; continuation is
 * scheduled through ipfw_track_expire_more() whenever a limit is
 * hit.  Returns the number of tracks freed during this run.
 */
static int
ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_track *t;
	int scanned = 0, expired = 0;
	boolean_t reap = FALSE;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* When reaping, closed TCP states are expired too. */
	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
		reap = TRUE;

	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		if (scanned++ >= scan_max) {
			/* Scan limit reached; continue later. */
			ipfw_track_expire_more(ctx);
			return (expired);
		}

		/* Move the anchor past 't' to record our position. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		/* Expire the states hanging off this track first. */
		ipfw_track_state_expire(ctx, t, reap);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
			/* Expired. */
			if (ipfw_track_free(ctx, t)) {
				if (++expired >= expire_max) {
					/* Limit reached; continue later. */
					ipfw_track_expire_more(ctx);
					return (expired);
				}
			}
		}
	}
	/* End of the list; this expiration run is complete. */
	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
	ipfw_track_expire_done(ctx);
	return (expired);
}
2438
/*
 * Kick off a track expiring run on this cpu: mark the run in
 * progress, insert the per-context anchor at the head of the track
 * list, and walk it.  Rate-limited to at most once per second unless
 * reaping (IPFW_FLAG_TRACKREAP) is requested.
 *
 * Returns the number of tracks expired in the first batch.
 */
static int
ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_track *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
	    ("trackexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;

	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* No tracks at all; nothing to expire. */
		ipfw_track_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
	    ctx->ipfw_track_lastexp == time_uptime) {
		ipfw_track_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_track_lastexp = time_uptime;

	anchor = &ctx->ipfw_trackexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
}
2467
/*
 * Netisr handler for the "more" message: resume an in-progress track
 * expiring run from where the anchor was left on the track list.
 */
static void
ipfw_track_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_track *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_trackexp_anch;
	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		/* All tracks were freed while the message was in flight. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		ipfw_track_expire_done(ctx);
		return;
	}
	ipfw_track_expire_loop(ctx, anchor,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2490
/*
 * Netisr handler for a track expire request (sent via
 * ipfw_track_expire_ipifunc()); starts a new expiring run unless one
 * is already in progress on this cpu.
 */
static void
ipfw_track_expire_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
		/* Running; done. */
		return;
	}
	ipfw_track_expire_start(ctx,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}
2510
/*
 * IPI/callout trampoline: queue the track expire message to the
 * current cpu's netisr, unless the previous one is still in flight
 * (MSGF_DONE not yet set).
 */
static void
ipfw_track_expire_ipifunc(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
2524
/*
 * Aggressively reclaim tracks on this cpu; called when we are short
 * of track counters.  Ignores the normal scan limit.  Returns the
 * number of tracks freed.
 */
static int
ipfw_track_reap(struct ipfw_context *ctx)
{
	struct ipfw_track *t, *anchor;
	int expired;

	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
		/*
		 * Kick start track expiring.  Ignore scan limit,
		 * we are short of tracks.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
		expired = ipfw_track_expire_start(ctx, INT_MAX,
		    ipfw_track_reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
		return (expired);
	}

	/*
	 * Tracks are being expired.  Piggyback on the in-progress run:
	 * continue the walk from its anchor position.
	 */

	if (RB_EMPTY(&ctx->ipfw_track_tree))
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_trackexp_anch;
	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of tracks.
		 */

		/* Step the anchor past 't'. */
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		/* Expire states unconditionally (reap == TRUE). */
		ipfw_track_state_expire(ctx, t, TRUE);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (ipfw_track_free(ctx, t)) {
			if (++expired >= ipfw_track_reap_max) {
				ipfw_track_expire_more(ctx);
				break;
			}
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_track_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}
2584
/*
 * Look up or create the per-cpu track for the given flow (masked by
 * limit_mask) under the specified rule, along with its globally
 * shared track counter.  On every hit the track's lifetime is
 * refreshed.  Returns NULL if memory or track counters are exhausted
 * and reaping could not make room.
 */
static struct ipfw_track *
ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t limit_mask, struct ip_fw *rule)
{
	struct ipfw_track *key, *t, *dup;
	struct ipfw_trkcnt *trk, *ret;
	boolean_t do_expire = FALSE;

	KASSERT(rule->track_ruleid != 0,
	    ("rule %u has no track ruleid", rule->rulenum));

	/* Build the lookup key from the masked flow id. */
	key = &ctx->ipfw_track_tmpkey;
	key->t_proto = id->proto;
	key->t_addrs = 0;
	key->t_ports = 0;
	key->t_rule = rule;
	if (limit_mask & DYN_SRC_ADDR)
		key->t_saddr = id->src_ip;
	if (limit_mask & DYN_DST_ADDR)
		key->t_daddr = id->dst_ip;
	if (limit_mask & DYN_SRC_PORT)
		key->t_sport = id->src_port;
	if (limit_mask & DYN_DST_PORT)
		key->t_dport = id->dst_port;

	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
	if (t != NULL)
		goto done;

	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
	if (t == NULL) {
		ctx->ipfw_tks_nomem++;
		return (NULL);
	}

	t->t_key = key->t_key;
	t->t_rule = rule;
	t->t_lastexp = 0;
	LIST_INIT(&t->t_state_list);

	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
		time_t globexp, uptime;

		/*
		 * Global track counter limit hit; do not allocate a
		 * fresh counter, but try to share an existing one
		 * below, and ask the other cpus to expire.
		 */
		trk = NULL;
		do_expire = TRUE;

		/*
		 * Do not expire globally more than once per second,
		 * it is useless.
		 */
		uptime = time_uptime;
		globexp = ipfw_gd.ipfw_track_globexp;
		if (globexp != uptime &&
		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
		    globexp, uptime)) {
			int cpu;

			/* Expire tracks on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_track_expire_ipifunc, NULL);
			}
		}
	} else {
		trk = ipfw_trkcnt_alloc(ctx);
	}
	if (trk == NULL) {
		struct ipfw_trkcnt *tkey;

		tkey = &ctx->ipfw_trkcnt_tmpkey;
		key = NULL; /* tkey overlaps key */

		tkey->tc_key = t->t_key;
		tkey->tc_ruleid = rule->track_ruleid;

		/* Try to share a counter installed by another cpu. */
		IPFW_TRKCNT_TOKGET;
		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    tkey);
		if (trk == NULL) {
			IPFW_TRKCNT_TOKREL;
			if (do_expire) {
				/* Reap local tracks to make room. */
				ctx->ipfw_tks_reap++;
				if (ipfw_track_reap(ctx) > 0) {
					if (ipfw_gd.ipfw_trkcnt_cnt <
					    ipfw_track_max) {
						trk = ipfw_trkcnt_alloc(ctx);
						if (trk != NULL)
							goto install;
						ctx->ipfw_tks_cntnomem++;
					} else {
						ctx->ipfw_tks_overflow++;
					}
				} else {
					ctx->ipfw_tks_reapfailed++;
					ctx->ipfw_tks_overflow++;
				}
			} else {
				ctx->ipfw_tks_cntnomem++;
			}
			kfree(t, M_IPFW);
			return (NULL);
		}
		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
		    ("invalid trkcnt refs %d", trk->tc_refs));
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	} else {
install:
		/* Initialize the new counter and install it globally. */
		trk->tc_key = t->t_key;
		trk->tc_ruleid = rule->track_ruleid;
		trk->tc_refs = 0;
		trk->tc_count = 0;
		trk->tc_expire = 0;
		trk->tc_rulenum = rule->rulenum;

		IPFW_TRKCNT_TOKGET;
		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    trk);
		if (ret != NULL) {
			/*
			 * Lost the install race: use the counter that
			 * won and stash ours as the per-cpu spare.
			 */
			KASSERT(ret->tc_refs > 0 &&
			    ret->tc_refs < netisr_ncpus,
			    ("invalid trkcnt refs %d", ret->tc_refs));
			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
			    ("trkcnt spare was installed"));
			ctx->ipfw_trkcnt_spare = trk;
			trk = ret;
		} else {
			ipfw_gd.ipfw_trkcnt_cnt++;
		}
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	}
	t->t_count = &trk->tc_count;
	t->t_trkcnt = trk;

	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	if (dup != NULL)
		panic("ipfw: track exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
done:
	/* Refresh the track's lifetime on every hit. */
	t->t_expire = time_uptime + dyn_short_lifetime;
	return (t);
}
2730
/*
 * Install state for rule type cmd->o.opcode
 * (O_KEEP_STATE, O_REDIRECT or O_LIMIT).
 *
 * Returns NULL if state is not installed because of errors or because
 * states limitations are enforced.
 */
static struct ipfw_state *
ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
    ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
{
	struct ipfw_state *s;
	struct ipfw_track *t;
	int count, diff;

	/* Global state limit reached?  Try to reap states on this cpu. */
	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
		boolean_t overflow = TRUE;

		ctx->ipfw_sts_reap++;
		if (ipfw_state_reap(ctx, diff) == 0)
			ctx->ipfw_sts_reapfailed++;
		if (ipfw_state_cntsync() < ipfw_state_max)
			overflow = FALSE;

		if (overflow) {
			time_t globexp, uptime;
			int cpu;

			/*
			 * Do not expire globally more than once per second,
			 * it is useless.
			 */
			uptime = time_uptime;
			globexp = ipfw_gd.ipfw_state_globexp;
			if (globexp == uptime ||
			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
			    globexp, uptime)) {
				ctx->ipfw_sts_overflow++;
				return (NULL);
			}

			/* Expire states on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_state_expire_ipifunc, NULL);
			}
			ctx->ipfw_sts_overflow++;
			return (NULL);
		}
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE: /* bidir rule */
	case O_REDIRECT:
		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
		    tcp);
		if (s == NULL)
			return (NULL);
		break;

	case O_LIMIT: /* limit number of sessions */
		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
		if (t == NULL)
			return (NULL);

		if (*t->t_count >= cmd->conn_limit) {
			/* At the limit; try to expire dead states first. */
			if (!ipfw_track_state_expire(ctx, t, TRUE))
				return (NULL);
		}
		/* Atomically claim a slot under the connection limit. */
		for (;;) {
			count = *t->t_count;
			if (count >= cmd->conn_limit)
				return (NULL);
			if (atomic_cmpset_int(t->t_count, count, count + 1))
				break;
		}

		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
		if (s == NULL) {
			/* Undo damage. */
			atomic_subtract_int(t->t_count, 1);
			return (NULL);
		}
		break;

	default:
		panic("unknown state type %u\n", cmd->o.opcode);
	}

	if (s->st_type == O_REDIRECT) {
		/* Record the redirect parameters in the xlat state. */
		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;

		x->xlat_addr = r->addr.s_addr;
		x->xlat_port = r->port;
		x->xlat_ifp = args->m->m_pkthdr.rcvif;
		x->xlat_dir = MATCH_FORWARD;
		KKASSERT(x->xlat_ifp != NULL);
	}
	return (s);
}
2834
2835 static int
2836 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2837     const struct in_addr *in)
2838 {
2839         struct radix_node_head *rnh;
2840         struct sockaddr_in sin;
2841         struct ipfw_tblent *te;
2842
2843         KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2844         rnh = ctx->ipfw_tables[tableid];
2845         if (rnh == NULL)
2846                 return (0); /* no match */
2847
2848         memset(&sin, 0, sizeof(sin));
2849         sin.sin_family = AF_INET;
2850         sin.sin_len = sizeof(sin);
2851         sin.sin_addr = *in;
2852
2853         te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2854         if (te == NULL)
2855                 return (0); /* no match */
2856
2857         te->te_use++;
2858         te->te_lastuse = time_second;
2859         return (1); /* match */
2860 }
2861
/*
 * Transmit a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because of a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_SYN determines
 * the direction (forward if set, reverse if clear).
 *
 * Only {src,dst}_{ip,port} of "id" are used.
 */
static void
send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
{
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *tcp;
	struct route sro;	/* fake route */

	MGETHDR(m, M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return;		/* mbuf allocation failed; silently drop */
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
	m->m_data += max_linkhdr;

	ip = mtod(m, struct ip *);
	bzero(ip, m->m_len);
	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
	ip->ip_p = IPPROTO_TCP;
	tcp->th_off = 5;

	/*
	 * Assume we are sending a RST (or a keepalive in the reverse
	 * direction), swap src and destination addresses and ports.
	 */
	ip->ip_src.s_addr = htonl(id->dst_ip);
	ip->ip_dst.s_addr = htonl(id->src_ip);
	tcp->th_sport = htons(id->dst_port);
	tcp->th_dport = htons(id->src_port);
	if (flags & TH_RST) {	/* we are sending a RST */
		if (flags & TH_ACK) {
			/* RST in response to an ACK-bearing segment. */
			tcp->th_seq = htonl(ack);
			tcp->th_ack = htonl(0);
			tcp->th_flags = TH_RST;
		} else {
			if (flags & TH_SYN)
				seq++;	/* SYN consumes a sequence number */
			tcp->th_seq = htonl(0);
			tcp->th_ack = htonl(seq);
			tcp->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * We are sending a keepalive. flags & TH_SYN determines
		 * the direction, forward if set, reverse if clear.
		 * NOTE: seq and ack are always assumed to be correct
		 * as set by the caller. This may be confusing...
		 */
		if (flags & TH_SYN) {
			/*
			 * we have to rewrite the correct addresses!
			 */
			ip->ip_dst.s_addr = htonl(id->dst_ip);
			ip->ip_src.s_addr = htonl(id->src_ip);
			tcp->th_dport = htons(id->dst_port);
			tcp->th_sport = htons(id->src_port);
		}
		tcp->th_seq = htonl(seq);
		tcp->th_ack = htonl(ack);
		tcp->th_flags = TH_ACK;
	}

	/*
	 * set ip_len to the payload size so we can compute
	 * the tcp checksum on the pseudoheader
	 * XXX check this, could save a couple of words ?
	 */
	ip->ip_len = htons(sizeof(struct tcphdr));
	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);

	/*
	 * now fill fields left out earlier
	 */
	ip->ip_ttl = ip_defttl;
	ip->ip_len = m->m_pkthdr.len;

	bzero(&sro, sizeof(sro));
	ip_rtaddr(ip->ip_dst, &sro);

	/* Mark as generated so ipfw does not filter it again. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
	ip_output(m, NULL, &sro, 0, NULL, NULL);
	if (sro.ro_rt)
		RTFREE(sro.ro_rt);
}
2954
/*
 * Send a reject message, consuming the mbuf passed as an argument.
 * 'code' is either an ICMP unreachable code or ICMP_REJECT_RST for a
 * TCP RST; 'offset' is the IP fragment offset (a RST is only sent for
 * the first fragment of a TCP packet).
 */
static void
send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
{
	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
		/* We need the IP header in host order for icmp_error(). */
		if (args->eh != NULL) {
			struct ip *ip = mtod(args->m, struct ip *);

			ip->ip_len = ntohs(ip->ip_len);
			ip->ip_off = ntohs(ip->ip_off);
		}
		/* icmp_error() consumes the mbuf. */
		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
		struct tcphdr *const tcp =
		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));

		/* Never answer a RST with a RST. */
		if ((tcp->th_flags & TH_RST) == 0) {
			send_pkt(&args->f_id, ntohl(tcp->th_seq),
				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
		}
		m_freem(args->m);
	} else {
		m_freem(args->m);
	}
	args->m = NULL;
}
2984
2985 /*
2986  * Given an ip_fw *, lookup_next_rule will return a pointer
2987  * to the next rule, which can be either the jump
2988  * target (for skipto instructions) or the next one in the list (in
2989  * all other cases including a missing jump target).
2990  * The result is also written in the "next_rule" field of the rule.
2991  * Backward jumps are not allowed, so start looking from the next
2992  * rule...
2993  *
2994  * This never returns NULL -- in case we do not have an exact match,
2995  * the next rule is returned. When the ruleset is changed,
2996  * pointers are flushed so we are always correct.
2997  */
2998 static struct ip_fw *
2999 lookup_next_rule(struct ip_fw *me)
3000 {
3001         struct ip_fw *rule = NULL;
3002         ipfw_insn *cmd;
3003
3004         /* look for action, in case it is a skipto */
3005         cmd = ACTION_PTR(me);
3006         if (cmd->opcode == O_LOG)
3007                 cmd += F_LEN(cmd);
3008         if (cmd->opcode == O_SKIPTO) {
3009                 for (rule = me->next; rule; rule = rule->next) {
3010                         if (rule->rulenum >= cmd->arg1)
3011                                 break;
3012                 }
3013         }
3014         if (rule == NULL)                       /* failure or not a skipto */
3015                 rule = me->next;
3016         me->next_rule = rule;
3017         return rule;
3018 }
3019
3020 static int
3021 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3022                 enum ipfw_opcodes opcode, uid_t uid)
3023 {
3024         struct in_addr src_ip, dst_ip;
3025         struct inpcbinfo *pi;
3026         boolean_t wildcard;
3027         struct inpcb *pcb;
3028
3029         if (fid->proto == IPPROTO_TCP) {
3030                 wildcard = FALSE;
3031                 pi = &tcbinfo[mycpuid];
3032         } else if (fid->proto == IPPROTO_UDP) {
3033                 wildcard = TRUE;
3034                 pi = &udbinfo[mycpuid];
3035         } else {
3036                 return 0;
3037         }
3038
3039         /*
3040          * Values in 'fid' are in host byte order
3041          */
3042         dst_ip.s_addr = htonl(fid->dst_ip);
3043         src_ip.s_addr = htonl(fid->src_ip);
3044         if (oif) {
3045                 pcb = in_pcblookup_hash(pi,
3046                         dst_ip, htons(fid->dst_port),
3047                         src_ip, htons(fid->src_port),
3048                         wildcard, oif);
3049         } else {
3050                 pcb = in_pcblookup_hash(pi,
3051                         src_ip, htons(fid->src_port),
3052                         dst_ip, htons(fid->dst_port),
3053                         wildcard, NULL);
3054         }
3055         if (pcb == NULL || pcb->inp_socket == NULL)
3056                 return 0;
3057
3058         if (opcode == O_UID) {
3059 #define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
3060                 return !socheckuid(pcb->inp_socket, uid);
3061 #undef socheckuid
3062         } else  {
3063                 return groupmember(uid, pcb->inp_socket->so_cred);
3064         }
3065 }
3066
/*
 * Match 'ip' against the address (or, with IPFW_IFIP_NET, the
 * network) of the interface named in the instruction.  The address
 * and mask are resolved lazily on first use and cached in the
 * instruction itself (IPFW_IFIP_VALID).
 */
static int
ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
{

	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
		struct ifaddr_container *ifac;
		struct ifnet *ifp;

		ifp = ifunit_netisr(cmd->ifname);
		if (ifp == NULL)
			return (0);

		/* Use the first AF_INET address on the interface. */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;

			cmd->mask.s_addr = INADDR_ANY;
			if (cmd->o.arg1 & IPFW_IFIP_NET) {
				cmd->mask = ((struct sockaddr_in *)
				    ia->ifa_netmask)->sin_addr;
			}
			/* No usable netmask: match the full address. */
			if (cmd->mask.s_addr == INADDR_ANY)
				cmd->mask.s_addr = INADDR_BROADCAST;

			cmd->addr =
			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
			cmd->addr.s_addr &= cmd->mask.s_addr;

			cmd->o.arg1 |= IPFW_IFIP_VALID;
			break;
		}
		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
			return (0);
	}
	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
}
3107
/*
 * Rewrite the source or destination address (and optionally port) of
 * the packet according to the xlat state, incrementally fixing up the
 * IP and TCP/UDP checksums.  The pre-translation address/port are
 * returned through 'old_addr'/'old_port' when they are non-NULL.
 */
static void
ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
    struct in_addr *old_addr, uint16_t *old_port)
{
	struct ip *ip = mtod(m, struct ip *);
	struct in_addr *addr;
	uint16_t *port, *csum, dlen = 0;
	uint8_t udp = 0;
	boolean_t pseudo = FALSE;

	/* Locate the address, port, and checksum fields to rewrite. */
	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
		addr = &ip->ip_src;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_sport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_sport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
		}
	} else {
		addr = &ip->ip_dst;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			port = &L3HDR(struct tcphdr, ip)->th_dport;
			csum = &L3HDR(struct tcphdr, ip)->th_sum;
			break;
		case IPPROTO_UDP:
			port = &L3HDR(struct udphdr, ip)->uh_dport;
			csum = &L3HDR(struct udphdr, ip)->uh_sum;
			udp = 1;
			break;
		default:
			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
		}
	}
	/* Save the pre-translation values for the caller. */
	if (old_addr != NULL)
		*old_addr = *addr;
	if (old_port != NULL) {
		if (x->xlat_port != 0)
			*old_port = *port;
		else
			*old_port = 0;
	}

	/*
	 * If checksum offload (or TSO) is in effect only the
	 * pseudo-header checksum needs to be (re)seeded; no
	 * incremental fixup is performed.
	 */
	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
			dlen = ip->ip_len - (ip->ip_hl << 2);
		pseudo = TRUE;
	}

	if (!pseudo) {
		const uint16_t *oaddr, *naddr;

		oaddr = (const uint16_t *)&addr->s_addr;
		naddr = (const uint16_t *)&x->xlat_addr;

		/* Incrementally patch IP and transport checksums. */
		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
	}
	addr->s_addr = x->xlat_addr;

	if (x->xlat_port != 0) {
		if (!pseudo) {
			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
			    udp);
		}
		*port = x->xlat_port;
	}

	if (pseudo) {
		/* Recompute the pseudo-header checksum from scratch. */
		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(dlen + ip->ip_p));
	}
}
3190
/*
 * Netisr handler on the target cpu for a translated packet: record
 * the continue-rule context, reinject the packet into ip_input() or
 * ip_output() (IP_FORWARDING), then drop the references that kept the
 * xlat state and its rule alive while the mbuf was in flight.
 */
static void
ipfw_ip_xlate_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ipfw_xlat *x = nm->arg1;
	struct ip_fw *rule = x->xlat_rule;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	KASSERT(ctx->ipfw_cont_xlat == NULL,
	    ("pending ipfw continue xlat"));
	ctx->ipfw_cont_rule = rule;
	ctx->ipfw_cont_xlat = x;

	/* arg2 selects the reinjection direction: 0 = input, else output. */
	if (nm->arg2 == 0)
		ip_input(m);
	else
		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);

	/* May not be cleared, if ipfw was unload/disabled. */
	ctx->ipfw_cont_rule = NULL;
	ctx->ipfw_cont_xlat = NULL;

	/*
	 * This state is no longer used; decrement its xlat_crefs,
	 * so this state can be deleted.
	 */
	x->xlat_crefs--;
	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 *
	 * NOTE:
	 * Decrement cross_refs in the last step of this function,
	 * so that the module could be unloaded safely.
	 */
	rule->cross_refs--;
}
3237
/*
 * Hand a NAT'ed mbuf over to the netisr on 'cpuid', where the paired
 * xlat state lives, so that filtering/translation can continue there.
 * 'flags' (IPFW_XLATE_*) select state-insertion/forward semantics and
 * whether the packet is on the output path.
 */
static void
ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
    uint32_t flags)
{
	struct netmsg_genpkt *nm;

	KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
	    x->xlat_pcpu, cpuid));

	/*
	 * Bump cross_refs to prevent this rule and its siblings
	 * from being deleted, while this mbuf is inflight.  The
	 * cross_refs of the sibling rule on the target cpu will
	 * be decremented, once this mbuf is going to be filtered
	 * on the target cpu.
	 */
	x->xlat_rule->cross_refs++;
	/*
	 * Bump xlat_crefs to prevent this state and its paired
	 * state from being deleted, while this mbuf is inflight.
	 * The xlat_crefs of the paired state on the target cpu
	 * will be decremented, once this mbuf is going to be
	 * filtered on the target cpu.
	 */
	x->xlat_crefs++;

	/* Tag the mbuf so ipfw_chk() on the target cpu resumes, not restarts. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
	if (flags & IPFW_XLATE_INSERT)
		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
	if (flags & IPFW_XLATE_FORWARD)
		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;

	if ((flags & IPFW_XLATE_OUTPUT) == 0) {
		struct ip *ip = mtod(m, struct ip *);

		/*
		 * NOTE:
		 * ip_input() expects ip_len/ip_off are in network
		 * byte order.
		 */
		ip->ip_len = htons(ip->ip_len);
		ip->ip_off = htons(ip->ip_off);
	}

	/* Queue the mbuf to the target cpu; see ipfw_ip_xlate_dispatch(). */
	nm = &m->m_hdr.mh_genmsg;
	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
	    ipfw_ip_xlate_dispatch);
	nm->m = m;
	nm->arg1 = x->xlat_pair;	/* dispatch operates on the paired state */
	nm->arg2 = 0;
	if (flags & IPFW_XLATE_OUTPUT)
		nm->arg2 = 1;
	netisr_sendmsg(&nm->base, cpuid);
}
3292
/*
 * Extract the fields used for rule matching from the IP/TCP/UDP/ICMP
 * headers of 'm' into 'local' and args->f_id, pulling the headers into
 * contiguous storage as needed.
 *
 * 'hlen' is the IPv4 header length; hlen == 0 means a non-IP packet,
 * in which case f_id is marked invalid and nothing else is collected.
 *
 * Returns the (possibly reallocated) mbuf, or NULL if m_pullup()
 * failed; *ip0 is updated to track the (possibly moved) IP header.
 */
static struct mbuf *
ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
    struct ip_fw_local *local, struct ip **ip0)
{
	struct ip *ip = mtod(m, struct ip *);
	struct tcphdr *tcp;
	struct udphdr *udp;

	/*
	 * Collect parameters into local variables for faster matching.
	 */
	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
		goto done;
	}

	local->proto = args->f_id.proto = ip->ip_p;
	local->src_ip = ip->ip_src;
	local->dst_ip = ip->ip_dst;
	if (args->eh != NULL) {	/* layer 2 packets are as on the wire */
		local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
		local->ip_len = ntohs(ip->ip_len);
	} else {
		local->offset = ip->ip_off & IP_OFFMASK;
		local->ip_len = ip->ip_len;
	}

/*
 * Make the first 'len' bytes contiguous.  m_pullup() may return a
 * different mbuf, so m, args->m and ip are all refreshed; on failure
 * ip is set to NULL and we bail out through 'done'.
 */
#define PULLUP_TO(len)                                  \
do {                                                    \
	if (m->m_len < (len)) {                         \
		args->m = m = m_pullup(m, (len));       \
		if (m == NULL) {                        \
			ip = NULL;                      \
			goto done;                      \
		}                                       \
		ip = mtod(m, struct ip *);              \
	}                                               \
} while (0)

	/* Only the first fragment carries the L4 header. */
	if (local->offset == 0) {
		switch (local->proto) {
		case IPPROTO_TCP:
			PULLUP_TO(hlen + sizeof(struct tcphdr));
			local->tcp = tcp = L3HDR(struct tcphdr, ip);
			local->dst_port = tcp->th_dport;
			local->src_port = tcp->th_sport;
			args->f_id.flags = tcp->th_flags;
			break;

		case IPPROTO_UDP:
			PULLUP_TO(hlen + sizeof(struct udphdr));
			udp = L3HDR(struct udphdr, ip);
			local->dst_port = udp->uh_dport;
			local->src_port = udp->uh_sport;
			break;

		case IPPROTO_ICMP:
			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
			break;

		default:
			break;
		}
	}

#undef PULLUP_TO

	/* f_id and local ports/addresses are kept in host byte order. */
	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
	args->f_id.src_port = local->src_port = ntohs(local->src_port);
	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
done:
	*ip0 = ip;
	return (m);
}
3369
3370 static struct mbuf *
3371 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3372     struct ip_fw_local *local, struct ip **ip0)
3373 {
3374         struct ip *ip = mtod(m, struct ip *);
3375
3376         ip->ip_len = htons(ip->ip_len);
3377         ip->ip_off = htons(ip->ip_off);
3378
3379         m->m_flags &= ~M_HASH;
3380         ip_hashfn(&m, 0);
3381         args->m = m;
3382         if (m == NULL) {
3383                 *ip0 = NULL;
3384                 return (NULL);
3385         }
3386         KASSERT(m->m_flags & M_HASH, ("no hash"));
3387
3388         /* 'm' might be changed by ip_hashfn(). */
3389         ip = mtod(m, struct ip *);
3390         ip->ip_len = ntohs(ip->ip_len);
3391         ip->ip_off = ntohs(ip->ip_off);
3392
3393         return (ipfw_setup_local(m, hlen, args, local, ip0));
3394 }
3395
3396 /*
3397  * The main check routine for the firewall.
3398  *
3399  * All arguments are in args so we can modify them and return them
3400  * back to the caller.
3401  *
3402  * Parameters:
3403  *
3404  *      args->m (in/out) The packet; we set to NULL when/if we nuke it.
3405  *              Starts with the IP header.
3406  *      args->eh (in)   Mac header if present, or NULL for layer3 packet.
3407  *      args->oif       Outgoing interface, or NULL if packet is incoming.
3408  *              The incoming interface is in the mbuf. (in)
3409  *
3410  *      args->rule      Pointer to the last matching rule (in/out)
3411  *      args->f_id      Addresses grabbed from the packet (out)
3412  *
3413  * Return value:
3414  *
3415  *      If the packet was denied/rejected and has been dropped, *m is equal
3416  *      to NULL upon return.
3417  *
3418  *      IP_FW_DENY      the packet must be dropped.
3419  *      IP_FW_PASS      The packet is to be accepted and routed normally.
3420  *      IP_FW_DIVERT    Divert the packet to port (args->cookie)
3421  *      IP_FW_TEE       Tee the packet to port (args->cookie)
3422  *      IP_FW_DUMMYNET  Send the packet to pipe/queue (args->cookie)
3423  *      IP_FW_CONTINUE  Continue processing on another cpu.
3424  */
3425 static int
3426 ipfw_chk(struct ip_fw_args *args)
3427 {
3428         /*
3429          * Local variables hold state during the processing of a packet.
3430          *
3431          * IMPORTANT NOTE: to speed up the processing of rules, there
3432          * are some assumption on the values of the variables, which
3433          * are documented here. Should you change them, please check
3434          * the implementation of the various instructions to make sure
3435          * that they still work.
3436          *
3437          * args->eh     The MAC header. It is non-null for a layer2
3438          *      packet, it is NULL for a layer-3 packet.
3439          *
3440          * m | args->m  Pointer to the mbuf, as received from the caller.
3441          *      It may change if ipfw_chk() does an m_pullup, or if it
3442          *      consumes the packet because it calls send_reject().
3443          *      XXX This has to change, so that ipfw_chk() never modifies
3444          *      or consumes the buffer.
3445          * ip   is simply an alias of the value of m, and it is kept
3446          *      in sync with it (the packet is  supposed to start with
3447          *      the ip header).
3448          */
3449         struct mbuf *m = args->m;
3450         struct ip *ip = mtod(m, struct ip *);
3451
3452         /*
3453          * oif | args->oif      If NULL, ipfw_chk has been called on the
3454          *      inbound path (ether_input, ip_input).
3455          *      If non-NULL, ipfw_chk has been called on the outbound path
3456          *      (ether_output, ip_output).
3457          */
3458         struct ifnet *oif = args->oif;
3459
3460         struct ip_fw *f = NULL;         /* matching rule */
3461         int retval = IP_FW_PASS;
3462         struct m_tag *mtag;
3463         struct divert_info *divinfo;
3464         struct ipfw_state *s;
3465
3466         /*
3467          * hlen The length of the IPv4 header.
3468          *      hlen >0 means we have an IPv4 packet.
3469          */
3470         u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
3471
3472         struct ip_fw_local lc;
3473
3474         /*
3475          * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3476          *      MATCH_NONE when checked and not matched (dyn_f = NULL),
3477          *      MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3478          */
3479         int dyn_dir = MATCH_UNKNOWN;
3480         struct ip_fw *dyn_f = NULL;
3481         int cpuid = mycpuid;
3482         struct ipfw_context *ctx;
3483
3484         ASSERT_NETISR_NCPUS(cpuid);
3485         ctx = ipfw_ctx[cpuid];
3486
3487         if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3488                 return IP_FW_PASS;      /* accept */
3489
3490         if (args->eh == NULL ||         /* layer 3 packet */
3491             (m->m_pkthdr.len >= sizeof(struct ip) &&
3492              ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3493                 hlen = ip->ip_hl << 2;
3494
3495         memset(&lc, 0, sizeof(lc));
3496
3497         m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3498         if (m == NULL)
3499                 goto pullup_failed;
3500
3501         if (args->rule) {
3502                 /*
3503                  * Packet has already been tagged. Look for the next rule
3504                  * to restart processing.
3505                  *
3506                  * If fw_one_pass != 0 then just accept it.
3507                  * XXX should not happen here, but optimized out in
3508                  * the caller.
3509                  */
3510                 if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3511                         return IP_FW_PASS;
3512                 args->flags &= ~IP_FWARG_F_CONT;
3513
3514                 /* This rule is being/has been flushed */
3515                 if (ipfw_flushing)
3516                         return IP_FW_DENY;
3517
3518                 KASSERT(args->rule->cpuid == cpuid,
3519                         ("rule used on cpu%d", cpuid));
3520
3521                 /* This rule was deleted */
3522                 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3523                         return IP_FW_DENY;
3524
3525                 if (args->xlat != NULL) {
3526                         struct ipfw_xlat *x = args->xlat;
3527
3528                         /* This xlat is being deleted. */
3529                         if (x->xlat_invalid)
3530                                 return IP_FW_DENY;
3531
3532                         f = args->rule;
3533
3534                         dyn_f = f;
3535                         dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3536                             MATCH_FORWARD : MATCH_REVERSE;
3537
3538                         if (args->flags & IP_FWARG_F_XLATINS) {
3539                                 KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3540                                     ("not slave %u state", x->xlat_type));
3541                                 s = ipfw_state_link(ctx, &x->xlat_st);
3542                                 if (s != NULL) {
3543                                         ctx->ipfw_xlate_conflicts++;
3544                                         if (IPFW_STATE_ISDEAD(s)) {
3545                                                 ipfw_state_remove(ctx, s);
3546                                                 s = ipfw_state_link(ctx,
3547                                                     &x->xlat_st);
3548                                         }
3549                                         if (s != NULL) {
3550                                                 if (bootverbose) {
3551                                                         kprintf("ipfw: "
3552                                                         "slave %u state "
3553                                                         "conflicts %u state\n",
3554                                                         x->xlat_type,
3555                                                         s->st_type);
3556                                                 }
3557                                                 ipfw_xlat_invalidate(x);
3558                                                 return IP_FW_DENY;
3559                                         }
3560                                         ctx->ipfw_xlate_cresolved++;
3561                                 }
3562                         } else {
3563                                 ipfw_state_update(&args->f_id, dyn_dir,
3564                                     lc.tcp, &x->xlat_st);
3565                         }
3566                 } else {
3567                         /* TODO: setup dyn_f, dyn_dir */
3568
3569                         f = args->rule->next_rule;
3570                         if (f == NULL)
3571                                 f = lookup_next_rule(args->rule);
3572                 }
3573         } else {
3574                 /*
3575                  * Find the starting rule. It can be either the first
3576                  * one, or the one after divert_rule if asked so.
3577                  */
3578                 int skipto;
3579
3580                 KKASSERT((args->flags &
3581                     (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3582                 KKASSERT(args->xlat == NULL);
3583
3584                 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3585                 if (mtag != NULL) {
3586                         divinfo = m_tag_data(mtag);
3587                         skipto = divinfo->skipto;
3588                 } else {
3589                         skipto = 0;
3590                 }
3591
3592                 f = ctx->ipfw_layer3_chain;
3593                 if (args->eh == NULL && skipto != 0) {
3594                         /* No skipto during rule flushing */
3595                         if (ipfw_flushing)
3596                                 return IP_FW_DENY;
3597
3598                         if (skipto >= IPFW_DEFAULT_RULE)
3599                                 return IP_FW_DENY; /* invalid */
3600
3601                         while (f && f->rulenum <= skipto)
3602                                 f = f->next;
3603                         if (f == NULL)  /* drop packet */
3604                                 return IP_FW_DENY;
3605                 } else if (ipfw_flushing) {
3606                         /* Rules are being flushed; skip to default rule */
3607                         f = ctx->ipfw_default_rule;
3608                 }
3609         }
3610         if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3611                 m_tag_delete(m, mtag);
3612
3613         /*
3614          * Now scan the rules, and parse microinstructions for each rule.
3615          */
3616         for (; f; f = f->next) {
3617                 int l, cmdlen;
3618                 ipfw_insn *cmd;
3619                 int skip_or; /* skip rest of OR block */
3620
3621 again:
3622                 if (ctx->ipfw_set_disable & (1 << f->set)) {
3623                         args->xlat = NULL;
3624                         continue;
3625                 }
3626
3627                 if (args->xlat != NULL) {
3628                         args->xlat = NULL;
3629                         l = f->cmd_len - f->act_ofs;
3630                         cmd = ACTION_PTR(f);
3631                 } else {
3632                         l = f->cmd_len;
3633                         cmd = f->cmd;
3634                 }
3635
3636                 skip_or = 0;
3637                 for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3638                         int match;
3639
3640                         /*
3641                          * check_body is a jump target used when we find a
3642                          * CHECK_STATE, and need to jump to the body of
3643                          * the target rule.
3644                          */
3645 check_body:
3646                         cmdlen = F_LEN(cmd);
3647                         /*
3648                          * An OR block (insn_1 || .. || insn_n) has the
3649                          * F_OR bit set in all but the last instruction.
3650                          * The first match will set "skip_or", and cause
3651                          * the following instructions to be skipped until
3652                          * past the one with the F_OR bit clear.
3653                          */
3654                         if (skip_or) {          /* skip this instruction */
3655                                 if ((cmd->len & F_OR) == 0)
3656                                         skip_or = 0;    /* next one is good */
3657                                 continue;
3658                         }
3659                         match = 0; /* set to 1 if we succeed */
3660
3661                         switch (cmd->opcode) {
3662                         /*
3663                          * The first set of opcodes compares the packet's
3664                          * fields with some pattern, setting 'match' if a
3665                          * match is found. At the end of the loop there is
3666                          * logic to deal with F_NOT and F_OR flags associated
3667                          * with the opcode.
3668                          */
3669                         case O_NOP:
3670                                 match = 1;
3671                                 break;
3672
3673                         case O_FORWARD_MAC:
3674                                 kprintf("ipfw: opcode %d unimplemented\n",
3675                                         cmd->opcode);
3676                                 break;
3677
3678                         case O_GID:
3679                         case O_UID:
3680                                 /*
3681                                  * We only check offset == 0 && proto != 0,
3682                                  * as this ensures that we have an IPv4
3683                                  * packet with the ports info.
3684                                  */
3685                                 if (lc.offset!=0)
3686                                         break;
3687
3688                                 match = ipfw_match_uid(&args->f_id, oif,
3689                                         cmd->opcode,
3690                                         (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3691                                 break;
3692
3693                         case O_RECV:
3694                                 match = iface_match(m->m_pkthdr.rcvif,
3695                                     (ipfw_insn_if *)cmd);
3696                                 break;
3697
3698                         case O_XMIT:
3699                                 match = iface_match(oif, (ipfw_insn_if *)cmd);
3700                                 break;
3701
3702                         case O_VIA:
3703                                 match = iface_match(oif ? oif :
3704                                     m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3705                                 break;
3706
3707                         case O_MACADDR2:
3708                                 if (args->eh != NULL) { /* have MAC header */
3709                                         uint32_t *want = (uint32_t *)
3710                                                 ((ipfw_insn_mac *)cmd)->addr;
3711                                         uint32_t *mask = (uint32_t *)
3712                                                 ((ipfw_insn_mac *)cmd)->mask;
3713                                         uint32_t *hdr = (uint32_t *)args->eh;
3714
3715                                         match =
3716                                         (want[0] == (hdr[0] & mask[0]) &&
3717                                          want[1] == (hdr[1] & mask[1]) &&
3718                                          want[2] == (hdr[2] & mask[2]));
3719                                 }
3720                                 break;
3721
3722                         case O_MAC_TYPE:
3723                                 if (args->eh != NULL) {
3724                                         uint16_t t =
3725                                             ntohs(args->eh->ether_type);
3726                                         uint16_t *p =
3727                                             ((ipfw_insn_u16 *)cmd)->ports;
3728                                         int i;
3729
3730                                         /* Special vlan handling */
3731                                         if (m->m_flags & M_VLANTAG)
3732                                                 t = ETHERTYPE_VLAN;
3733
3734                                         for (i = cmdlen - 1; !match && i > 0;
3735                                              i--, p += 2) {
3736                                                 match =
3737                                                 (t >= p[0] && t <= p[1]);
3738                                         }
3739                                 }
3740                                 break;
3741
3742                         case O_FRAG:
3743                                 match = (hlen > 0 && lc.offset != 0);
3744                                 break;
3745
3746                         case O_IPFRAG:
3747                                 if (hlen > 0) {
3748                                         uint16_t off;
3749
3750                                         if (args->eh != NULL)
3751                                                 off = ntohs(ip->ip_off);
3752                                         else
3753                                                 off = ip->ip_off;
3754                                         if (off & (IP_MF | IP_OFFMASK))
3755                                                 match = 1;
3756                                 }
3757                                 break;
3758
3759                         case O_IN:      /* "out" is "not in" */
3760                                 match = (oif == NULL);
3761                                 break;
3762
3763                         case O_LAYER2:
3764                                 match = (args->eh != NULL);
3765                                 break;
3766
3767                         case O_PROTO:
3768                                 /*
3769                                  * We do not allow an arg of 0 so the
3770                                  * check of "proto" only suffices.
3771                                  */
3772                                 match = (lc.proto == cmd->arg1);
3773                                 break;
3774
3775                         case O_IP_SRC:
3776                                 match = (hlen > 0 &&
3777                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3778                                     lc.src_ip.s_addr);
3779                                 break;
3780
3781                         case O_IP_SRC_MASK:
3782                                 match = (hlen > 0 &&
3783                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3784                                      (lc.src_ip.s_addr &
3785                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3786                                 break;
3787
3788                         case O_IP_SRC_ME:
3789                                 if (hlen > 0) {
3790                                         struct ifnet *tif;
3791
3792                                         tif = INADDR_TO_IFP(&lc.src_ip);
3793                                         match = (tif != NULL);
3794                                 }
3795                                 break;
3796
3797                         case O_IP_SRC_TABLE:
3798                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3799                                     &lc.src_ip);
3800                                 break;
3801
3802                         case O_IP_SRC_IFIP:
3803                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3804                                     &lc.src_ip);
3805                                 break;
3806
3807                         case O_IP_DST_SET:
3808                         case O_IP_SRC_SET:
3809                                 if (hlen > 0) {
3810                                         uint32_t *d = (uint32_t *)(cmd + 1);
3811                                         uint32_t addr =
3812                                             cmd->opcode == O_IP_DST_SET ?
3813                                                 args->f_id.dst_ip :
3814                                                 args->f_id.src_ip;
3815
3816                                         if (addr < d[0])
3817                                                 break;
3818                                         addr -= d[0]; /* subtract base */
3819                                         match =
3820                                         (addr < cmd->arg1) &&
3821                                          (d[1 + (addr >> 5)] &
3822                                           (1 << (addr & 0x1f)));
3823                                 }
3824                                 break;
3825
3826                         case O_IP_DST:
3827                                 match = (hlen > 0 &&
3828                                     ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3829                                     lc.dst_ip.s_addr);
3830                                 break;
3831
3832                         case O_IP_DST_MASK:
3833                                 match = (hlen > 0) &&
3834                                     (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3835                                      (lc.dst_ip.s_addr &
3836                                      ((ipfw_insn_ip *)cmd)->mask.s_addr));
3837                                 break;
3838
3839                         case O_IP_DST_ME:
3840                                 if (hlen > 0) {
3841                                         struct ifnet *tif;
3842
3843                                         tif = INADDR_TO_IFP(&lc.dst_ip);
3844                                         match = (tif != NULL);
3845                                 }
3846                                 break;
3847
3848                         case O_IP_DST_TABLE:
3849                                 match = ipfw_table_lookup(ctx, cmd->arg1,
3850                                     &lc.dst_ip);
3851                                 break;
3852
3853                         case O_IP_DST_IFIP:
3854                                 match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3855                                     &lc.dst_ip);
3856                                 break;
3857
3858                         case O_IP_SRCPORT:
3859                         case O_IP_DSTPORT:
3860                                 /*
3861                                  * offset == 0 && proto != 0 is enough
3862                                  * to guarantee that we have an IPv4
3863                                  * packet with port info.
3864                                  */
3865                                 if ((lc.proto==IPPROTO_UDP ||
3866                                      lc.proto==IPPROTO_TCP)
3867                                     && lc.offset == 0) {
3868                                         uint16_t x =
3869                                             (cmd->opcode == O_IP_SRCPORT) ?
3870                                                 lc.src_port : lc.dst_port;
3871                                         uint16_t *p =
3872                                             ((ipfw_insn_u16 *)cmd)->ports;
3873                                         int i;
3874
3875                                         for (i = cmdlen - 1; !match && i > 0;
3876                                              i--, p += 2) {
3877                                                 match =
3878                                                 (x >= p[0] && x <= p[1]);
3879                                         }
3880                                 }
3881                                 break;
3882
3883                         case O_ICMPTYPE:
3884                                 match = (lc.offset == 0 &&
3885                                     lc.proto==IPPROTO_ICMP &&
3886                                     icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3887                                 break;
3888
3889                         case O_IPOPT:
3890                                 match = (hlen > 0 && ipopts_match(ip, cmd));
3891                                 break;
3892
3893                         case O_IPVER:
3894                                 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3895                                 break;
3896
3897                         case O_IPTTL:
3898                                 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3899                                 break;
3900
3901                         case O_IPID:
3902                                 match = (hlen > 0 &&
3903                                     cmd->arg1 == ntohs(ip->ip_id));
3904                                 break;
3905
3906                         case O_IPLEN:
3907                                 match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3908                                 break;
3909
3910                         case O_IPPRECEDENCE:
3911                                 match = (hlen > 0 &&
3912                                     (cmd->arg1 == (ip->ip_tos & 0xe0)));
3913                                 break;
3914
3915                         case O_IPTOS:
3916                                 match = (hlen > 0 &&
3917                                     flags_match(cmd, ip->ip_tos));
3918                                 break;
3919
3920                         case O_TCPFLAGS:
3921                                 match = (lc.proto == IPPROTO_TCP &&
3922                                     lc.offset == 0 &&
3923                                     flags_match(cmd,
3924                                         L3HDR(struct tcphdr,ip)->th_flags));
3925                                 break;
3926
3927                         case O_TCPOPTS:
3928                                 match = (lc.proto == IPPROTO_TCP &&
3929                                     lc.offset == 0 && tcpopts_match(ip, cmd));
3930                                 break;
3931
3932                         case O_TCPSEQ:
3933                                 match = (lc.proto == IPPROTO_TCP &&
3934                                     lc.offset == 0 &&
3935                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3936                                         L3HDR(struct tcphdr,ip)->th_seq);
3937                                 break;
3938
3939                         case O_TCPACK:
3940                                 match = (lc.proto == IPPROTO_TCP &&
3941                                     lc.offset == 0 &&
3942                                     ((ipfw_insn_u32 *)cmd)->d[0] ==
3943                                         L3HDR(struct tcphdr,ip)->th_ack);
3944                                 break;
3945
3946                         case O_TCPWIN:
3947                                 match = (lc.proto == IPPROTO_TCP &&
3948                                     lc.offset == 0 &&
3949                                     cmd->arg1 ==
3950                                         L3HDR(struct tcphdr,ip)->th_win);
3951                                 break;
3952
3953                         case O_ESTAB:
3954                                 /* reject packets which have SYN only */
3955                                 /* XXX should i also check for TH_ACK ? */
3956                                 match = (lc.proto == IPPROTO_TCP &&
3957                                     lc.offset == 0 &&
3958                                     (L3HDR(struct tcphdr,ip)->th_flags &
3959                                      (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3960                                 break;
3961
3962                         case O_LOG:
3963                                 if (fw_verbose) {
3964                                         ipfw_log(ctx, f, hlen, args->eh, m,
3965                                             oif);
3966                                 }
3967                                 match = 1;
3968                                 break;
3969
3970                         case O_PROB:
3971                                 match = (krandom() <
3972                                         ((ipfw_insn_u32 *)cmd)->d[0]);
3973                                 break;
3974
3975                         /*
3976                          * The second set of opcodes represents 'actions',
3977                          * i.e. the terminal part of a rule once the packet
3978                          * matches all previous patterns.
3979                          * Typically there is only one action for each rule,
3980                          * and the opcode is stored at the end of the rule
3981                          * (but there are exceptions -- see below).
3982                          *
3983                          * In general, here we set retval and terminate the
3984                          * outer loop (would be a 'break 3' in some language,
3985                          * but we need to do a 'goto done').
3986                          *
3987                          * Exceptions:
3988                          * O_COUNT and O_SKIPTO actions:
3989                          *   instead of terminating, we jump to the next rule
3990                          *   ('goto next_rule', equivalent to a 'break 2'),
3991                          *   or to the SKIPTO target ('goto again' after
3992                          *   having set f, cmd and l), respectively.
3993                          *
3994                          * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
3995                          *   are not real 'actions', and are stored right
3996                          *   before the 'action' part of the rule.
3997                          *   These opcodes try to install an entry in the
3998                          *   state tables; if successful, we continue with
3999                          *   the next opcode (match=1; break;), otherwise
4000                          *   the packet must be dropped ('goto done' after
4001                          *   setting retval).  If static rules are changed
4002                          *   during the state installation, the packet will
4003                          *   be dropped and rule's stats will not beupdated
4004                          *   ('return IP_FW_DENY').
4005                          *
4006                          * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4007                          *   cause a lookup of the state table, and a jump
4008                          *   to the 'action' part of the parent rule
4009                          *   ('goto check_body') if an entry is found, or
4010                          *   (CHECK_STATE only) a jump to the next rule if
4011                          *   the entry is not found ('goto next_rule').
4012                          *   The result of the lookup is cached to make
4013                          *   further instances of these opcodes are
4014                          *   effectively NOPs.  If static rules are changed
4015                          *   during the state looking up, the packet will
4016                          *   be dropped and rule's stats will not be updated
4017                          *   ('return IP_FW_DENY').
4018                          */
4019                         case O_REDIRECT:
4020                                 if (f->cross_rules == NULL) {
4021                                         /*
4022                                          * This rule was not completely setup;
4023                                          * move on to the next rule.
4024                                          */
4025                                         goto next_rule;
4026                                 }
4027                                 /*
4028                                  * Apply redirect only on input path and
4029                                  * only to non-fragment TCP segments or
4030                                  * UDP datagrams.
4031                                  *
4032                                  * Does _not_ work with layer2 filtering.
4033                                  */
4034                                 if (oif != NULL || args->eh != NULL ||
4035                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4036                                     (lc.proto != IPPROTO_TCP &&
4037                                      lc.proto != IPPROTO_UDP))
4038                                         break;
4039                                 /* FALL THROUGH */
4040                         case O_LIMIT:
4041                         case O_KEEP_STATE:
4042                                 if (hlen == 0)
4043                                         break;
4044                                 s = ipfw_state_install(ctx, f,
4045                                     (ipfw_insn_limit *)cmd, args, lc.tcp);
4046                                 if (s == NULL) {
4047                                         retval = IP_FW_DENY;
4048                                         goto done; /* error/limit violation */
4049                                 }
4050                                 s->st_pcnt++;
4051                                 s->st_bcnt += lc.ip_len;
4052
4053                                 if (s->st_type == O_REDIRECT) {
4054                                         struct in_addr oaddr;
4055                                         uint16_t oport;
4056                                         struct ipfw_xlat *slave_x, *x;
4057                                         struct ipfw_state *dup;
4058
4059                                         x = (struct ipfw_xlat *)s;
4060                                         ipfw_xlate(x, m, &oaddr, &oport);
4061                                         m = ipfw_rehashm(m, hlen, args, &lc,
4062                                             &ip);
4063                                         if (m == NULL) {
4064                                                 ipfw_state_del(ctx, s);
4065                                                 goto pullup_failed;
4066                                         }
4067
4068                                         cpuid = netisr_hashcpu(
4069                                             m->m_pkthdr.hash);
4070
4071                                         slave_x = (struct ipfw_xlat *)
4072                                             ipfw_state_alloc(ctx, &args->f_id,
4073                                             O_REDIRECT, f->cross_rules[cpuid],
4074                                             lc.tcp);
4075                                         if (slave_x == NULL) {
4076                                                 ipfw_state_del(ctx, s);
4077                                                 retval = IP_FW_DENY;
4078                                                 goto done;
4079                                         }
4080                                         slave_x->xlat_addr = oaddr.s_addr;
4081                                         slave_x->xlat_port = oport;
4082                                         slave_x->xlat_dir = MATCH_REVERSE;
4083                                         slave_x->xlat_flags |=
4084                                             IPFW_STATE_F_XLATSRC |
4085                                             IPFW_STATE_F_XLATSLAVE;
4086
4087                                         slave_x->xlat_pair = x;
4088                                         slave_x->xlat_pcpu = mycpuid;
4089                                         x->xlat_pair = slave_x;
4090                                         x->xlat_pcpu = cpuid;
4091
4092                                         ctx->ipfw_xlated++;
4093                                         if (cpuid != mycpuid) {
4094                                                 ctx->ipfw_xlate_split++;
4095                                                 ipfw_xlate_redispatch(
4096                                                     m, cpuid, x,
4097                                                     IPFW_XLATE_INSERT |
4098                                                     IPFW_XLATE_FORWARD);
4099                                                 args->m = NULL;
4100                                                 return (IP_FW_REDISPATCH);
4101                                         }
4102
4103                                         dup = ipfw_state_link(ctx,
4104                                             &slave_x->xlat_st);
4105                                         if (dup != NULL) {
4106                                                 ctx->ipfw_xlate_conflicts++;
4107                                                 if (IPFW_STATE_ISDEAD(dup)) {
4108                                                         ipfw_state_remove(ctx,
4109                                                             dup);
4110                                                         dup = ipfw_state_link(
4111                                                         ctx, &slave_x->xlat_st);
4112                                                 }
4113                                                 if (dup != NULL) {
4114                                                         if (bootverbose) {
4115                                                             kprintf("ipfw: "
4116                                                             "slave %u state "
4117                                                             "conflicts "
4118                                                             "%u state\n",
4119                                                             x->xlat_type,
4120                                                             s->st_type);
4121                                                         }
4122                                                         ipfw_state_del(ctx, s);
4123                                                         return (IP_FW_DENY);
4124                                                 }
4125                                                 ctx->ipfw_xlate_cresolved++;
4126                                         }
4127                                 }
4128                                 match = 1;
4129                                 break;
4130
4131                         case O_PROBE_STATE:
4132                         case O_CHECK_STATE:
4133                                 /*
4134                                  * States are checked at the first keep-state 
4135                                  * check-state occurrence, with the result
4136                                  * being stored in dyn_dir.  The compiler
4137                                  * introduces a PROBE_STATE instruction for
4138                                  * us when we have a KEEP_STATE/LIMIT/RDR
4139                                  * (because PROBE_STATE needs to be run first).
4140                                  */
4141                                 s = NULL;
4142                                 if (dyn_dir == MATCH_UNKNOWN) {
4143                                         s = ipfw_state_lookup(ctx,
4144                                             &args->f_id, &dyn_dir, lc.tcp);
4145                                 }
4146                                 if (s == NULL ||
4147                                     (s->st_type == O_REDIRECT &&
4148                                      (args->eh != NULL ||
4149                                       (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4150                                       (lc.proto != IPPROTO_TCP &&
4151                                        lc.proto != IPPROTO_UDP)))) {
4152                                         /*
4153                                          * State not found. If CHECK_STATE,
4154                                          * skip to next rule, if PROBE_STATE
4155                                          * just ignore and continue with next
4156                                          * opcode.
4157                                          */
4158                                         if (cmd->opcode == O_CHECK_STATE)
4159                                                 goto next_rule;
4160                                         match = 1;
4161                                         break;
4162                                 }
4163
4164                                 s->st_pcnt++;
4165                                 s->st_bcnt += lc.ip_len;
4166
4167                                 if (s->st_type == O_REDIRECT) {
4168                                         struct ipfw_xlat *x =
4169                                             (struct ipfw_xlat *)s;
4170
4171                                         if (oif != NULL &&
4172                                             x->xlat_ifp == NULL) {
4173                                                 KASSERT(x->xlat_flags &
4174                                                     IPFW_STATE_F_XLATSLAVE,
4175                                                     ("master rdr state "
4176                                                      "missing ifp"));
4177                                                 x->xlat_ifp = oif;
4178                                         } else if (
4179                                             (oif != NULL && x->xlat_ifp!=oif) ||
4180                                             (oif == NULL &&
4181                                              x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4182                                                 retval = IP_FW_DENY;
4183                                                 goto done;
4184                                         }
4185                                         if (x->xlat_dir != dyn_dir)
4186                                                 goto skip_xlate;
4187
4188                                         ipfw_xlate(x, m, NULL, NULL);
4189                                         m = ipfw_rehashm(m, hlen, args, &lc,
4190                                             &ip);
4191                                         if (m == NULL)
4192                                                 goto pullup_failed;
4193
4194                                         cpuid = netisr_hashcpu(
4195                                             m->m_pkthdr.hash);
4196                                         if (cpuid != mycpuid) {
4197                                                 uint32_t xlate = 0;
4198
4199                                                 if (oif != NULL) {
4200                                                         xlate |=
4201                                                             IPFW_XLATE_OUTPUT;
4202                                                 }
4203                                                 if (dyn_dir == MATCH_FORWARD) {
4204                                                         xlate |=
4205                                                             IPFW_XLATE_FORWARD;
4206                                                 }
4207                                                 ipfw_xlate_redispatch(m, cpuid,
4208                                                     x, xlate);
4209                                                 args->m = NULL;
4210                                                 return (IP_FW_REDISPATCH);
4211                                         }
4212
4213                                         KKASSERT(x->xlat_pcpu == mycpuid);
4214                                         ipfw_state_update(&args->f_id, dyn_dir,
4215                                             lc.tcp, &x->xlat_pair->xlat_st);
4216                                 }
4217 skip_xlate:
4218                                 /*
4219                                  * Found a rule from a state; jump to the
4220                                  * 'action' part of the rule.
4221                                  */
4222                                 f = s->st_rule;
4223                                 KKASSERT(f->cpuid == mycpuid);
4224
4225                                 cmd = ACTION_PTR(f);
4226                                 l = f->cmd_len - f->act_ofs;
4227                                 dyn_f = f;
4228                                 goto check_body;
4229
4230                         case O_ACCEPT:
4231                                 retval = IP_FW_PASS;    /* accept */
4232                                 goto done;
4233
4234                         case O_DEFRAG:
4235                                 if (f->cross_rules == NULL) {
4236                                         /*
4237                                          * This rule was not completely setup;
4238                                          * move on to the next rule.
4239                                          */
4240                                         goto next_rule;
4241                                 }
4242
4243                                 /*
4244                                  * Don't defrag for l2 packets, output packets
4245                                  * or non-fragments.
4246                                  */
4247                                 if (oif != NULL || args->eh != NULL ||
4248                                     (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4249                                         goto next_rule;
4250
4251                                 ctx->ipfw_frags++;
4252                                 m = ip_reass(m);
4253                                 args->m = m;
4254                                 if (m == NULL) {
4255                                         retval = IP_FW_PASS;
4256                                         goto done;
4257                                 }
4258                                 ctx->ipfw_defraged++;
4259                                 KASSERT((m->m_flags & M_HASH) == 0,
4260                                     ("hash not cleared"));
4261
4262                                 /* Update statistics */
4263                                 f->pcnt++;
4264                                 f->bcnt += lc.ip_len;
4265                                 f->timestamp = time_second;
4266
4267                                 ip = mtod(m, struct ip *);
4268                                 hlen = ip->ip_hl << 2;
4269                                 ip->ip_len += hlen;
4270
4271                                 ip->ip_len = htons(ip->ip_len);
4272                                 ip->ip_off = htons(ip->ip_off);
4273
4274                                 ip_hashfn(&m, 0);
4275                                 args->m = m;
4276                                 if (m == NULL)
4277                                         goto pullup_failed;
4278
4279                                 KASSERT(m->m_flags & M_HASH, ("no hash"));
4280                                 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4281                                 if (cpuid != mycpuid) {
4282                                         /*
4283                                          * NOTE:
4284                                          * ip_len/ip_off are in network byte
4285                                          * order.
4286                                          */
4287                                         ctx->ipfw_defrag_remote++;
4288                                         ipfw_defrag_redispatch(m, cpuid, f);
4289                                         args->m = NULL;
4290                                         return (IP_FW_REDISPATCH);
4291                                 }
4292
4293                                 /* 'm' might be changed by ip_hashfn(). */
4294                                 ip = mtod(m, struct ip *);
4295                                 ip->ip_len = ntohs(ip->ip_len);
4296                                 ip->ip_off = ntohs(ip->ip_off);
4297
4298                                 m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4299                                 if (m == NULL)
4300                                         goto pullup_failed;
4301
4302                                 /* Move on. */
4303                                 goto next_rule;
4304
4305                         case O_PIPE:
4306                         case O_QUEUE:
4307                                 args->rule = f; /* report matching rule */
4308                                 args->cookie = cmd->arg1;
4309                                 retval = IP_FW_DUMMYNET;
4310                                 goto done;
4311
4312                         case O_DIVERT:
4313                         case O_TEE:
4314                                 if (args->eh) /* not on layer 2 */
4315                                         break;
4316
4317                                 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4318                                     sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4319                                 if (mtag == NULL) {
4320                                         retval = IP_FW_DENY;
4321                                         goto done;
4322                                 }
4323                                 divinfo = m_tag_data(mtag);
4324
4325                                 divinfo->skipto = f->rulenum;
4326                                 divinfo->port = cmd->arg1;
4327                                 divinfo->tee = (cmd->opcode == O_TEE);
4328                                 m_tag_prepend(m, mtag);
4329
4330                                 args->cookie = cmd->arg1;
4331                                 retval = (cmd->opcode == O_DIVERT) ?
4332                                          IP_FW_DIVERT : IP_FW_TEE;
4333                                 goto done;
4334
4335                         case O_COUNT:
4336                         case O_SKIPTO:
4337                                 f->pcnt++;      /* update stats */
4338                                 f->bcnt += lc.ip_len;
4339                                 f->timestamp = time_second;
4340                                 if (cmd->opcode == O_COUNT)
4341                                         goto next_rule;
4342                                 /* handle skipto */
4343                                 if (f->next_rule == NULL)
4344                                         lookup_next_rule(f);
4345                                 f = f->next_rule;
4346                                 goto again;
4347
4348                         case O_REJECT:
4349                                 /*
4350                                  * Drop the packet and send a reject notice
4351                                  * if the packet is not ICMP (or is an ICMP
4352                                  * query), and it is not multicast/broadcast.
4353                                  */
4354                                 if (hlen > 0 &&
4355                                     (lc.proto != IPPROTO_ICMP ||
4356                                      is_icmp_query(ip)) &&
4357                                     !(m->m_flags & (M_BCAST|M_MCAST)) &&
4358                                     !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4359                                         send_reject(args, cmd->arg1,
4360                                             lc.offset, lc.ip_len);
4361                                         retval = IP_FW_DENY;
4362                                         goto done;
4363                                 }
4364                                 /* FALLTHROUGH */
4365                         case O_DENY:
4366                                 retval = IP_FW_DENY;
4367                                 goto done;
4368
4369                         case O_FORWARD_IP:
4370                                 if (args->eh)   /* not valid on layer2 pkts */
4371                                         break;
4372                                 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4373                                         struct sockaddr_in *sin;
4374
4375                                         mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4376                                             sizeof(*sin), M_INTWAIT | M_NULLOK);
4377                                         if (mtag == NULL) {
4378                                                 retval = IP_FW_DENY;
4379                                                 goto done;
4380                                         }
4381                                         sin = m_tag_data(mtag);
4382
4383                                         /* Structure copy */
4384                                         *sin = ((ipfw_insn_sa *)cmd)->sa;
4385
4386                                         m_tag_prepend(m, mtag);
4387                                         m->m_pkthdr.fw_flags |=
4388                                                 IPFORWARD_MBUF_TAGGED;
4389                                         m->m_pkthdr.fw_flags &=
4390                                                 ~BRIDGE_MBUF_TAGGED;
4391                                 }
4392                                 retval = IP_FW_PASS;
4393                                 goto done;
4394
4395                         default:
4396                                 panic("-- unknown opcode %d", cmd->opcode);
4397                         } /* end of switch() on opcodes */
4398
4399                         if (cmd->len & F_NOT)
4400                                 match = !match;
4401
4402                         if (match) {
4403                                 if (cmd->len & F_OR)
4404                                         skip_or = 1;
4405                         } else {
4406                                 if (!(cmd->len & F_OR)) /* not an OR block, */
4407                                         break;          /* try next rule    */
4408                         }
4409
4410                 }       /* end of inner for, scan opcodes */
4411
4412 next_rule:;             /* try next rule                */
4413
4414         }               /* end of outer for, scan rules */
4415         kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4416         return IP_FW_DENY;
4417
4418 done:
4419         /* Update statistics */
4420         f->pcnt++;
4421         f->bcnt += lc.ip_len;
4422         f->timestamp = time_second;
4423         return retval;
4424
4425 pullup_failed:
4426         if (fw_verbose)
4427                 kprintf("pullup failed\n");
4428         return IP_FW_DENY;
4429 }
4430
4431 static struct mbuf *
4432 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4433 {
4434         struct m_tag *mtag;
4435         struct dn_pkt *pkt;
4436         ipfw_insn *cmd;
4437         const struct ipfw_flow_id *id;
4438         struct dn_flow_id *fid;
4439
4440         M_ASSERTPKTHDR(m);
4441
4442         mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4443             M_INTWAIT | M_NULLOK);
4444         if (mtag == NULL) {
4445                 m_freem(m);
4446                 return (NULL);
4447         }
4448         m_tag_prepend(m, mtag);
4449
4450         pkt = m_tag_data(mtag);
4451         bzero(pkt, sizeof(*pkt));
4452
4453         cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4454         if (cmd->opcode == O_LOG)
4455                 cmd += F_LEN(cmd);
4456         KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4457                 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4458
4459         pkt->dn_m = m;
4460         pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4461         pkt->ifp = fwa->oif;
4462         pkt->pipe_nr = pipe_nr;
4463
4464         pkt->cpuid = mycpuid;
4465         pkt->msgport = netisr_curport();
4466
4467         id = &fwa->f_id;
4468         fid = &pkt->id;
4469         fid->fid_dst_ip = id->dst_ip;
4470         fid->fid_src_ip = id->src_ip;
4471         fid->fid_dst_port = id->dst_port;
4472         fid->fid_src_port = id->src_port;
4473         fid->fid_proto = id->proto;
4474         fid->fid_flags = id->flags;
4475
4476         ipfw_ref_rule(fwa->rule);
4477         pkt->dn_priv = fwa->rule;
4478         pkt->dn_unref_priv = ipfw_unref_rule;
4479
4480         if (cmd->opcode == O_PIPE)
4481                 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4482
4483         m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4484         return (m);
4485 }
4486
4487 /*
4488  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4489  * These will be reconstructed on the fly as packets are matched.
4490  */
4491 static void
4492 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4493 {
4494         struct ip_fw *rule;
4495
4496         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4497                 rule->next_rule = NULL;
4498 }
4499
4500 static void
4501 ipfw_inc_static_count(struct ip_fw *rule)
4502 {
4503         /* Static rule's counts are updated only on CPU0 */
4504         KKASSERT(mycpuid == 0);
4505
4506         static_count++;
4507         static_ioc_len += IOC_RULESIZE(rule);
4508 }
4509
4510 static void
4511 ipfw_dec_static_count(struct ip_fw *rule)
4512 {
4513         int l = IOC_RULESIZE(rule);
4514
4515         /* Static rule's counts are updated only on CPU0 */
4516         KKASSERT(mycpuid == 0);
4517
4518         KASSERT(static_count > 0, ("invalid static count %u", static_count));
4519         static_count--;
4520
4521         KASSERT(static_ioc_len >= l,
4522                 ("invalid static len %u", static_ioc_len));
4523         static_ioc_len -= l;
4524 }
4525
4526 static void
4527 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4528 {
4529         if (fwmsg->sibling != NULL) {
4530                 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4531                 fwmsg->sibling->sibling = rule;
4532         }
4533         fwmsg->sibling = rule;
4534 }
4535
4536 static struct ip_fw *
4537 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4538 {
4539         struct ip_fw *rule;
4540
4541         rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4542
4543         rule->act_ofs = ioc_rule->act_ofs;
4544         rule->cmd_len = ioc_rule->cmd_len;
4545         rule->rulenum = ioc_rule->rulenum;
4546         rule->set = ioc_rule->set;
4547         rule->usr_flags = ioc_rule->usr_flags;
4548
4549         bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4550
4551         rule->refcnt = 1;
4552         rule->cpuid = mycpuid;
4553         rule->rule_flags = rule_flags;
4554
4555         return rule;
4556 }
4557
/*
 * Per-CPU dispatch for rule installation: duplicate the ioctl rule on
 * this CPU, splice it into the pre-computed position in this CPU's
 * rule chain, then forward the message to the next CPU.
 */
static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

	/*
	 * Insert rule into the pre-determined position
	 */
	if (fwmsg->prev_rule != NULL) {
		struct ip_fw *prev, *next;

		prev = fwmsg->prev_rule;
		KKASSERT(prev->cpuid == mycpuid);

		next = fwmsg->next_rule;
		KKASSERT(next->cpuid == mycpuid);

		rule->next = next;
		prev->next = rule;

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		fwmsg->prev_rule = prev->sibling;
		fwmsg->next_rule = next->sibling;
	} else {
		/* Insertion at the head; next_rule is unused then. */
		KKASSERT(fwmsg->next_rule == NULL);
		rule->next = ctx->ipfw_layer3_chain;
		ctx->ipfw_layer3_chain = rule;
	}

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, rule);

	/* The chain changed; invalidate cached next_rule pointers. */
	ipfw_flush_rule_ptrs(ctx);

	if (mycpuid == 0) {
		/* Statistics only need to be updated once */
		ipfw_inc_static_count(rule);

		/* Return the rule on CPU0 */
		nmsg->lmsg.u.ms_resultp = rule;
	}

	/*
	 * Every CPU uses the CPU0 rule pointer (stored into
	 * ms_resultp above, before the message left CPU0) as the
	 * shared track rule id.
	 */
	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

	if (fwmsg->cross_rules != NULL) {
		/* Save rules for later use. */
		fwmsg->cross_rules[mycpuid] = rule;
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4619
4620 static void
4621 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4622 {
4623         struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4624         struct ip_fw *rule = fwmsg->sibling;
4625         int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4626
4627         ASSERT_NETISR_NCPUS(mycpuid);
4628         KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4629             ("not crossref rule"));
4630
4631         rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4632         memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4633
4634         fwmsg->sibling = rule->sibling;
4635         netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4636 }
4637
4638 /*
4639  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4640  * then possibly create a rule number and add the rule to the list.
4641  * Update the rule_number in the input struct so the caller knows
4642  * it as well.
4643  */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_ipfw fwmsg;
	struct ip_fw *f, *prev, *rule;

	ASSERT_NETISR0;

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add rule number incremental step.
	 */
	if (ioc_rule->rulenum == 0) {
		int step = autoinc_step;

		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
			 step <= IPFW_AUTOINC_STEP_MAX);

		/*
		 * Locate the highest numbered rule before default
		 */
		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			ioc_rule->rulenum = f->rulenum;
		}
		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
			ioc_rule->rulenum += step;
	}
	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
		ioc_rule->rulenum != 0,
		("invalid rule num %d", ioc_rule->rulenum));

	/*
	 * Now find the right place for the new rule in the sorted list.
	 */
	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
	     prev = f, f = f->next) {
		if (f->rulenum > ioc_rule->rulenum) {
			/* Found the location */
			break;
		}
	}
	KASSERT(f != NULL, ("no default rule?!"));

	/*
	 * Duplicate the rule onto each CPU.
	 * The rule duplicated on CPU0 will be returned.
	 */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_add_rule_dispatch);
	fwmsg.ioc_rule = ioc_rule;
	fwmsg.prev_rule = prev;
	/* next_rule only matters when not inserting at the head. */
	fwmsg.next_rule = prev == NULL ? NULL : f;
	fwmsg.rule_flags = rule_flags;
	if (rule_flags & IPFW_RULE_F_CROSSREF) {
		/* Scratch array to collect each CPU's rule copy. */
		fwmsg.cross_rules = kmalloc(
		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
		    M_WAITOK | M_ZERO);
	}

	netisr_domsg_global(&fwmsg.base);
	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

	rule = fwmsg.base.lmsg.u.ms_resultp;
	KKASSERT(rule != NULL && rule->cpuid == mycpuid);

	if (fwmsg.cross_rules != NULL) {
		/*
		 * Second pass: hand every CPU's rule copy a private
		 * snapshot of the collected per-CPU rule array.
		 */
		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
		fwmsg.sibling = rule;
		netisr_domsg_global(&fwmsg.base);
		KKASSERT(fwmsg.sibling == NULL);

		kfree(fwmsg.cross_rules, M_TEMP);

#ifdef KLD_MODULE
		/*
		 * NOTE(review): presumably pins the module while
		 * crossref rules exist -- confirm against the unload
		 * path.
		 */
		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	}

	DPRINTF("++ installed rule %d, static count now %d\n",
		rule->rulenum, static_count);
}
4730
4731 /*
4732  * Free storage associated with a static rule (including derived
4733  * states/tracks).
4734  * The caller is in charge of clearing rule pointers to avoid
4735  * dangling pointers.
4736  * @return a pointer to the next entry.
4737  * Arguments are not checked, so they better be correct.
4738  */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
		 struct ip_fw *prev, struct ip_fw *rule)
{
	struct ip_fw *n;

	/* Unlink 'rule' from this CPU's chain. */
	n = rule->next;
	if (prev == NULL)
		ctx->ipfw_layer3_chain = n;
	else
		prev->next = n;

	/* Mark the rule as invalid */
	rule->rule_flags |= IPFW_RULE_F_INVALID;
	rule->next_rule = NULL;
	rule->sibling = NULL;
#ifdef foo
	/* Don't reset cpuid here; keep various assertion working */
	rule->cpuid = -1;
#endif

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_dec_static_count(rule);

	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
		/* Try to free this rule */
		ipfw_free_rule(rule);
	} else {
		/*
		 * Crossref rules are not freed here; CPU0 queues them
		 * on the global crossref free list instead.
		 */
		/* TODO: check staging area. */
		if (mycpuid == 0) {
			rule->next = ipfw_gd.ipfw_crossref_free;
			ipfw_gd.ipfw_crossref_free = rule;
		}
	}

	/* Return the next rule */
	return n;
}
4778
/*
 * Per-CPU dispatch for ipfw_flush(): tear down this CPU's states,
 * tracks and rules (keeping the default rule unless 'kill_default'
 * is set), then forward the message to the next CPU.
 */
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
	int kill_default = nmsg->lmsg.u.ms_result;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * Flush states.
	 */
	ipfw_state_flush(ctx, NULL);
	KASSERT(ctx->ipfw_state_cnt == 0,
	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
	ctx->ipfw_state_loosecnt = 0;
	ctx->ipfw_state_lastexp = 0;

	/*
	 * Flush tracks.
	 */
	ipfw_track_flush(ctx, NULL);
	ctx->ipfw_track_lastexp = 0;
	if (ctx->ipfw_trkcnt_spare != NULL) {
		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
		ctx->ipfw_trkcnt_spare = NULL;
	}

	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

	/* Delete from the head until only the default rule is left. */
	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
		ipfw_delete_rule(ctx, NULL, rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4815
4816 /*
4817  * Deletes all rules from a chain (including the default rule
4818  * if the second argument is set).
4819  */
static void
ipfw_flush(int kill_default)
{
	struct netmsg_base nmsg;
#ifdef INVARIANTS
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int state_cnt;
#endif

	ASSERT_NETISR0;

	/*
	 * If 'kill_default' then caller has done the necessary
	 * msgport syncing; unnecessary to do it again.
	 */
	if (!kill_default) {
		/*
		 * Let ipfw_chk() know the rules are going to
		 * be flushed, so it could jump directly to
		 * the default rule.
		 */
		ipfw_flushing = 1;
		/* XXX use priority sync */
		netmsg_service_sync();
	}

	/*
	 * Press the 'flush' button
	 */
	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_flush_dispatch);
	nmsg.lmsg.u.ms_result = kill_default;
	netisr_domsg_global(&nmsg);
	/* All per-CPU state is gone; reset the global accounting too. */
	ipfw_gd.ipfw_state_loosecnt = 0;
	ipfw_gd.ipfw_state_globexp = 0;
	ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
	state_cnt = ipfw_state_cntcoll();
	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

	/* Only the default rule (if kept) may survive a flush. */
	if (kill_default) {
		KASSERT(static_count == 0,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == 0,
			("%u bytes of static rules remain", static_ioc_len));
	} else {
		KASSERT(static_count == 1,
			("%u static rules remain", static_count));
		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
			("%u bytes of static rules remain, should be %lu",
			 static_ioc_len,
			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
	}
#endif

	/* Flush is done */
	ipfw_flushing = 0;
}
4883
/*
 * Per-CPU dispatch for ipfw_alt_delete_rule(): delete every rule on
 * this CPU whose number matches dmsg->rulenum (flushing any states
 * and tracks those rules generated), then forward the message to the
 * next CPU.
 */
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule, *prev;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = dmsg->start_rule;
	KKASSERT(rule->cpuid == mycpuid);
	dmsg->start_rule = rule->sibling;

	prev = dmsg->prev_rule;
	if (prev != NULL) {
		KKASSERT(prev->cpuid == mycpuid);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		dmsg->prev_rule = prev->sibling;
	}

	/*
	 * flush pointers outside the loop, then delete all matching
	 * rules.  'prev' remains the same throughout the cycle.
	 */
	ipfw_flush_rule_ptrs(ctx);
	while (rule && rule->rulenum == dmsg->rulenum) {
		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
			/* Flush states generated by this rule. */
			ipfw_state_flush(ctx, rule);
		}
		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
			/* Flush tracks generated by this rule. */
			ipfw_track_flush(ctx, rule);
		}
		rule = ipfw_delete_rule(ctx, prev, rule);
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
4927
4928 static int
4929 ipfw_alt_delete_rule(uint16_t rulenum)
4930 {
4931         struct ip_fw *prev, *rule;
4932         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4933         struct netmsg_del dmsg;
4934
4935         ASSERT_NETISR0;
4936
4937         /*
4938          * Locate first rule to delete
4939          */
4940         for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4941              rule && rule->rulenum < rulenum;
4942              prev = rule, rule = rule->next)
4943                 ; /* EMPTY */
4944         if (rule->rulenum != rulenum)
4945                 return EINVAL;
4946
4947         /*
4948          * Get rid of the rule duplications on all CPUs
4949          */
4950         bzero(&dmsg, sizeof(dmsg));
4951         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4952             ipfw_alt_delete_rule_dispatch);
4953         dmsg.prev_rule = prev;
4954         dmsg.start_rule = rule;
4955         dmsg.rulenum = rulenum;
4956
4957         netisr_domsg_global(&dmsg.base);
4958         KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4959         return 0;
4960 }
4961
/*
 * Per-CPU dispatch for ipfw_alt_delete_ruleset(): delete every rule
 * on this CPU belonging to dmsg->from_set (flushing any states and
 * tracks those rules generated), then forward the message to the
 * next CPU.
 */
static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *prev, *rule;
#ifdef INVARIANTS
	int del = 0;
#endif

	ASSERT_NETISR_NCPUS(mycpuid);

	ipfw_flush_rule_ptrs(ctx);

	prev = NULL;
	rule = ctx->ipfw_layer3_chain;
	while (rule != NULL) {
		if (rule->set == dmsg->from_set) {
			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
				/* Flush states generated by this rule. */
				ipfw_state_flush(ctx, rule);
			}
			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
				/* Flush tracks generated by this rule. */
				ipfw_track_flush(ctx, rule);
			}
			rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
			del = 1;
#endif
		} else {
			prev = rule;
			rule = rule->next;
		}
	}
	/* Caller verified the set is non-empty before dispatching. */
	KASSERT(del, ("no match set?!"));

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
5001
5002 static int
5003 ipfw_alt_delete_ruleset(uint8_t set)
5004 {
5005         struct netmsg_del dmsg;
5006         int del;
5007         struct ip_fw *rule;
5008         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5009
5010         ASSERT_NETISR0;
5011
5012         /*
5013          * Check whether the 'set' exists.  If it exists,
5014          * then check whether any rules within the set will
5015          * try to create states.
5016          */
5017         del = 0;
5018         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5019                 if (rule->set == set)
5020                         del = 1;
5021         }
5022         if (!del)
5023                 return 0; /* XXX EINVAL? */
5024
5025         /*
5026          * Delete this set
5027          */
5028         bzero(&dmsg, sizeof(dmsg));
5029         netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5030             ipfw_alt_delete_ruleset_dispatch);
5031         dmsg.from_set = set;
5032         netisr_domsg_global(&dmsg.base);
5033
5034         return 0;
5035 }
5036
5037 static void
5038 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5039 {
5040         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5041         struct ip_fw *rule;
5042
5043         ASSERT_NETISR_NCPUS(mycpuid);
5044
5045         rule = dmsg->start_rule;
5046         KKASSERT(rule->cpuid == mycpuid);
5047
5048         /*
5049          * Move to the position on the next CPU
5050          * before the msg is forwarded.
5051          */
5052         dmsg->start_rule = rule->sibling;
5053
5054         while (rule && rule->rulenum <= dmsg->rulenum) {
5055                 if (rule->rulenum == dmsg->rulenum)
5056                         rule->set = dmsg->to_set;
5057                 rule = rule->next;
5058         }
5059         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5060 }
5061
5062 static int
5063 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5064 {
5065         struct netmsg_del dmsg;
5066         struct netmsg_base *nmsg;
5067         struct ip_fw *rule;
5068         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5069
5070         ASSERT_NETISR0;
5071
5072         /*
5073          * Locate first rule to move
5074          */
5075         for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5076              rule = rule->next) {
5077                 if (rule->rulenum == rulenum && rule->set != set)
5078                         break;
5079         }
5080         if (rule == NULL || rule->rulenum > rulenum)
5081                 return 0; /* XXX error? */
5082
5083         bzero(&dmsg, sizeof(dmsg));
5084         nmsg = &dmsg.base;
5085         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5086             ipfw_alt_move_rule_dispatch);
5087         dmsg.start_rule = rule;
5088         dmsg.rulenum = rulenum;
5089         dmsg.to_set = set;
5090
5091         netisr_domsg_global(nmsg);
5092         KKASSERT(dmsg.start_rule == NULL);
5093         return 0;
5094 }
5095
5096 static void
5097 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5098 {
5099         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5100         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5101         struct ip_fw *rule;
5102
5103         ASSERT_NETISR_NCPUS(mycpuid);
5104
5105         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5106                 if (rule->set == dmsg->from_set)
5107                         rule->set = dmsg->to_set;
5108         }
5109         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5110 }
5111
5112 static int
5113 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5114 {
5115         struct netmsg_del dmsg;
5116         struct netmsg_base *nmsg;
5117
5118         ASSERT_NETISR0;
5119
5120         bzero(&dmsg, sizeof(dmsg));
5121         nmsg = &dmsg.base;
5122         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5123             ipfw_alt_move_ruleset_dispatch);
5124         dmsg.from_set = from_set;
5125         dmsg.to_set = to_set;
5126
5127         netisr_domsg_global(nmsg);
5128         return 0;
5129 }
5130
5131 static void
5132 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5133 {
5134         struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5135         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5136         struct ip_fw *rule;
5137
5138         ASSERT_NETISR_NCPUS(mycpuid);
5139
5140         for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5141                 if (rule->set == dmsg->from_set)
5142                         rule->set = dmsg->to_set;
5143                 else if (rule->set == dmsg->to_set)
5144                         rule->set = dmsg->from_set;
5145         }
5146         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5147 }
5148
5149 static int
5150 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5151 {
5152         struct netmsg_del dmsg;
5153         struct netmsg_base *nmsg;
5154
5155         ASSERT_NETISR0;
5156
5157         bzero(&dmsg, sizeof(dmsg));
5158         nmsg = &dmsg.base;
5159         netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5160             ipfw_alt_swap_ruleset_dispatch);
5161         dmsg.from_set = set1;
5162         dmsg.to_set = set2;
5163
5164         netisr_domsg_global(nmsg);
5165         return 0;
5166 }
5167
5168 /*
5169  * Remove all rules with given number, and also do set manipulation.
5170  *
5171  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5172  * the next 8 bits are the new set, the top 8 bits are the command:
5173  *
5174  *      0       delete rules with given number
5175  *      1       delete rules with given set number
5176  *      2       move rules with given number to new set
5177  *      3       move rules with given set number to new set
5178  *      4       swap sets with given numbers
5179  */
5180 static int
5181 ipfw_ctl_alter(uint32_t arg)
5182 {
5183         uint16_t rulenum;
5184         uint8_t cmd, new_set;
5185         int error = 0;
5186
5187         ASSERT_NETISR0;
5188
5189         rulenum = arg & 0xffff;
5190         cmd = (arg >> 24) & 0xff;
5191         new_set = (arg >> 16) & 0xff;
5192
5193         if (cmd > 4)
5194                 return EINVAL;
5195         if (new_set >= IPFW_DEFAULT_SET)
5196                 return EINVAL;
5197         if (cmd == 0 || cmd == 2) {
5198                 if (rulenum == IPFW_DEFAULT_RULE)
5199                         return EINVAL;
5200         } else {
5201                 if (rulenum >= IPFW_DEFAULT_SET)
5202                         return EINVAL;
5203         }
5204
5205         switch (cmd) {
5206         case 0: /* delete rules with given number */
5207                 error = ipfw_alt_delete_rule(rulenum);
5208                 break;
5209
5210         case 1: /* delete all rules with given set number */
5211                 error = ipfw_alt_delete_ruleset(rulenum);
5212                 break;
5213
5214         case 2: /* move rules with given number to new set */
5215                 error = ipfw_alt_move_rule(rulenum, new_set);
5216                 break;
5217
5218         case 3: /* move rules with given set number to new set */
5219                 error = ipfw_alt_move_ruleset(rulenum, new_set);
5220                 break;
5221
5222         case 4: /* swap two sets */
5223                 error = ipfw_alt_swap_ruleset(rulenum, new_set);
5224                 break;
5225         }
5226         return error;
5227 }
5228
5229 /*
5230  * Clear counters for a specific rule.
5231  */
5232 static void
5233 clear_counters(struct ip_fw *rule, int log_only)
5234 {
5235         ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5236
5237         if (log_only == 0) {
5238                 rule->bcnt = rule->pcnt = 0;
5239                 rule->timestamp = 0;
5240         }
5241         if (l->o.opcode == O_LOG)
5242                 l->log_left = l->max_log;
5243 }
5244
/*
 * Per-CPU dispatch for ipfw_ctl_zero_entry(): clear the counters of
 * all rules (rulenum == 0) or of every rule with the given number,
 * then forward the message to the next CPU.
 */
static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	if (zmsg->rulenum == 0) {
		KKASSERT(zmsg->start_rule == NULL);

		ctx->ipfw_norule_counter = 0;
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
			clear_counters(rule, zmsg->log_only);
	} else {
		struct ip_fw *start = zmsg->start_rule;

		KKASSERT(start->cpuid == mycpuid);
		KKASSERT(start->rulenum == zmsg->rulenum);

		/*
		 * We can have multiple rules with the same number, so we
		 * need to clear them all.
		 */
		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
		     rule = rule->next)
			clear_counters(rule, zmsg->log_only);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		zmsg->start_rule = start->sibling;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
5282
5283 /*
5284  * Reset some or all counters on firewall rules.
5285  * @arg frwl is null to clear all entries, or contains a specific
5286  * rule number.
5287  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5288  */
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
	struct netmsg_zent zmsg;
	struct netmsg_base *nmsg;
	const char *msg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	bzero(&zmsg, sizeof(zmsg));
	nmsg = &zmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_zero_entry_dispatch);
	zmsg.log_only = log_only;

	if (rulenum == 0) {
		msg = log_only ? "ipfw: All logging counts reset.\n"
			       : "ipfw: Accounting cleared.\n";
	} else {
		struct ip_fw *rule;

		/*
		 * Locate the first rule with 'rulenum'
		 */
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
			if (rule->rulenum == rulenum)
				break;
		}
		if (rule == NULL) /* we did not find any matching rules */
			return (EINVAL);
		zmsg.start_rule = rule;
		zmsg.rulenum = rulenum;

		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
			       : "ipfw: Entry %d cleared.\n";
	}
	netisr_domsg_global(nmsg);
	KKASSERT(zmsg.start_rule == NULL);

	/*
	 * When rulenum == 0 'msg' contains no conversion, so the extra
	 * argument is simply ignored by log().
	 */
	if (fw_verbose)
		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
	return (0);
}
5333
5334 /*
5335  * Check validity of the structure before insert.
5336  * Fortunately rules are simple, so this mostly need to check rule sizes.
5337  */
5338 static int
5339 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5340 {
5341         int l, cmdlen = 0;
5342         int have_action = 0;
5343         ipfw_insn *cmd;
5344
5345         *rule_flags = 0;
5346
5347         /* Check for valid size */
5348         if (size < sizeof(*rule)) {
5349                 kprintf("ipfw: rule too short\n");
5350                 return EINVAL;
5351         }
5352         l = IOC_RULESIZE(rule);
5353         if (l != size) {
5354                 kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5355                 return EINVAL;
5356         }
5357
5358         /* Check rule number */
5359         if (rule->rulenum == IPFW_DEFAULT_RULE) {
5360                 kprintf("ipfw: invalid rule number\n");
5361                 return EINVAL;
5362         }
5363
5364         /*
5365          * Now go for the individual checks. Very simple ones, basically only
5366          * instruction sizes.
5367          */
5368         for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5369              l -= cmdlen, cmd += cmdlen) {
5370                 cmdlen = F_LEN(cmd);
5371                 if (cmdlen > l) {
5372                         kprintf("ipfw: opcode %d size truncated\n",
5373                                 cmd->opcode);
5374                         return EINVAL;
5375                 }
5376
5377                 DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5378
5379                 if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5380                     IPFW_ISXLAT(cmd->opcode)) {
5381                         /* This rule will generate states. */
5382                         *rule_flags |= IPFW_RULE_F_GENSTATE;
5383                         if (cmd->opcode == O_LIMIT)
5384                                 *rule_flags |= IPFW_RULE_F_GENTRACK;
5385                 }
5386                 if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5387                         *rule_flags |= IPFW_RULE_F_CROSSREF;
5388                 if (cmd->opcode == O_IP_SRC_IFIP ||
5389                     cmd->opcode == O_IP_DST_IFIP) {
5390                         *rule_flags |= IPFW_RULE_F_DYNIFADDR;
5391                         cmd->arg1 &= IPFW_IFIP_SETTINGS;
5392                 }
5393
5394                 switch (cmd->opcode) {
5395                 case O_NOP:
5396                 case O_PROBE_STATE:
5397                 case O_KEEP_STATE:
5398                 case O_PROTO:
5399                 case O_IP_SRC_ME:
5400                 case O_IP_DST_ME:
5401                 case O_LAYER2:
5402                 case O_IN:
5403                 case O_FRAG:
5404                 case O_IPFRAG:
5405                 case O_IPOPT:
5406                 case O_IPLEN:
5407                 case O_IPID:
5408                 case O_IPTOS:
5409                 case O_IPPRECEDENCE:
5410                 case O_IPTTL:
5411                 case O_IPVER:
5412                 case O_TCPWIN:
5413                 case O_TCPFLAGS:
5414                 case O_TCPOPTS:
5415                 case O_ESTAB:
5416                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5417                                 goto bad_size;
5418                         break;
5419
5420                 case O_IP_SRC_TABLE:
5421                 case O_IP_DST_TABLE:
5422                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5423                                 goto bad_size;
5424                         if (cmd->arg1 >= ipfw_table_max) {
5425                                 kprintf("ipfw: invalid table id %u, max %d\n",
5426                                     cmd->arg1, ipfw_table_max);
5427                                 return EINVAL;
5428                         }
5429                         break;
5430
5431                 case O_IP_SRC_IFIP:
5432                 case O_IP_DST_IFIP:
5433                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5434                                 goto bad_size;
5435                         break;
5436
5437                 case O_ICMPTYPE:
5438                         if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5439                                 goto bad_size;
5440                         break;
5441
5442                 case O_UID:
5443                 case O_GID:
5444                 case O_IP_SRC:
5445                 case O_IP_DST:
5446                 case O_TCPSEQ:
5447                 case O_TCPACK:
5448                 case O_PROB:
5449                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5450                                 goto bad_size;
5451                         break;
5452
5453                 case O_LIMIT:
5454                         if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5455                                 goto bad_size;
5456                         break;
5457                 case O_REDIRECT:
5458                         if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5459                                 goto bad_size;
5460                         break;
5461
5462                 case O_LOG:
5463                         if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5464                                 goto bad_size;
5465
5466                         ((ipfw_insn_log *)cmd)->log_left =
5467                             ((ipfw_insn_log *)cmd)->max_log;
5468
5469                         break;
5470
5471                 case O_IP_SRC_MASK:
5472                 case O_IP_DST_MASK:
5473                         if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5474                                 goto bad_size;
5475                         if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5476                                 kprintf("ipfw: opcode %d, useless rule\n",
5477                                         cmd->opcode);
5478                                 return EINVAL;
5479                         }
5480                         break;
5481
5482                 case O_IP_SRC_SET:
5483                 case O_IP_DST_SET:
5484                         if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5485                                 kprintf("ipfw: invalid set size %d\n",
5486                                         cmd->arg1);
5487                                 return EINVAL;
5488                         }
5489                         if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5490                             (cmd->arg1+31)/32 )
5491                                 goto bad_size;
5492                         break;
5493
5494                 case O_MACADDR2:
5495                         if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5496                                 goto bad_size;
5497                         break;
5498
5499                 case O_MAC_TYPE:
5500                 case O_IP_SRCPORT:
5501                 case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5502                         if (cmdlen < 2 || cmdlen > 31)
5503                                 goto bad_size;
5504                         break;
5505
5506                 case O_RECV:
5507                 case O_XMIT:
5508                 case O_VIA:
5509                         if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5510                                 goto bad_size;
5511                         break;
5512
5513                 case O_PIPE:
5514                 case O_QUEUE:
5515                         if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5516                                 goto bad_size;
5517                         goto check_action;
5518
5519                 case O_FORWARD_IP:
5520                         if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5521                                 goto bad_size;
5522                         } else {
5523                                 in_addr_t fwd_addr;
5524
5525                                 fwd_addr = ((ipfw_insn_sa *)cmd)->
5526                                            sa.sin_addr.s_addr;
5527                                 if (IN_MULTICAST(ntohl(fwd_addr))) {
5528                                         kprintf("ipfw: try forwarding to "
5529                                                 "multicast address\n");
5530                                         return EINVAL;
5531                                 }
5532                         }
5533                         goto check_action;
5534
5535                 case O_FORWARD_MAC: /* XXX not implemented yet */
5536                 case O_CHECK_STATE:
5537                 case O_COUNT:
5538                 case O_ACCEPT:
5539                 case O_DENY:
5540                 case O_REJECT:
5541                 case O_SKIPTO:
5542                 case O_DIVERT:
5543                 case O_TEE:
5544                 case O_DEFRAG:
5545                         if (cmdlen != F_INSN_SIZE(ipfw_insn))
5546                                 goto bad_size;
5547 check_action:
5548                         if (have_action) {
5549                                 kprintf("ipfw: opcode %d, multiple actions"
5550                                         " not allowed\n",
5551                                         cmd->opcode);
5552                                 return EINVAL;
5553                         }
5554                         have_action = 1;
5555                         if (l != cmdlen) {
5556                                 kprintf("ipfw: opcode %d, action must be"
5557                                         " last opcode\n",
5558                                         cmd->opcode);
5559                                 return EINVAL;
5560                         }
5561                         break;
5562                 default:
5563                         kprintf("ipfw: opcode %d, unknown opcode\n",
5564                                 cmd->opcode);
5565                         return EINVAL;
5566                 }
5567         }
5568         if (have_action == 0) {
5569                 kprintf("ipfw: missing action\n");
5570                 return EINVAL;
5571         }
5572         return 0;
5573
5574 bad_size:
5575         kprintf("ipfw: opcode %d size %d wrong\n",
5576                 cmd->opcode, cmdlen);
5577         return EINVAL;
5578 }
5579
5580 static int
5581 ipfw_ctl_add_rule(struct sockopt *sopt)
5582 {
5583         struct ipfw_ioc_rule *ioc_rule;
5584         size_t size;
5585         uint32_t rule_flags;
5586         int error;
5587
5588         ASSERT_NETISR0;
5589         
5590         size = sopt->sopt_valsize;
5591         if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5592             size < sizeof(*ioc_rule)) {
5593                 return EINVAL;
5594         }
5595         if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5596                 sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5597                                           IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5598         }
5599         ioc_rule = sopt->sopt_val;
5600
5601         error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5602         if (error)
5603                 return error;
5604
5605         ipfw_add_rule(ioc_rule, rule_flags);
5606
5607         if (sopt->sopt_dir == SOPT_GET)
5608                 sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5609         return 0;
5610 }
5611
5612 static void *
5613 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5614     struct ipfw_ioc_rule *ioc_rule)
5615 {
5616         const struct ip_fw *sibling;
5617 #ifdef INVARIANTS
5618         int i;
5619 #endif
5620
5621         ASSERT_NETISR0;
5622         KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5623
5624         ioc_rule->act_ofs = rule->act_ofs;
5625         ioc_rule->cmd_len = rule->cmd_len;
5626         ioc_rule->rulenum = rule->rulenum;
5627         ioc_rule->set = rule->set;
5628         ioc_rule->usr_flags = rule->usr_flags;
5629
5630         ioc_rule->set_disable = ctx->ipfw_set_disable;
5631         ioc_rule->static_count = static_count;
5632         ioc_rule->static_len = static_ioc_len;
5633
5634         /*
5635          * Visit (read-only) all of the rule's duplications to get
5636          * the necessary statistics
5637          */
5638 #ifdef INVARIANTS
5639         i = 0;
5640 #endif
5641         ioc_rule->pcnt = 0;
5642         ioc_rule->bcnt = 0;
5643         ioc_rule->timestamp = 0;
5644         for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5645                 ioc_rule->pcnt += sibling->pcnt;
5646                 ioc_rule->bcnt += sibling->bcnt;
5647                 if (sibling->timestamp > ioc_rule->timestamp)
5648                         ioc_rule->timestamp = sibling->timestamp;
5649 #ifdef INVARIANTS
5650                 ++i;
5651 #endif
5652         }
5653         KASSERT(i == netisr_ncpus,
5654             ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5655
5656         bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5657
5658         return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5659 }
5660
5661 static boolean_t
5662 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5663 {
5664         struct ipfw_ioc_flowid *ioc_id;
5665
5666         if (trk->tc_expire == 0) {
5667                 /* Not a scanned one. */
5668                 return (FALSE);
5669         }
5670
5671         ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5672             0 : trk->tc_expire - time_uptime;
5673         ioc_state->pcnt = 0;
5674         ioc_state->bcnt = 0;
5675
5676         ioc_state->dyn_type = O_LIMIT_PARENT;
5677         ioc_state->count = trk->tc_count;
5678
5679         ioc_state->rulenum = trk->tc_rulenum;
5680
5681         ioc_id = &ioc_state->id;
5682         ioc_id->type = ETHERTYPE_IP;
5683         ioc_id->u.ip.proto = trk->tc_proto;
5684         ioc_id->u.ip.src_ip = trk->tc_saddr;
5685         ioc_id->u.ip.dst_ip = trk->tc_daddr;
5686         ioc_id->u.ip.src_port = trk->tc_sport;
5687         ioc_id->u.ip.dst_port = trk->tc_dport;
5688
5689         return (TRUE);
5690 }
5691
5692 static boolean_t
5693 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5694 {
5695         struct ipfw_ioc_flowid *ioc_id;
5696
5697         if (IPFW_STATE_SCANSKIP(s))
5698                 return (FALSE);
5699
5700         ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5701             0 : s->st_expire - time_uptime;
5702         ioc_state->pcnt = s->st_pcnt;
5703         ioc_state->bcnt = s->st_bcnt;
5704
5705         ioc_state->dyn_type = s->st_type;
5706         ioc_state->count = 0;
5707
5708         ioc_state->rulenum = s->st_rule->rulenum;
5709
5710         ioc_id = &ioc_state->id;
5711         ioc_id->type = ETHERTYPE_IP;
5712         ioc_id->u.ip.proto = s->st_proto;
5713         ipfw_key_4tuple(&s->st_key,
5714             &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5715             &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5716
5717         if (IPFW_ISXLAT(s->st_type)) {
5718                 const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5719
5720                 if (x->xlat_port == 0)
5721                         ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5722                 else
5723                         ioc_state->xlat_port = ntohs(x->xlat_port);
5724                 ioc_state->xlat_addr = ntohl(x->xlat_addr);
5725
5726                 ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5727                 ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5728         }
5729
5730         return (TRUE);
5731 }
5732
/*
 * Per-netisr stage of the state snapshot: copy this cpu's states into
 * the shared userland buffer, then either reply (buffer full) or
 * forward the message to the next cpu.  The last cpu additionally
 * copies the tracks from the global track tree.
 */
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	const struct ipfw_state *s;
	const struct ipfw_track *t;

	ASSERT_NETISR_NCPUS(mycpuid);
	/* Earlier cpus must have left room; see ipfw_ctl_get_rules(). */
	KASSERT(nm->state_cnt < nm->state_cntmax,
	    ("invalid state count %d, max %d",
	     nm->state_cnt, nm->state_cntmax));

	/* Copy this cpu's states until the buffer fills up. */
	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
		if (ipfw_state_copy(s, nm->ioc_state)) {
			nm->ioc_state++;
			nm->state_cnt++;
			if (nm->state_cnt == nm->state_cntmax)
				goto done;
		}
	}

	/*
	 * Prepare tracks in the global track tree for userland.
	 */
	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
		struct ipfw_trkcnt *trk;

		if (t->t_count == NULL) /* anchor */
			continue;
		trk = t->t_trkcnt;

		/*
		 * Only one netisr can run this function at
		 * any time, and only this function accesses
		 * trkcnt's tc_expire, so this is safe w/o
		 * ipfw_gd.ipfw_trkcnt_token.
		 */
		if (trk->tc_expire > t->t_expire)
			continue;
		trk->tc_expire = t->t_expire;
	}

	/*
	 * Copy tracks in the global track tree to userland in
	 * the last netisr.
	 */
	if (mycpuid == netisr_ncpus - 1) {
		struct ipfw_trkcnt *trk;

		KASSERT(nm->state_cnt < nm->state_cntmax,
		    ("invalid state count %d, max %d",
		     nm->state_cnt, nm->state_cntmax));

		/* The tree itself still needs the token. */
		IPFW_TRKCNT_TOKGET;
		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
			if (ipfw_track_copy(trk, nm->ioc_state)) {
				nm->ioc_state++;
				nm->state_cnt++;
				if (nm->state_cnt == nm->state_cntmax) {
					IPFW_TRKCNT_TOKREL;
					goto done;
				}
			}
		}
		IPFW_TRKCNT_TOKREL;
	}
done:
	if (nm->state_cnt == nm->state_cntmax) {
		/* No more space; done. */
		netisr_replymsg(&nm->base, 0);
	} else {
		netisr_forwardmsg(&nm->base, mycpuid + 1);
	}
}
5808
/*
 * Handle the "get rules" sockopt: copy all static rules, followed by
 * the dynamic states (and tracks, reported as states), to userland.
 * If the caller's buffer is too small it is zeroed and 0 is returned
 * with nothing copied.
 */
static int
ipfw_ctl_get_rules(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;
	void *bp;
	size_t size;
	int state_cnt;

	ASSERT_NETISR0;

	/*
	 * pass up a copy of the current rules. Static rules
	 * come first (the last of which has number IPFW_DEFAULT_RULE),
	 * followed by a possibly empty list of states.
	 */

	size = static_ioc_len;	/* size of static rules */

	/*
	 * Size of the states.
	 * XXX take tracks as state for userland compat.
	 */
	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
	/* States may be added while we copy; over-allocate a bit. */
	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
	size += state_cnt * sizeof(struct ipfw_ioc_state);

	if (sopt->sopt_valsize < size) {
		/* short length, no need to return incomplete rules */
		/* XXX: if superuser, no need to zero buffer */
		bzero(sopt->sopt_val, sopt->sopt_valsize); 
		return 0;
	}
	bp = sopt->sopt_val;

	/* Static rules first; bp advances past each copied rule. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		bp = ipfw_copy_rule(ctx, rule, bp);

	if (state_cnt) {
		struct netmsg_cpstate nm;
#ifdef INVARIANTS
		size_t old_size = size;
#endif

		/*
		 * Collect states from all netisr cpus; the message is
		 * forwarded cpu to cpu by ipfw_state_copy_dispatch().
		 */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
		nm.ioc_state = bp;
		nm.state_cntmax = state_cnt;
		nm.state_cnt = 0;
		netisr_domsg_global(&nm.base);

		/*
		 * The # of states may be shrinked after the snapshot
		 * of the state count was taken.  To give user a correct
		 * state count, nm->state_cnt is used to recalculate
		 * the actual size.
		 */
		size = static_ioc_len +
		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
		KKASSERT(size <= old_size);
	}

	sopt->sopt_valsize = size;
	return 0;
}
5874
5875 static void
5876 ipfw_set_disable_dispatch(netmsg_t nmsg)
5877 {
5878         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5879
5880         ASSERT_NETISR_NCPUS(mycpuid);
5881
5882         ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5883         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5884 }
5885
5886 static void
5887 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5888 {
5889         struct netmsg_base nmsg;
5890         uint32_t set_disable;
5891
5892         ASSERT_NETISR0;
5893
5894         /* IPFW_DEFAULT_SET is always enabled */
5895         enable |= (1 << IPFW_DEFAULT_SET);
5896         set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5897
5898         bzero(&nmsg, sizeof(nmsg));
5899         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5900             ipfw_set_disable_dispatch);
5901         nmsg.lmsg.u.ms_result32 = set_disable;
5902
5903         netisr_domsg_global(&nmsg);
5904 }
5905
5906 static void
5907 ipfw_table_create_dispatch(netmsg_t nm)
5908 {
5909         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5910         int tblid = nm->lmsg.u.ms_result;
5911
5912         ASSERT_NETISR_NCPUS(mycpuid);
5913
5914         if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5915             rn_cpumaskhead(mycpuid), 32))
5916                 panic("ipfw: create table%d failed", tblid);
5917
5918         netisr_forwardmsg(&nm->base, mycpuid + 1);
5919 }
5920
5921 static int
5922 ipfw_table_create(struct sockopt *sopt)
5923 {
5924         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5925         struct ipfw_ioc_table *tbl;
5926         struct netmsg_base nm;
5927
5928         ASSERT_NETISR0;
5929
5930         if (sopt->sopt_valsize != sizeof(*tbl))
5931                 return (EINVAL);
5932
5933         tbl = sopt->sopt_val;
5934         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5935                 return (EINVAL);
5936
5937         if (ctx->ipfw_tables[tbl->tableid] != NULL)
5938                 return (EEXIST);
5939
5940         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5941             ipfw_table_create_dispatch);
5942         nm.lmsg.u.ms_result = tbl->tableid;
5943         netisr_domsg_global(&nm);
5944
5945         return (0);
5946 }
5947
5948 static void
5949 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5950 {
5951         struct radix_node *ret;
5952
5953         ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5954         if (ret != rn)
5955                 panic("deleted other table entry");
5956         kfree(ret, M_IPFW);
5957 }
5958
/*
 * Radix tree walker callback: remove and free every visited entry.
 * The walk argument is the radix_node_head itself.
 */
static int
ipfw_table_killent(struct radix_node *rn, void *xrnh)
{
	ipfw_table_killrn(xrnh, rn);
	return (0);
}
5966
5967 static void
5968 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5969     int destroy)
5970 {
5971         struct radix_node_head *rnh;
5972
5973         ASSERT_NETISR_NCPUS(mycpuid);
5974
5975         rnh = ctx->ipfw_tables[tableid];
5976         rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5977         if (destroy) {
5978                 Free(rnh);
5979                 ctx->ipfw_tables[tableid] = NULL;
5980         }
5981 }
5982
5983 static void
5984 ipfw_table_flush_dispatch(netmsg_t nmsg)
5985 {
5986         struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5987         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5988
5989         ASSERT_NETISR_NCPUS(mycpuid);
5990
5991         ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5992         netisr_forwardmsg(&nm->base, mycpuid + 1);
5993 }
5994
5995 static void
5996 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5997 {
5998         int i;
5999
6000         ASSERT_NETISR_NCPUS(mycpuid);
6001
6002         for (i = 0; i < ipfw_table_max; ++i) {
6003                 if (ctx->ipfw_tables[i] != NULL)
6004                         ipfw_table_flush_oncpu(ctx, i, destroy);
6005         }
6006 }
6007
6008 static void
6009 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6010 {
6011         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6012
6013         ASSERT_NETISR_NCPUS(mycpuid);
6014
6015         ipfw_table_flushall_oncpu(ctx, 0);
6016         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6017 }
6018
6019 static int
6020 ipfw_table_flush(struct sockopt *sopt)
6021 {
6022         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6023         struct ipfw_ioc_table *tbl;
6024         struct netmsg_tblflush nm;
6025
6026         ASSERT_NETISR0;
6027
6028         if (sopt->sopt_valsize != sizeof(*tbl))
6029                 return (EINVAL);
6030
6031         tbl = sopt->sopt_val;
6032         if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6033                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6034                     MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6035                 netisr_domsg_global(&nm.base);
6036                 return (0);
6037         }
6038
6039         if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6040                 return (EINVAL);
6041
6042         if (ctx->ipfw_tables[tbl->tableid] == NULL)
6043                 return (ENOENT);
6044
6045         netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6046             ipfw_table_flush_dispatch);
6047         nm.tableid = tbl->tableid;
6048         nm.destroy = 0;
6049         if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6050                 nm.destroy = 1;
6051         netisr_domsg_global(&nm.base);
6052
6053         return (0);
6054 }
6055
6056 static int
6057 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6058 {
6059         int *cnt = xcnt;
6060
6061         (*cnt)++;
6062         return (0);
6063 }
6064
/*
 * Radix tree walker callback: copy one table entry into the next free
 * slot of the ioctl buffer, aggregating the use counters of all of
 * the entry's per-cpu siblings.
 */
static int
ipfw_table_cpent(struct radix_node *rn, void *xcp)
{
	struct ipfw_table_cp *cp = xcp;
	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
	struct ipfw_ioc_tblent *ioc_te;
#ifdef INVARIANTS
	int cnt;
#endif

	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
	    cp->te_idx, cp->te_cnt));
	ioc_te = &cp->te[cp->te_idx];

	/* rn_mask's first byte is its length; sin_len 0 == no netmask. */
	if (te->te_nodes->rn_mask != NULL) {
		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
		    *te->te_nodes->rn_mask);
	} else {
		ioc_te->netmask.sin_len = 0;
	}
	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));

	ioc_te->use = te->te_use;
	ioc_te->last_used = te->te_lastuse;
#ifdef INVARIANTS
	cnt = 1;
#endif

	/* Sum up the statistics of the per-cpu siblings. */
	while ((te = te->te_sibling) != NULL) {
#ifdef INVARIANTS
		++cnt;
#endif
		ioc_te->use += te->te_use;
		if (te->te_lastuse > ioc_te->last_used)
			ioc_te->last_used = te->te_lastuse;
	}
	KASSERT(cnt == netisr_ncpus,
	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));

	cp->te_idx++;

	return (0);
}
6108
/*
 * Handle the "get table" sockopt.  A negative table id returns the
 * list of existing table ids; a valid id returns all entries of that
 * table.  If the caller's buffer is too small it is zeroed and E2BIG
 * is returned.
 */
static int
ipfw_table_get(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_ioc_table *tbl;
	struct ipfw_ioc_tblcont *cont;
	struct ipfw_table_cp cp;
	int cnt = 0, sz;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize < sizeof(*tbl))
		return (EINVAL);

	tbl = sopt->sopt_val;
	if (tbl->tableid < 0) {
		struct ipfw_ioc_tbllist *list;
		int i;

		/*
		 * List available table ids.
		 */
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL)
				++cnt;
		}

		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
		if (sopt->sopt_valsize < sz) {
			bzero(sopt->sopt_val, sopt->sopt_valsize);
			return (E2BIG);
		}
		list = sopt->sopt_val;
		list->tablecnt = cnt;

		/* Second pass: fill in the ids counted above. */
		cnt = 0;
		for (i = 0; i < ipfw_table_max; ++i) {
			if (ctx->ipfw_tables[i] != NULL) {
				KASSERT(cnt < list->tablecnt,
				    ("invalid idx %d, cnt %d",
				     cnt, list->tablecnt));
				list->tables[cnt++] = i;
			}
		}
		sopt->sopt_valsize = sz;
		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* Count the entries to size the output buffer. */
	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);

	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
	if (sopt->sopt_valsize < sz) {
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return (E2BIG);
	}
	cont = sopt->sopt_val;
	cont->entcnt = cnt;

	/* Copy the entries out; see ipfw_table_cpent(). */
	cp.te = cont->ent;
	cp.te_idx = 0;
	cp.te_cnt = cnt;
	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);

	sopt->sopt_valsize = sz;
	return (0);
}
6181
/*
 * Add one entry to the table on this cpu and pass the message on to
 * the next netisr cpu.  Insertion failure (duplicate key) is only
 * tolerated on cpu0, before any other cpu has been modified; later
 * cpus must succeed or the per-cpu tables would diverge, so they
 * panic instead.
 */
static void
ipfw_table_add_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct radix_node_head *rnh;
	struct ipfw_tblent *te;

	ASSERT_NETISR_NCPUS(mycpuid);

	rnh = ctx->ipfw_tables[nm->tableid];

	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
	te->te_nodes->rn_key = (char *)&te->te_key;
	memcpy(&te->te_key, nm->key, sizeof(te->te_key));

	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
	    te->te_nodes) == NULL) {
		if (mycpuid == 0) {
			/* Nothing modified yet; report EEXIST. */
			kfree(te, M_IPFW);
			netisr_replymsg(&nm->base, EEXIST);
			return;
		}
		panic("rnh_addaddr failed");
	}

	/* Link siblings. */
	if (nm->sibling != NULL)
		nm->sibling->te_sibling = te;
	nm->sibling = te;

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6215
6216 static void
6217 ipfw_table_del_dispatch(netmsg_t nmsg)
6218 {
6219         struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6220         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6221         struct radix_node_head *rnh;
6222         struct radix_node *rn;
6223
6224         ASSERT_NETISR_NCPUS(mycpuid);
6225
6226         rnh = ctx->ipfw_tables[nm->tableid];
6227         rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6228         if (rn == NULL) {
6229                 if (mycpuid == 0) {
6230                         netisr_replymsg(&nm->base, ESRCH);
6231                         return;
6232                 }
6233                 panic("rnh_deladdr failed");
6234         }
6235         kfree(rn, M_IPFW);
6236
6237         netisr_forwardmsg(&nm->base, mycpuid + 1);
6238 }
6239
/*
 * Handle the IP_FW_TBL_ADD/IP_FW_TBL_DEL sockopts: validate the single
 * entry supplied by userland and add it to (or delete it from) the
 * table on all netisr cpus.
 */
static int
ipfw_table_alt(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_ioc_tblcont *tbl;
	struct ipfw_ioc_tblent *te;
	struct sockaddr_in key0;
	struct sockaddr *netmask = NULL, *key;
	struct netmsg_tblent nm;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;

	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
		return (EINVAL);
	/* Exactly one entry per request. */
	if (tbl->entcnt != 1)
		return (EINVAL);

	if (ctx->ipfw_tables[tbl->tableid] == NULL)
		return (ENOENT);
	te = &tbl->ent[0];

	/* The key must be a plain IPv4 sockaddr without a port. */
	if (te->key.sin_family != AF_INET ||
	    te->key.sin_port != 0 ||
	    te->key.sin_len != sizeof(struct sockaddr_in))
		return (EINVAL);
	key = (struct sockaddr *)&te->key;

	/* sin_len 0 means "no netmask" (host entry). */
	if (te->netmask.sin_len != 0) {
		if (te->netmask.sin_port != 0 ||
		    te->netmask.sin_len > sizeof(struct sockaddr_in))
			return (EINVAL);
		netmask = (struct sockaddr *)&te->netmask;
		/* Canonicalize: mask the key before insertion/lookup. */
		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
		key = (struct sockaddr *)&key0;
	}

	if (sopt->sopt_name == IP_FW_TBL_ADD) {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_add_dispatch);
	} else {
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_del_dispatch);
	}
	nm.key = key;
	nm.netmask = netmask;
	nm.tableid = tbl->tableid;
	nm.sibling = NULL;
	return (netisr_domsg_global(&nm.base));
}
6293
6294 static int
6295 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6296 {
6297         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6298
6299         te->te_use = 0;
6300         te->te_lastuse = 0;
6301         return (0);
6302 }
6303
6304 static void
6305 ipfw_table_zero_dispatch(netmsg_t nmsg)
6306 {
6307         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6308         struct radix_node_head *rnh;
6309
6310         ASSERT_NETISR_NCPUS(mycpuid);
6311
6312         rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6313         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6314
6315         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6316 }
6317
6318 static void
6319 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6320 {
6321         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6322         int i;
6323
6324         ASSERT_NETISR_NCPUS(mycpuid);
6325
6326         for (i = 0; i < ipfw_table_max; ++i) {
6327                 struct radix_node_head *rnh = ctx->ipfw_tables[i];
6328
6329                 if (rnh != NULL)
6330                         rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6331         }
6332         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6333 }
6334
6335 static int
6336 ipfw_table_zero(struct sockopt *sopt)
6337 {
6338         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6339         struct netmsg_base nm;
6340         struct ipfw_ioc_table *tbl;
6341
6342         ASSERT_NETISR0;
6343
6344         if (sopt->sopt_valsize != sizeof(*tbl))
6345                 return (EINVAL);
6346         tbl = sopt->sopt_val;
6347
6348         if (tbl->tableid < 0) {
6349                 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6350                     ipfw_table_zeroall_dispatch);
6351                 netisr_domsg_global(&nm);
6352                 return (0);
6353         } else if (tbl->tableid >= ipfw_table_max) {
6354                 return (EINVAL);
6355         } else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6356                 return (ENOENT);
6357         }
6358
6359         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6360             ipfw_table_zero_dispatch);
6361         nm.lmsg.u.ms_result = tbl->tableid;
6362         netisr_domsg_global(&nm);
6363
6364         return (0);
6365 }
6366
6367 static int
6368 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6369 {
6370         struct netmsg_tblexp *nm = xnm;
6371         struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6372
6373         if (te->te_expired) {
6374                 ipfw_table_killrn(nm->rnh, rn);
6375                 nm->expcnt++;
6376         }
6377         return (0);
6378 }
6379
6380 static void
6381 ipfw_table_expire_dispatch(netmsg_t nmsg)
6382 {
6383         struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6384         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6385         struct radix_node_head *rnh;
6386
6387         ASSERT_NETISR_NCPUS(mycpuid);
6388
6389         rnh = ctx->ipfw_tables[nm->tableid];
6390         nm->rnh = rnh;
6391         rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6392
6393         KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6394             ("not all expired addresses (%d) were deleted (%d)",
6395              nm->cnt * (mycpuid + 1), nm->expcnt));
6396
6397         netisr_forwardmsg(&nm->base, mycpuid + 1);
6398 }
6399
/*
 * Reap the marked entries of all tables on this cpu, then forward
 * the message to the next netisr cpu.
 */
static void
ipfw_table_expireall_dispatch(netmsg_t nmsg)
{
	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int i;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (i = 0; i < ipfw_table_max; ++i) {
		struct radix_node_head *rnh = ctx->ipfw_tables[i];

		if (rnh == NULL)
			continue;
		nm->rnh = rnh;
		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
	}

	/*
	 * Every cpu deletes nm->cnt marked entries, so after this cpu
	 * the accumulated total must be cnt * (mycpuid + 1).
	 */
	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm->cnt * (mycpuid + 1), nm->expcnt));

	netisr_forwardmsg(&nm->base, mycpuid + 1);
}
6424
6425 static int
6426 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6427 {
6428         struct netmsg_tblexp *nm = xnm;
6429         struct ipfw_tblent *te;
6430         time_t lastuse;
6431
6432         te = (struct ipfw_tblent *)rn;
6433         lastuse = te->te_lastuse;
6434
6435         while ((te = te->te_sibling) != NULL) {
6436                 if (te->te_lastuse > lastuse)
6437                         lastuse = te->te_lastuse;
6438         }
6439         if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6440                 /* Not expired */
6441                 return (0);
6442         }
6443
6444         te = (struct ipfw_tblent *)rn;
6445         te->te_expired = 1;
6446         while ((te = te->te_sibling) != NULL)
6447                 te->te_expired = 1;
6448         nm->cnt++;
6449
6450         return (0);
6451 }
6452
/*
 * IP_FW_TBL_EXPIRE handler: delete entries that have not been used
 * for tbl->expire seconds from one table (tbl->tableid >= 0) or all
 * tables (tbl->tableid < 0).
 *
 * Two passes: entries are first marked on netisr0 (consulting all
 * per-cpu siblings for the last use time), then a message circulates
 * through all netisr cpus deleting the marked entries.
 */
static int
ipfw_table_expire(struct sockopt *sopt)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_tblexp nm;
	struct ipfw_ioc_tblexp *tbl;
	struct radix_node_head *rnh;

	ASSERT_NETISR0;

	if (sopt->sopt_valsize != sizeof(*tbl))
		return (EINVAL);
	tbl = sopt->sopt_val;
	tbl->expcnt = 0;

	nm.expcnt = 0;
	nm.cnt = 0;
	nm.expire = tbl->expire;

	if (tbl->tableid < 0) {
		int i;

		/* Pass 1: mark expired entries in every table. */
		for (i = 0; i < ipfw_table_max; ++i) {
			rnh = ctx->ipfw_tables[i];
			if (rnh == NULL)
				continue;
			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
		}
		if (nm.cnt == 0) {
			/* No addresses can be expired. */
			return (0);
		}
		tbl->expcnt = nm.cnt;

		/* Pass 2: reap the marked entries on all netisr cpus. */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
		nm.tableid = -1;
		netisr_domsg_global(&nm.base);
		/* Every cpu must have deleted exactly nm.cnt entries. */
		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
		    ("not all expired addresses (%d) were deleted (%d)",
		     nm.cnt * netisr_ncpus, nm.expcnt));

		return (0);
	} else if (tbl->tableid >= ipfw_table_max) {
		return (EINVAL);
	}

	rnh = ctx->ipfw_tables[tbl->tableid];
	if (rnh == NULL)
		return (ENOENT);
	/* Pass 1: mark expired entries in the target table. */
	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
	if (nm.cnt == 0) {
		/* No addresses can be expired. */
		return (0);
	}
	tbl->expcnt = nm.cnt;

	/* Pass 2: reap the marked entries on all netisr cpus. */
	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_table_expire_dispatch);
	nm.tableid = tbl->tableid;
	netisr_domsg_global(&nm.base);
	/* Every cpu must have deleted exactly nm.cnt entries. */
	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
	    ("not all expired addresses (%d) were deleted (%d)",
	     nm.cnt * netisr_ncpus, nm.expcnt));
	return (0);
}
6519
6520 static void
6521 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6522 {
6523         struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6524
6525         KKASSERT((rule->rule_flags &
6526             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6527             (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6528         ipfw_free_rule(rule);
6529
6530         netisr_replymsg(&nmsg->base, 0);
6531 }
6532
/*
 * Walk the cross-reference free list and destroy every rule whose
 * siblings are no longer referenced by any inflight packet on any
 * netisr cpu.  Rules that are still inflight stay on the list; a
 * 1 second callout retries later.
 */
static void
ipfw_crossref_reap(void)
{
	struct ip_fw *rule, *prev = NULL;

	ASSERT_NETISR0;

	rule = ipfw_gd.ipfw_crossref_free;
	while (rule != NULL) {
		uint64_t inflight = 0;
		int i;

		/* Sum references held by inflight mbufs on all cpus. */
		for (i = 0; i < netisr_ncpus; ++i)
			inflight += rule->cross_rules[i]->cross_refs;
		if (inflight == 0) {
			struct ip_fw *f = rule;

			/*
			 * Unlink.
			 */
			rule = rule->next;
			if (prev != NULL)
				prev->next = rule;
			else
				ipfw_gd.ipfw_crossref_free = rule;

			/*
			 * Free.  Each sibling rule is freed on its own
			 * cpu; cpu0's reference is dropped last.
			 */
			for (i = 1; i < netisr_ncpus; ++i) {
				struct netmsg_base nm;

				netmsg_init(&nm, NULL, &curthread->td_msgport,
				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
				nm.lmsg.u.ms_resultp = f->cross_rules[i];
				netisr_domsg(&nm, i);
			}
			KKASSERT((f->rule_flags &
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
			ipfw_unref_rule(f);
		} else {
			prev = rule;
			rule = rule->next;
		}
	}

	if (ipfw_gd.ipfw_crossref_free != NULL) {
		/* Some rules are still inflight; retry in 1 second. */
		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
		    ipfw_crossref_timeo, NULL);
	}
}
6585
/*
 * {set|get}sockopt parser: dispatch IP_FW_* and IP_FW_TBL_* socket
 * options to their handlers.  Runs in netisr0.  Always attempts to
 * reap cross-referenced rules before returning, since several of the
 * handlers may have invalidated rules.
 */
static int
ipfw_ctl(struct sockopt *sopt)
{
	int error, rulenum;
	uint32_t *masks;
	size_t size;

	ASSERT_NETISR0;

	error = 0;

	switch (sopt->sopt_name) {
	case IP_FW_GET:
		error = ipfw_ctl_get_rules(sopt);
		break;

	case IP_FW_FLUSH:
		ipfw_flush(0 /* keep default rule */);
		break;

	case IP_FW_ADD:
		error = ipfw_ctl_add_rule(sopt);
		break;

	case IP_FW_DEL:
		/*
		 * IP_FW_DEL is used for deleting single rules or sets,
		 * and (ab)used to atomically manipulate sets.
		 * Argument size is used to distinguish between the two:
		 *    sizeof(uint32_t)
		 *	delete single rule or set of rules,
		 *	or reassign rules (or sets) to a different set.
		 *    2 * sizeof(uint32_t)
		 *	atomic disable/enable sets.
		 *	first uint32_t contains sets to be disabled,
		 *	second uint32_t contains sets to be enabled.
		 */
		masks = sopt->sopt_val;
		size = sopt->sopt_valsize;
		if (size == sizeof(*masks)) {
			/*
			 * Delete or reassign static rule
			 */
			error = ipfw_ctl_alter(masks[0]);
		} else if (size == (2 * sizeof(*masks))) {
			/*
			 * Set enable/disable
			 */
			ipfw_ctl_set_disable(masks[0], masks[1]);
		} else {
			error = EINVAL;
		}
		break;

	case IP_FW_ZERO:
	case IP_FW_RESETLOG: /* argument is an int, the rule number */
		rulenum = 0;

		/* A missing argument means "all rules". */
		if (sopt->sopt_val != 0) {
		    error = soopt_to_kbuf(sopt, &rulenum,
			    sizeof(int), sizeof(int));
		    if (error)
			break;
		}
		error = ipfw_ctl_zero_entry(rulenum,
			sopt->sopt_name == IP_FW_RESETLOG);
		break;

	case IP_FW_TBL_CREATE:
		error = ipfw_table_create(sopt);
		break;

	case IP_FW_TBL_ADD:
	case IP_FW_TBL_DEL:
		error = ipfw_table_alt(sopt);
		break;

	case IP_FW_TBL_FLUSH:
	case IP_FW_TBL_DESTROY:
		error = ipfw_table_flush(sopt);
		break;

	case IP_FW_TBL_GET:
		error = ipfw_table_get(sopt);
		break;

	case IP_FW_TBL_ZERO:
		error = ipfw_table_zero(sopt);
		break;

	case IP_FW_TBL_EXPIRE:
		error = ipfw_table_expire(sopt);
		break;

	default:
		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
		error = EINVAL;
	}

	/* Reap rules whose cross-references may have drained. */
	ipfw_crossref_reap();
	return error;
}
6691
/*
 * Finish the current keepalive round on this cpu: clear the
 * in-progress flag and schedule the next round.
 */
static void
ipfw_keepalive_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
	    ipfw_keepalive, NULL);
}
6702
/*
 * Continue an unfinished keepalive round in a fresh netmsg, so the
 * scan yields the netisr thread between batches.
 */
static void
ipfw_keepalive_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("keepalive more did not finish"));
	netisr_sendmsg_oncpu(nm);
}
6714
/*
 * Scan the state list starting at 'anchor': remove dead states and
 * send TCP keepalive segments for established states nearing their
 * expiration.  The anchor is moved past each scanned state, so the
 * scan can be resumed by ipfw_keepalive_more() when one of the
 * per-batch limits (scan/expire/keepalive counts) is hit.
 */
static void
ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0, kept = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		uint32_t ack_rev, ack_fwd;
		struct ipfw_flow_id id;
		uint8_t send_dir;

		if (scanned++ >= ipfw_state_scan_max) {
			/* Batch limit hit; resume in a new netmsg. */
			ipfw_keepalive_more(ctx);
			return;
		}

		/* Advance the anchor past this state. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		/*
		 * NOTE:
		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
		 * on slave xlat.
		 */
		if (s->st_type == O_ANCHOR)
			continue;

		if (IPFW_STATE_ISDEAD(s)) {
			ipfw_state_remove(ctx, s);
			if (++expired >= ipfw_state_expire_max) {
				/* Batch limit hit; resume later. */
				ipfw_keepalive_more(ctx);
				return;
			}
			continue;
		}

		/*
		 * Keep alive processing
		 */

		if (s->st_proto != IPPROTO_TCP)
			continue;
		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
			continue;
		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
		    s->st_expire))
			continue;	/* too early */

		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
		    &id.dst_ip, &id.dst_port);
		ack_rev = s->st_ack_rev;
		ack_fwd = s->st_ack_fwd;

#define SEND_FWD	0x1
#define SEND_REV	0x2

		if (IPFW_ISXLAT(s->st_type)) {
			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;

			/* Probe only in the direction this xlat matches. */
			if (x->xlat_dir == MATCH_FORWARD)
				send_dir = SEND_FWD;
			else
				send_dir = SEND_REV;
		} else {
			send_dir = SEND_FWD | SEND_REV;
		}

		if (send_dir & SEND_REV)
			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
		if (send_dir & SEND_FWD)
			send_pkt(&id, ack_fwd - 1, ack_rev, 0);

#undef SEND_FWD
#undef SEND_REV

		if (++kept >= ipfw_keepalive_max) {
			/* Batch limit hit; resume later. */
			ipfw_keepalive_more(ctx);
			return;
		}
	}
	/* Entire list scanned; this round is complete. */
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_done(ctx);
}
6801
/*
 * Resume an unfinished keepalive round from where the anchor was
 * left in the state list.
 */
static void
ipfw_keepalive_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_keepalive_anch;
	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		/* Keepalive disabled or no states left; stop the round. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_keepalive_done(ctx);
		return;
	}
	ipfw_keepalive_loop(ctx, anchor);
}
6823
/*
 * This procedure is only used to handle keepalives. It is invoked
 * every dyn_keepalive_period.  Starts a new keepalive round by
 * inserting the scan anchor at the head of the state list.
 */
static void
ipfw_keepalive_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
	    ("keepalive is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		/* Nothing to scan; just rearm the callout. */
		ipfw_keepalive_done(ctx);
		return;
	}

	anchor = &ctx->ipfw_keepalive_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_loop(ctx, anchor);
}
6853
/*
 * This procedure is only used to handle keepalives. It is invoked
 * every dyn_keepalive_period.  Callout handler: kick the keepalive
 * netmsg on this cpu unless the previous one is still inflight.
 */
static void
ipfw_keepalive(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
6871
/*
 * Run a redispatched packet through ip_input() on this cpu, letting
 * ipfw resume filtering at the sibling rule saved in the message
 * (via ctx->ipfw_cont_rule, consumed by ipfw_init_args()).
 */
static void
ipfw_ip_input_dispatch(netmsg_t nmsg)
{
	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct mbuf *m = nm->m;
	struct ip_fw *rule = nm->arg1;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(rule->cpuid == mycpuid,
	    ("rule does not belong to cpu%d", mycpuid));
	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
	    ("mbuf does not have ipfw continue rule"));

	KASSERT(ctx->ipfw_cont_rule == NULL,
	    ("pending ipfw continue rule"));
	ctx->ipfw_cont_rule = rule;
	ip_input(m);

	/* May not be cleared, if ipfw was unload/disabled. */
	ctx->ipfw_cont_rule = NULL;

	/*
	 * This rule is no longer used; decrement its cross_refs,
	 * so this rule can be deleted.
	 */
	rule->cross_refs--;
}
6900
/*
 * Redispatch a reassembled packet to the cpu owning its flow, so
 * filtering continues at this rule's sibling on the target cpu.
 */
static void
ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
{
	struct netmsg_genpkt *nm;

	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));

	/*
	 * NOTE:
	 * Bump cross_refs to prevent this rule and its siblings
	 * from being deleted, while this mbuf is inflight.  The
	 * cross_refs of the sibling rule on the target cpu will
	 * be decremented, once this mbuf is going to be filtered
	 * on the target cpu.
	 */
	rule->cross_refs++;
	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;

	/* Hand the packet to the target cpu's ip_input dispatcher. */
	nm = &m->m_hdr.mh_genmsg;
	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
	    ipfw_ip_input_dispatch);
	nm->m = m;
	nm->arg1 = rule->cross_rules[cpuid];
	netisr_sendmsg(&nm->base, cpuid);
}
6926
/*
 * Prepare the ip_fw_args for ipfw_chk(): recover the rule (and any
 * xlat state) attached to the mbuf by a previous pass, either a
 * dummynet tag or a cross-cpu "continue" redispatch, and clear the
 * corresponding mbuf flags.
 */
static void
ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
{

	args->flags = 0;
	args->rule = NULL;
	args->xlat = NULL;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		struct m_tag *mtag;

		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args->rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
		struct ipfw_context *ctx = ipfw_ctx[mycpuid];

		/* Resume at the rule saved by ipfw_ip_input_dispatch(). */
		KKASSERT(ctx->ipfw_cont_rule != NULL);
		args->rule = ctx->ipfw_cont_rule;
		ctx->ipfw_cont_rule = NULL;

		if (ctx->ipfw_cont_xlat != NULL) {
			/* Also resume the pending xlat, if any. */
			args->xlat = ctx->ipfw_cont_xlat;
			ctx->ipfw_cont_xlat = NULL;
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
				args->flags |= IP_FWARG_F_XLATINS;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
			}
			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
				args->flags |= IP_FWARG_F_XLATFWD;
				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
			}
		}
		KKASSERT((m->m_pkthdr.fw_flags &
		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);

		args->flags |= IP_FWARG_F_CONT;
		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
	}

	args->eh = NULL;
	args->oif = oif;
	args->m = m;
}
6976
6977 static int
6978 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6979 {
6980         struct ip_fw_args args;
6981         struct mbuf *m = *m0;
6982         int tee = 0, error = 0, ret;
6983
6984         ipfw_init_args(&args, m, NULL);
6985
6986         ret = ipfw_chk(&args);
6987         m = args.m;
6988         if (m == NULL) {
6989                 if (ret != IP_FW_REDISPATCH)
6990                         error = EACCES;
6991                 goto back;
6992         }
6993
6994         switch (ret) {
6995         case IP_FW_PASS:
6996                 break;
6997
6998         case IP_FW_DENY:
6999                 m_freem(m);
7000                 m = NULL;
7001                 error = EACCES;
7002                 break;
7003
7004         case IP_FW_DUMMYNET:
7005                 /* Send packet to the appropriate pipe */
7006                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
7007                 break;
7008
7009         case IP_FW_TEE:
7010                 tee = 1;
7011                 /* FALL THROUGH */
7012
7013         case IP_FW_DIVERT:
7014                 /*
7015                  * Must clear bridge tag when changing
7016                  */
7017                 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
7018                 if (ip_divert_p != NULL) {
7019                         m = ip_divert_p(m, tee, 1);
7020                 } else {
7021                         m_freem(m);
7022                         m = NULL;
7023                         /* not sure this is the right error msg */
7024                         error = EACCES;
7025                 }
7026                 break;
7027
7028         default:
7029                 panic("unknown ipfw return value: %d", ret);
7030         }
7031 back:
7032         *m0 = m;
7033         return error;
7034 }
7035
7036 static int
7037 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7038 {
7039         struct ip_fw_args args;
7040         struct mbuf *m = *m0;
7041         int tee = 0, error = 0, ret;
7042
7043         ipfw_init_args(&args, m, ifp);
7044
7045         ret = ipfw_chk(&args);
7046         m = args.m;
7047         if (m == NULL) {
7048                 if (ret != IP_FW_REDISPATCH)
7049                         error = EACCES;
7050                 goto back;
7051         }
7052
7053         switch (ret) {
7054         case IP_FW_PASS:
7055                 break;
7056
7057         case IP_FW_DENY:
7058                 m_freem(m);
7059                 m = NULL;
7060                 error = EACCES;
7061                 break;
7062
7063         case IP_FW_DUMMYNET:
7064                 m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7065                 break;
7066
7067         case IP_FW_TEE:
7068                 tee = 1;
7069                 /* FALL THROUGH */
7070
7071         case IP_FW_DIVERT:
7072                 if (ip_divert_p != NULL) {
7073                         m = ip_divert_p(m, tee, 0);
7074                 } else {
7075                         m_freem(m);
7076                         m = NULL;
7077                         /* not sure this is the right error msg */
7078                         error = EACCES;
7079                 }
7080                 break;
7081
7082         default:
7083                 panic("unknown ipfw return value: %d", ret);
7084         }
7085 back:
7086         *m0 = m;
7087         return error;
7088 }
7089
7090 static void
7091 ipfw_hook(void)
7092 {
7093         struct pfil_head *pfh;
7094
7095         ASSERT_NETISR0;
7096
7097         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7098         if (pfh == NULL)
7099                 return;
7100
7101         pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7102         pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7103 }
7104
7105 static void
7106 ipfw_dehook(void)
7107 {
7108         struct pfil_head *pfh;
7109
7110         ASSERT_NETISR0;
7111
7112         pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7113         if (pfh == NULL)
7114                 return;
7115
7116         pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7117         pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7118 }
7119
7120 static int
7121 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7122 {
7123         int dyn_cnt;
7124
7125         dyn_cnt = ipfw_state_cntcoll();
7126         dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7127
7128         return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7129 }
7130
7131 static int
7132 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7133 {
7134         int state_cnt;
7135
7136         state_cnt = ipfw_state_cntcoll();
7137         return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7138 }
7139
7140 static int
7141 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7142 {
7143         int state_max, error;
7144
7145         state_max = ipfw_state_max;
7146         error = sysctl_handle_int(oidp, &state_max, 0, req);
7147         if (error || req->newptr == NULL)
7148                 return (error);
7149
7150         if (state_max < 1)
7151                 return (EINVAL);
7152
7153         ipfw_state_max_set(state_max);
7154         return (0);
7155 }
7156
7157 static int
7158 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7159 {
7160         int dyn_max, error;
7161
7162         dyn_max = ipfw_state_max + ipfw_track_max;
7163
7164         error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7165         if (error || req->newptr == NULL)
7166                 return (error);
7167
7168         if (dyn_max < 2)
7169                 return (EINVAL);
7170
7171         ipfw_state_max_set(dyn_max / 2);
7172         ipfw_track_max = dyn_max / 2;
7173         return (0);
7174 }
7175
7176 static void
7177 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7178 {
7179         int enable = nmsg->lmsg.u.ms_result;
7180
7181         ASSERT_NETISR0;
7182
7183         if (fw_enable == enable)
7184                 goto reply;
7185
7186         fw_enable = enable;
7187         if (fw_enable)
7188                 ipfw_hook();
7189         else
7190                 ipfw_dehook();
7191 reply:
7192         netisr_replymsg(&nmsg->base, 0);
7193 }
7194
7195 static int
7196 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7197 {
7198         struct netmsg_base nmsg;
7199         int enable, error;
7200
7201         enable = fw_enable;
7202         error = sysctl_handle_int(oidp, &enable, 0, req);
7203         if (error || req->newptr == NULL)
7204                 return error;
7205
7206         netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7207             ipfw_sysctl_enable_dispatch);
7208         nmsg.lmsg.u.ms_result = enable;
7209
7210         return netisr_domsg(&nmsg, 0);
7211 }
7212
7213 static int
7214 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7215 {
7216         return sysctl_int_range(oidp, arg1, arg2, req,
7217                IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7218 }
7219
7220 static int
7221 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7222 {
7223
7224         return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7225 }
7226
7227 static int
7228 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7229 {
7230         u_long stat = 0;
7231         int cpu, error;
7232
7233         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7234                 stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7235
7236         error = sysctl_handle_long(oidp, &stat, 0, req);
7237         if (error || req->newptr == NULL)
7238                 return (error);
7239
7240         /* Zero out this stat. */
7241         for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7242                 *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7243         return (0);
7244 }
7245
/*
 * Per-cpu ipfw context construction, run on each netisr cpu in turn
 * (the message is forwarded cpu-to-cpu at the bottom).  Allocates the
 * cpu-local context including its variable-length table array, wires
 * up the expire/keepalive/xlat callouts and their netisr messages, and
 * installs this cpu's copy of the default rule.
 */
static void
ipfw_ctx_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx;
	struct ip_fw *def_rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * The context ends with a flexible table array; size the
	 * allocation for ipfw_table_max tables.
	 */
	ctx = kmalloc(__offsetof(struct ipfw_context,
	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);

	/* Per-cpu state (dynamic rule) bookkeeping. */
	RB_INIT(&ctx->ipfw_state_tree);
	TAILQ_INIT(&ctx->ipfw_state_list);

	/* Per-cpu track bookkeeping. */
	RB_INIT(&ctx->ipfw_track_tree);
	TAILQ_INIT(&ctx->ipfw_track_list);

	/*
	 * State expiration: a callout kicks the expire netmsg; the
	 * dropable "more" netmsg continues an expire run that did not
	 * finish in one pass.  The anchor is an O_ANCHOR-typed
	 * placeholder state (presumably a list scan marker for the
	 * expire code -- see ipfw_state_expire_dispatch).
	 */
	callout_init_mp(&ctx->ipfw_stateto_ch);
	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);

	/* Track expiration follows the same callout + netmsg pattern. */
	callout_init_mp(&ctx->ipfw_trackto_ch);
	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);

	/* Keepalive probing, again with its own anchor placeholder. */
	callout_init_mp(&ctx->ipfw_keepalive_ch);
	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);

	/* Deferred reaping of xlat states queued on ipfw_xlatreap. */
	callout_init_mp(&ctx->ipfw_xlatreap_ch);
	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
	TAILQ_INIT(&ctx->ipfw_xlatreap);

	ipfw_ctx[mycpuid] = ctx;

	/* Build this cpu's copy of the default (catch-all) rule. */
	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);

	def_rule->act_ofs = 0;
	def_rule->rulenum = IPFW_DEFAULT_RULE;
	def_rule->cmd_len = 1;
	def_rule->set = IPFW_DEFAULT_SET;

	def_rule->cmd[0].len = 1;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	def_rule->cmd[0].opcode = O_ACCEPT;
#else
	/* Compile-time default can be overridden by the tunable. */
	if (filters_default_to_accept)
		def_rule->cmd[0].opcode = O_ACCEPT;
	else
		def_rule->cmd[0].opcode = O_DENY;
#endif

	def_rule->refcnt = 1;
	def_rule->cpuid = mycpuid;

	/* Install the default rule */
	ctx->ipfw_default_rule = def_rule;
	ctx->ipfw_layer3_chain = def_rule;

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, def_rule);

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_inc_static_count(def_rule);

	/* Continue initialization on the next netisr cpu. */
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
7324
/*
 * Netisr handler that garbage-collects crossref'ed rules.  The
 * message is a pre-initialized dropable message re-sent by
 * ipfw_crossref_timeo(); reply it first so it can be reused, then do
 * the actual reap outside the critical section.
 */
static void
ipfw_crossref_reap_dispatch(netmsg_t nmsg)
{

	crit_enter();
	/* Reply ASAP */
	netisr_replymsg(&nmsg->base, 0);
	crit_exit();
	ipfw_crossref_reap();
}
7335
/*
 * Callout handler (cpu0 only) that kicks the crossref reap netmsg.
 * The message is re-sent only when MSGF_DONE is set, i.e. it has been
 * replied and is free for reuse; otherwise a reap is already pending.
 */
static void
ipfw_crossref_timeo(void *dummy __unused)
{
	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;

	KKASSERT(mycpuid == 0);

	/* Critical section guards the flags test against the reply path. */
	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}
7348
7349 static void
7350 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7351 {
7352         struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7353         struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7354         struct ip_fw *f;
7355
7356         ASSERT_NETISR_NCPUS(mycpuid);
7357
7358         for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7359                 int l, cmdlen;
7360                 ipfw_insn *cmd;
7361
7362                 if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7363                         continue;
7364
7365                 for (l = f->cmd_len, cmd = f->cmd; l > 0;
7366                      l -= cmdlen, cmd += cmdlen) {
7367                         cmdlen = F_LEN(cmd);
7368                         if (cmd->opcode == O_IP_SRC_IFIP ||
7369                             cmd->opcode == O_IP_DST_IFIP) {
7370                                 if (strncmp(ifp->if_xname,
7371                                     ((ipfw_insn_ifip *)cmd)->ifname,
7372                                     IFNAMSIZ) == 0)
7373                                         cmd->arg1 &= ~IPFW_IFIP_VALID;
7374                         }
7375                 }
7376         }
7377         netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7378 }
7379
7380 static void
7381 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7382     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7383 {
7384         struct netmsg_base nm;
7385
7386         netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7387             ipfw_ifaddr_dispatch);
7388         nm.lmsg.u.ms_resultp = ifp;
7389         netisr_domsg_global(&nm);
7390 }
7391
/*
 * One-time ipfw initialization, run on netisr0.  Initializes global
 * state, builds the per-cpu contexts (and default rules), installs the
 * ip_fw_* function pointers, starts the periodic expire/keepalive
 * callouts on every netisr cpu and, if enabled, attaches the packet
 * hooks.  Replies EEXIST if the firewall is already loaded.
 */
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	/* Clamp the table-count tunable to a sane range. */
	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
		ipfw_table_max = UINT16_MAX;

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	/* GC for freed crossref rules. */
	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);

	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	/* Build the per-cpu contexts on all netisr cpus, in order. */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	/* Publish the firewall entry points. */
	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	/* The "logging " tail of this line is completed below. */
	kprintf("ipfw2 initialized, default to %s, logging ",
		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
		O_ACCEPT ? "accept" : "deny");

#ifdef IPFIREWALL_VERBOSE
	fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
	if (fw_verbose == 0) {
		kprintf("disabled\n");
	} else if (verbose_limit == 0) {
		kprintf("unlimited\n");
	} else {
		kprintf("limited to %d packets/entry by default\n",
			verbose_limit);
	}

	ip_fw_loaded = 1;
	/* Start the periodic state/track expire and keepalive scans. */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
		    ipfw_state_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
		    ipfw_track_expire_ipifunc, NULL, cpu);
		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
		    ipfw_keepalive, NULL, cpu);
	}

	if (fw_enable)
		ipfw_hook();

	/* Invalidate cached ifaddr matches on interface address changes. */
	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
	    NULL, EVENTHANDLER_PRI_ANY);
	if (ipfw_ifaddr_event == NULL)
		kprintf("ipfw: ifaddr_event register failed\n");

reply:
	netisr_replymsg(&nmsg->base, error);
}
7470
7471 static int
7472 ipfw_init(void)
7473 {
7474         struct netmsg_base smsg;
7475
7476         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7477             ipfw_init_dispatch);
7478         return netisr_domsg(&smsg, 0);
7479 }
7480
7481 #ifdef KLD_MODULE
7482
/*
 * Per-cpu teardown, run on each netisr cpu in turn: stop this cpu's
 * periodic callouts, drop any still-queued dropable netmsgs, and flush
 * the per-cpu tables.  The context itself is freed later on netisr0
 * by ipfw_fini_dispatch().
 */
static void
ipfw_ctx_fini_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	callout_stop_sync(&ctx->ipfw_stateto_ch);
	callout_stop_sync(&ctx->ipfw_trackto_ch);
	callout_stop_sync(&ctx->ipfw_keepalive_ch);
	callout_stop_sync(&ctx->ipfw_xlatreap_ch);

	/* Dropable messages may still be queued; discard them atomically. */
	crit_enter();
	netisr_dropmsg(&ctx->ipfw_stateexp_more);
	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
	netisr_dropmsg(&ctx->ipfw_trackexp_more);
	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
	netisr_dropmsg(&ctx->ipfw_keepalive_more);
	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
	crit_exit();

	ipfw_table_flushall_oncpu(ctx, 1);

	/* Continue teardown on the next netisr cpu. */
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
7509
/*
 * Module unload, run on netisr0.  Replies EBUSY if crossref'ed rules
 * are still referenced after a final reap; otherwise detaches the
 * hooks, tears down the per-cpu contexts, flushes all rules
 * (including the default rule) and frees the contexts.
 */
static void
ipfw_fini_dispatch(netmsg_t nmsg)
{
	struct netmsg_base nm;
	int error = 0, cpu;

	ASSERT_NETISR0;

	ipfw_crossref_reap();

	if (ipfw_gd.ipfw_refcnt != 0) {
		error = EBUSY;
		goto reply;
	}

	ip_fw_loaded = 0;
	ipfw_dehook();

	/* Synchronize any inflight state/track expire IPIs. */
	lwkt_synchronize_ipiqs("ipfwfini");

	/* Run the per-cpu teardown on every netisr cpu, in order. */
	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_fini_dispatch);
	netisr_domsg_global(&nm);

	/* Stop the crossref GC and discard its pending message. */
	callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
	crit_enter();
	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
	crit_exit();

	if (ipfw_ifaddr_event != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);

	ip_fw_chk_ptr = NULL;
	ip_fw_ctl_ptr = NULL;
	ip_fw_dn_io_ptr = NULL;
	ipfw_flush(1 /* kill default rule */);

	/* Free per-cpu contexts */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		kfree(ipfw_ctx[cpu], M_IPFW);

	kprintf("IP firewall unloaded\n");
reply:
	netisr_replymsg(&nmsg->base, error);
}
7556
/*
 * Flush all rules except the default rule, then reap any rules that
 * were moved to the crossref list; run on netisr0 from ipfw_fini().
 */
static void
ipfw_fflush_dispatch(netmsg_t nmsg)
{

	ipfw_flush(0 /* keep default rule */);
	ipfw_crossref_reap();
	netisr_replymsg(&nmsg->base, 0);
}
7565
7566 static int
7567 ipfw_fini(void)
7568 {
7569         struct netmsg_base smsg;
7570         int i = 0;
7571
7572         for (;;) {
7573                 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7574                     ipfw_fflush_dispatch);
7575                 netisr_domsg(&smsg, 0);
7576
7577                 if (ipfw_gd.ipfw_refcnt == 0)
7578                         break;
7579                 kprintf("ipfw: flush pending %d\n", ++i);
7580                 tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7581         }
7582
7583         netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7584             ipfw_fini_dispatch);
7585         return netisr_domsg(&smsg, 0);
7586 }
7587
7588 #endif  /* KLD_MODULE */
7589
7590 static int
7591 ipfw_modevent(module_t mod, int type, void *unused)
7592 {
7593         int err = 0;
7594
7595         switch (type) {
7596         case MOD_LOAD:
7597                 err = ipfw_init();
7598                 break;
7599
7600         case MOD_UNLOAD:
7601 #ifndef KLD_MODULE
7602                 kprintf("ipfw statically compiled, cannot unload\n");
7603                 err = EBUSY;
7604 #else
7605                 err = ipfw_fini();
7606 #endif
7607                 break;
7608         default:
7609                 break;
7610         }
7611         return err;
7612 }
7613
/*
 * Kernel module registration: name, event handler, no private data.
 */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);