pf: Update packet filter to the version that comes with OpenBSD 4.1
author Jan Lentfer <Jan.Lentfer@web.de>
Sun, 8 Aug 2010 07:44:38 +0000 (09:44 +0200)
committer Jan Lentfer <Jan.Lentfer@web.de>
Sun, 8 Aug 2010 08:18:38 +0000 (10:18 +0200)
    The original OpenBSD 4.1 defaults to "keep state flags S/SA" for
    all pass rules. In contrast to that we default to "no state". As
    in earlier versions of pf in DragonFly, the default keep-state
    policy can still be set with the keep-policy option (e.g. "set
    keep-policy keep state (pickups)").

    DragonFly additions to pf have been kept: fairq support,
    pickups.

Detailed Info on changes/additions:
* ALTQ: Fix altq to work with pf_mtag
Patch by Matthew Dillon
* libkern: Revert commit e104539
strchr was added to libkern.h together with strrchr
* net/if.h: add interface groups
Imported from FreeBSD.
* netinet6/in6.h: add macros
IN6_IS_ADDR_MC_INTFACELOCAL
IN6_IS_SCOPE_EMBED
IPV6_ADDR_SCOPE_INTFACELOCAL
* sys/libkern.h: Add strchr and strrchr as inline functions
Brought in from FreeBSD
* sys/net/if_var.h: Import interface groups
Import interface groups and event handlers from FreeBSD
* sys/net/if_var.h: add if_pf_kif, if_groups to struct ifnet
obtained from: Open/FreeBSD
* net/if_types.h: add IFT_ENC to non-IANA-assignments
obtained from Open/FreeBSD
* net/bpf.c: add bpf_mtap_hdr from OpenBSD
Con up a minimal dummy header to pacify bpf.  Allocate
(only) a struct m_hdr on the stack.

55 files changed:
include/string.h
share/man/man4/pf.4
share/man/man4/pflog.4
share/man/man4/pfsync.4
sys/conf/files
sys/libkern/strchr.c [deleted file]
sys/net/altq/altq_cbq.c
sys/net/altq/altq_fairq.c
sys/net/altq/altq_hfsc.c
sys/net/altq/altq_priq.c
sys/net/altq/altq_red.c
sys/net/altq/altq_subr.c
sys/net/altq/altq_var.h
sys/net/bpf.c
sys/net/bpf.h
sys/net/if.h
sys/net/if_types.h
sys/net/if_var.h
sys/net/pf/Makefile
sys/net/pf/if_pflog.c
sys/net/pf/if_pflog.h
sys/net/pf/if_pfsync.c
sys/net/pf/if_pfsync.h
sys/net/pf/pf.c
sys/net/pf/pf_if.c
sys/net/pf/pf_ioctl.c
sys/net/pf/pf_norm.c
sys/net/pf/pf_osfp.c
sys/net/pf/pf_ruleset.c [new file with mode: 0644]
sys/net/pf/pf_subr.c
sys/net/pf/pf_table.c
sys/net/pf/pfvar.h
sys/netinet6/in6.h
sys/sys/libkern.h
sys/sys/mbuf.h
sys/sys/sockio.h
usr.sbin/authpf/Makefile
usr.sbin/authpf/authpf.c
usr.sbin/authpf/pathnames.h
usr.sbin/pfctl/Makefile
usr.sbin/pfctl/parse.y
usr.sbin/pfctl/pf.conf.5
usr.sbin/pfctl/pf.os.5
usr.sbin/pfctl/pf_print_state.c
usr.sbin/pfctl/pfctl.8
usr.sbin/pfctl/pfctl.c
usr.sbin/pfctl/pfctl.h
usr.sbin/pfctl/pfctl_altq.c
usr.sbin/pfctl/pfctl_optimize.c [new file with mode: 0644]
usr.sbin/pfctl/pfctl_osfp.c
usr.sbin/pfctl/pfctl_parser.c
usr.sbin/pfctl/pfctl_parser.h
usr.sbin/pfctl/pfctl_qstats.c
usr.sbin/pfctl/pfctl_radix.c
usr.sbin/pfctl/pfctl_table.c

index f2d6766..a9885df 100644 (file)
@@ -114,7 +114,9 @@ size_t       strnlen(const char *, size_t) __pure;
 char   *strnstr(const char *, const char *, size_t) __pure;
 #endif
 char   *strpbrk(const char *, const char *) __pure;
+#if !defined(_KERNEL_VIRTUAL)
 char   *strrchr(const char *, int) __pure;
+#endif
 #if __BSD_VISIBLE
 #if !defined(_KERNEL_VIRTUAL)
 char   *strsep(char **, const char *);
index 187a050..3a57046 100644 (file)
@@ -1,5 +1,4 @@
-.\"    $OpenBSD: pf.4,v 1.46 2004/02/19 21:29:51 cedric Exp $
-.\"    $DragonFly: src/share/man/man4/pf.4,v 1.5 2007/05/19 17:32:12 swildner Exp $
+.\"    $OpenBSD: pf.4,v 1.58 2007/02/09 11:39:06 henning Exp $
 .\"
 .\" Copyright (C) 2001, Kjell Wooding.  All rights reserved.
 .\"
@@ -50,34 +49,47 @@ The most commonly used functions are covered by
 .Xr pfctl 8 .
 .Pp
 Manipulations like loading a ruleset that involve more than a single
-ioctl call require a so-called ticket, which prevents the occurrence of
+.Xr ioctl 2
+call require a so-called
+.Em ticket ,
+which prevents the occurrence of
 multiple concurrent manipulations.
 .Pp
-Fields of ioctl parameter structures that refer to packet data (like
+Fields of
+.Xr ioctl 2
+parameter structures that refer to packet data (like
 addresses and ports) are generally expected in network byte-order.
-.Sh FILES
-.Bl -tag -width /dev/pf -compact
-.It Pa /dev/pf
-packet filtering device.
-.El
+.Pp
+Rules and address tables are contained in so-called
+.Em anchors .
+When servicing an
+.Xr ioctl 2
+request, if the anchor field of the argument structure is empty,
+the kernel will use the default anchor (i.e., the main ruleset)
+in operations.
+Anchors are specified by name and may be nested, with components
+separated by
+.Sq /
+characters, similar to how file system hierarchies are laid out.
+The final component of the anchor path is the anchor under which
+operations will be performed.
 .Sh IOCTL INTERFACE
-pf supports the following
+.Nm
+supports the following
 .Xr ioctl 2
-commands:
+commands, available through
+.Aq Pa net/pfvar.h :
 .Bl -tag -width xxxxxx
 .It Dv DIOCSTART
-Starts the packet filter.
+Start the packet filter.
 .It Dv DIOCSTOP
-Stops the packet filter.
+Stop the packet filter.
 .It Dv DIOCSTARTALTQ
-Starts the
-.Xr altq 4
-bandwidth control system.
+Start the ALTQ bandwidth control system (see
+.Xr altq 9 ) .
 .It Dv DIOCSTOPALTQ
-Stops the
-.Xr altq 4
-bandwidth control system.
-.It Dv DIOCBEGINADDRS  Fa "struct pfioc_pooladdr"
+Stop the ALTQ bandwidth control system.
+.It Dv DIOCBEGINADDRS Fa "struct pfioc_pooladdr *pp"
 .Bd -literal
 struct pfioc_pooladdr {
        u_int32_t               action;
@@ -87,24 +99,22 @@ struct pfioc_pooladdr {
        u_int8_t                r_action;
        u_int8_t                r_last;
        u_int8_t                af;
-       char                    anchor[PF_ANCHOR_NAME_SIZE];
-       char                    ruleset[PF_RULESET_NAME_SIZE];
+       char                    anchor[MAXPATHLEN];
        struct pf_pooladdr      addr;
 };
 .Ed
 .Pp
-Clears the buffer address pool
-and returns a
+Clear the buffer address pool and get a
 .Va ticket
 for subsequent
 .Dv DIOCADDADDR ,
-.Dv DIOCADDRULE
+.Dv DIOCADDRULE ,
 and
 .Dv DIOCCHANGERULE
 calls.
-.It Dv DIOCADDADDR     Fa "struct pfioc_pooladdr"
+.It Dv DIOCADDADDR Fa "struct pfioc_pooladdr *pp"
 .Pp
-Adds pool address
+Add the pool address
 .Va addr
 to the buffer address pool to be used in the following
 .Dv DIOCADDRULE
@@ -112,119 +122,120 @@ or
 .Dv DIOCCHANGERULE
 call.
 All other members of the structure are ignored.
-.It Dv DIOCADDRULE     Fa "struct pfioc_rule"
+.It Dv DIOCADDRULE Fa "struct pfioc_rule *pr"
 .Bd -literal
 struct pfioc_rule {
        u_int32_t       action;
        u_int32_t       ticket;
        u_int32_t       pool_ticket;
        u_int32_t       nr;
-       char            anchor[PF_ANCHOR_NAME_SIZE];
-       char            ruleset[PF_RULESET_NAME_SIZE];
+       char            anchor[MAXPATHLEN];
+       char            anchor_call[MAXPATHLEN];
        struct pf_rule  rule;
 };
 .Ed
 .Pp
-Adds
+Add
 .Va rule
 at the end of the inactive ruleset.
-Requires
+This call requires a
 .Va ticket
-obtained through preceding
+obtained through preceding
 .Dv DIOCXBEGIN
-call, and
+call and a
 .Va pool_ticket
-obtained through
+obtained through a
 .Dv DIOCBEGINADDRS
 call.
 .Dv DIOCADDADDR
 must also be called if any pool addresses are required.
 The optional
 .Va anchor
-and
-.Va ruleset
-names indicate the anchor and ruleset in which to append the rule.
+name indicates the anchor in which to append the rule.
 .Va nr
 and
 .Va action
 are ignored.
-.It Dv DIOCADDALTQ     Fa "struct pfioc_altq"
-Adds
+.It Dv DIOCADDALTQ Fa "struct pfioc_altq *pa"
+Add an ALTQ discipline or queue.
 .Bd -literal
 struct pfioc_altq {
        u_int32_t       action;
        u_int32_t       ticket;
        u_int32_t       nr;
-       struct pf_altq   altq;
+       struct pf_altq  altq;
 };
 .Ed
-.It Dv DIOCGETRULES    Fa "struct pfioc_rule"
-Returns
+.It Dv DIOCGETRULES Fa "struct pfioc_rule *pr"
+Get a
 .Va ticket
 for subsequent
 .Dv DIOCGETRULE
-calls and
+calls and the number
 .Va nr
 of rules in the active ruleset.
-.It Dv DIOCGETRULE     Fa "struct pfioc_rule"
-Returns
+.It Dv DIOCGETRULE Fa "struct pfioc_rule *pr"
+Get a
 .Va rule
-number
+by its number
 .Va nr
-using
+using the
 .Va ticket
 obtained through a preceding
 .Dv DIOCGETRULES
 call.
-.It Dv DIOCGETADDRS    Fa "struct pfioc_pooladdr"
-Returns
+If
+.Va action
+is set to
+.Dv PF_GET_CLR_CNTR ,
+the per-rule statistics on the requested rule are cleared.
+.It Dv DIOCGETADDRS Fa "struct pfioc_pooladdr *pp"
+Get a
 .Va ticket
 for subsequent
 .Dv DIOCGETADDR
-calls and
+calls and the number
 .Va nr
 of pool addresses in the rule specified with
 .Va r_action ,
 .Va r_num ,
-.Va anchor
 and
-.Va ruleset .
-.It Dv DIOCGETADDR     Fa "struct pfioc_pooladdr"
-Returns pool address
+.Va anchor .
+.It Dv DIOCGETADDR Fa "struct pfioc_pooladdr *pp"
+Get the pool address
 .Va addr
-number
+by its number
 .Va nr
 from the rule specified with
 .Va r_action ,
 .Va r_num ,
-.Va anchor
 and
-.Va ruleset
-using
+.Va anchor
+using the
 .Va ticket
 obtained through a preceding
 .Dv DIOCGETADDRS
 call.
-.It Dv DIOCGETALTQS    Fa "struct pfioc_altq"
-Returns
+.It Dv DIOCGETALTQS Fa "struct pfioc_altq *pa"
+Get a
 .Va ticket
 for subsequent
 .Dv DIOCGETALTQ
-calls and
+calls and the number
 .Va nr
 of queues in the active list.
-.It Dv DIOCGETALTQ     Fa "struct pfioc_altq"
-Returns
+.It Dv DIOCGETALTQ Fa "struct pfioc_altq *pa"
+Get the queueing discipline
 .Va altq
-number
+by its number
 .Va nr
-using
+using the
 .Va ticket
 obtained through a preceding
 .Dv DIOCGETALTQS
 call.
-.It Dv DIOCGETQSTATS   Fa "struct pfioc_qstats"
-Returns statistics on a queue.
+.It Dv DIOCGETQSTATS Fa "struct pfioc_qstats *pq"
+Get the statistics on a queue.
 .Bd -literal
 struct pfioc_qstats {
        u_int32_t        ticket;
@@ -235,54 +246,99 @@ struct pfioc_qstats {
 };
 .Ed
 .Pp
-A pointer to a buffer of statistics
-.Va buf
+This call fills in a pointer to the buffer of statistics
+.Va buf ,
 of length
-.Va nbytes
+.Va nbytes ,
 for the queue specified by
 .Va nr .
-.It Dv DIOCADDSTATE    Fa "struct pfioc_state"
-Adds a state entry.
-.It Dv DIOCGETSTATE    Fa "struct pfioc_state"
+.It Dv DIOCGETRULESETS Fa "struct pfioc_ruleset *pr"
+.Bd -literal
+struct pfioc_ruleset {
+       u_int32_t        nr;
+       char             path[MAXPATHLEN];
+       char             name[PF_ANCHOR_NAME_SIZE];
+};
+.Ed
+.Pp
+Get the number
+.Va nr
+of rulesets (i.e., anchors) directly attached to the anchor named by
+.Va path
+for use in subsequent
+.Dv DIOCGETRULESET
+calls.
+Nested anchors, since they are not directly attached to the given
+anchor, will not be included.
+This ioctl returns
+.Er EINVAL
+if the given anchor does not exist.
+.It Dv DIOCGETRULESET Fa "struct pfioc_ruleset *pr"
+Get a ruleset (i.e., an anchor)
+.Va name
+by its number
+.Va nr
+from the given anchor
+.Va path ,
+the maximum number of which can be obtained from a preceding
+.Dv DIOCGETRULESETS
+call.
+This ioctl returns
+.Er EINVAL
+if the given anchor does not exist or
+.Er EBUSY
+if another process is concurrently updating a ruleset.
+.It Dv DIOCADDSTATE Fa "struct pfioc_state *ps"
+Add a state entry.
 .Bd -literal
 struct pfioc_state {
        u_int32_t        nr;
        struct pf_state  state;
 };
 .Ed
-.Pp
-Extracts the entry with the specified number from the state table.
-.It Dv DIOCKILLSTATES  Fa "struct pfioc_state_kill"
-Removes matching entries from the state table.
-Returns the number of killed states in psk_af.
+.It Dv DIOCGETSTATE Fa "struct pfioc_state *ps"
+Extract the entry with the specified number
+.Va nr
+from the state table.
+.It Dv DIOCKILLSTATES Fa "struct pfioc_state_kill *psk"
+Remove matching entries from the state table.
+This ioctl returns the number of killed states in
+.Va psk_af .
 .Bd -literal
 struct pfioc_state_kill {
-       int                     psk_af;
+       sa_family_t             psk_af;
        int                     psk_proto;
        struct pf_rule_addr     psk_src;
        struct pf_rule_addr     psk_dst;
        char                    psk_ifname[IFNAMSIZ];
 };
 .Ed
-.It Dv DIOCCLRSTATES  Fa "struct pfioc_state_kill"
-Clears all states.
+.It Dv DIOCCLRSTATES Fa "struct pfioc_state_kill *psk"
+Clear all states.
 It works like
 .Dv DIOCKILLSTATES ,
-but ignores the psk_af, psk_proto, psk_src and psk_dst fields of the
-.Fa pfioc_state_kill
+but ignores the
+.Va psk_af ,
+.Va psk_proto ,
+.Va psk_src ,
+and
+.Va psk_dst
+fields of the
+.Vt pfioc_state_kill
 structure.
-.It Dv DIOCSETSTATUSIF Fa "struct pfioc_if"
+.It Dv DIOCSETSTATUSIF Fa "struct pfioc_if *pi"
+Specify the interface for which statistics are accumulated.
 .Bd -literal
 struct pfioc_if {
        char             ifname[IFNAMSIZ];
 };
 .Ed
-.Pp
-Specifies the interface for which statistics are accumulated.
-.It Dv DIOCGETSTATUS   Fa "struct pf_status"
+.It Dv DIOCGETSTATUS Fa "struct pf_status *s"
+Get the internal packet filter statistics.
 .Bd -literal
 struct pf_status {
        u_int64_t       counters[PFRES_MAX];
+       u_int64_t       lcounters[LCNT_MAX];
        u_int64_t       fcounters[FCNT_MAX];
        u_int64_t       scounters[SCNT_MAX];
        u_int64_t       pcounters[2][2][3];
@@ -293,14 +349,13 @@ struct pf_status {
        u_int32_t       since;
        u_int32_t       debug;
        char            ifname[IFNAMSIZ];
+       u_int8_t        pf_chksum[MD5_DIGEST_LENGTH];
 };
 .Ed
-.Pp
-Gets the internal packet filter statistics.
 .It Dv DIOCCLRSTATUS
-Clears the internal packet filter statistics.
-.It Dv DIOCNATLOOK     Fa "struct pfioc_natlook"
-Looks up a state table entry by source and destination addresses and ports.
+Clear the internal packet filter statistics.
+.It Dv DIOCNATLOOK Fa "struct pfioc_natlook *pnl"
+Look up a state table entry by source and destination addresses and ports.
 .Bd -literal
 struct pfioc_natlook {
        struct pf_addr   saddr;
@@ -316,99 +371,145 @@ struct pfioc_natlook {
        u_int8_t         direction;
 };
 .Ed
-.It Dv DIOCSETDEBUG    Fa "u_int32_t"
-Sets the debug level.
+.It Dv DIOCSETDEBUG Fa "u_int32_t *level"
+Set the debug level.
 .Bd -literal
-enum   { PF_DEBUG_NONE=0, PF_DEBUG_URGENT=1, PF_DEBUG_MISC=2 };
+enum   { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC,
+         PF_DEBUG_NOISY };
 .Ed
-.It Dv DIOCGETSTATES   Fa "struct pfioc_states"
+.It Dv DIOCGETSTATES Fa "struct pfioc_states *ps"
+Get state table entries.
 .Bd -literal
 struct pfioc_states {
        int     ps_len;
        union {
-               caddr_t psu_buf;
+               caddr_t          psu_buf;
                struct pf_state *psu_states;
        } ps_u;
 #define ps_buf         ps_u.psu_buf
 #define ps_states      ps_u.psu_states
 };
 .Ed
-.It Dv DIOCCHANGERULE  Fa "struct pfioc_rule"
-Adds or removes the
+.Pp
+If
+.Va ps_len
+is non-zero on entry, as many states as possible that can fit into this
+size will be copied into the supplied buffer
+.Va ps_states .
+On exit,
+.Va ps_len
+is always set to the total size required to hold all state table entries
+(i.e., it is set to
+.Li sizeof(struct pf_state) * nr ) .
+.It Dv DIOCCHANGERULE Fa "struct pfioc_rule *pcr"
+Add or remove the
 .Va rule
 in the ruleset specified by
 .Va rule.action .
-.Bd -literal
-enum   { PF_CHANGE_ADD_HEAD=1, PF_CHANGE_ADD_TAIL=2,
-         PF_CHANGE_ADD_BEFORE=3, PF_CHANGE_ADD_AFTER=4,
-         PF_CHANGE_REMOVE=5, PF_CHANGE_GET_TICKET=6 };
-.Ed
 .Pp
 The type of operation to be performed is indicated by
-.Va action .
+.Va action ,
+which can be any of the following:
+.Bd -literal
+enum   { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL,
+         PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER,
+         PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET };
+.Ed
 .Pp
 .Va ticket
-must be set to the value obtained with PF_CHANGE_GET_TICKET
-for all actions except PF_CHANGE_GET_TICKET.
+must be set to the value obtained with
+.Dv PF_CHANGE_GET_TICKET
+for all actions except
+.Dv PF_CHANGE_GET_TICKET .
 .Va pool_ticket
 must be set to the value obtained with the
 .Dv DIOCBEGINADDRS
-call for all actions except PF_CHANGE_REMOVE and PF_CHANGE_GET_TICKET.
-.Pp
-.Va anchor
+call for all actions except
+.Dv PF_CHANGE_REMOVE
 and
-.Va ruleset
-indicate which anchor and ruleset the operation applies to.
+.Dv PF_CHANGE_GET_TICKET .
+.Va anchor
+indicates to which anchor the operation applies.
 .Va nr
-indicates the rule number against which PF_CHANGE_ADD_BEFORE,
-PF_CHANGE_ADD_AFTER or PF_CHANGE_REMOVE actions are applied.
-.It Dv DIOCCHANGEADDR  Fa "struct pfioc_pooladdr"
-Adds or removes a pool address
+indicates the rule number against which
+.Dv PF_CHANGE_ADD_BEFORE ,
+.Dv PF_CHANGE_ADD_AFTER ,
+or
+.Dv PF_CHANGE_REMOVE
+actions are applied.
+.\" It Dv DIOCCHANGEALTQ Fa "struct pfioc_altq *pcr"
+.It Dv DIOCCHANGEADDR Fa "struct pfioc_pooladdr *pca"
+Add or remove the pool address
 .Va addr
-from a rule specified with
+from the rule specified by
 .Va r_action ,
 .Va r_num ,
-.Va anchor
 and
-.Va ruleset .
-.It Dv DIOCSETTIMEOUT  Fa "struct pfioc_tm"
+.Va anchor .
+.It Dv DIOCSETTIMEOUT Fa "struct pfioc_tm *pt"
 .Bd -literal
 struct pfioc_tm {
        int              timeout;
        int              seconds;
 };
 .Ed
-.It Dv DIOCGETTIMEOUT  Fa "struct pfioc_tm"
+.Pp
+Set the state timeout of
+.Va timeout
+to
+.Va seconds .
+The old value will be placed into
+.Va seconds .
+For possible values of
+.Va timeout ,
+consult the
+.Dv PFTM_*
+values in
+.Aq Pa net/pfvar.h .
+.It Dv DIOCGETTIMEOUT Fa "struct pfioc_tm *pt"
+Get the state timeout of
+.Va timeout .
+The value will be placed into the
+.Va seconds
+field.
 .It Dv DIOCCLRRULECTRS
 Clear per-rule statistics.
-.It Dv DIOCSETLIMIT   Fa "struct pfioc_limit"
-Sets hard limits on the memory pools used by the packet filter.
+.It Dv DIOCSETLIMIT Fa "struct pfioc_limit *pl"
+Set the hard limits on the memory pools used by the packet filter.
 .Bd -literal
 struct pfioc_limit {
        int             index;
        unsigned        limit;
 };
+
+enum   { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS,
+         PF_LIMIT_TABLES, PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX };
 .Ed
-.It Dv DIOCGETLIMIT   Fa "struct pfioc_limit"
-.It Dv DIOCRCLRTABLES Fa "struct pfioc_table"
+.It Dv DIOCGETLIMIT Fa "struct pfioc_limit *pl"
+Get the hard
+.Va limit
+for the memory pool indicated by
+.Va index .
+.It Dv DIOCRCLRTABLES Fa "struct pfioc_table *io"
 Clear all tables.
-All the IOCTLs that manipulate radix tables
+All the ioctls that manipulate radix tables
 use the same structure described below.
 For
-.Dv DIOCRCLRTABLES, pfrio_ndel contains on exit the number
-of tables deleted.
+.Dv DIOCRCLRTABLES ,
+.Va pfrio_ndel
+contains on exit the number of tables deleted.
 .Bd -literal
 struct pfioc_table {
-        struct pfr_table         pfrio_table;
-        void                    *pfrio_buffer;
-        int                      pfrio_esize;
-        int                      pfrio_size;
-        int                      pfrio_size2;
-        int                      pfrio_nadd;
-        int                      pfrio_ndel;
-        int                      pfrio_nchange;
-        int                      pfrio_flags;
-        int                      pfrio_ticket;
+       struct pfr_table         pfrio_table;
+       void                    *pfrio_buffer;
+       int                      pfrio_esize;
+       int                      pfrio_size;
+       int                      pfrio_size2;
+       int                      pfrio_nadd;
+       int                      pfrio_ndel;
+       int                      pfrio_nchange;
+       int                      pfrio_flags;
+       u_int32_t                pfrio_ticket;
 };
 #define pfrio_exists    pfrio_nadd
 #define pfrio_nzero     pfrio_nadd
@@ -417,192 +518,359 @@ struct pfioc_table {
 #define pfrio_setflag   pfrio_size2
 #define pfrio_clrflag   pfrio_nadd
 .Ed
-.It Dv DIOCRADDTABLES Fa "struct pfioc_table"
-Creates one or more tables.
-On entry, pfrio_buffer[pfrio_size] contains a table of pfr_table structures.
-On exit, pfrio_nadd contains the number of tables effectively created.
+.It Dv DIOCRADDTABLES Fa "struct pfioc_table *io"
+Create one or more tables.
+On entry,
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_table
+containing at least
+.Vt pfrio_size
+elements.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_table .
+On exit,
+.Va pfrio_nadd
+contains the number of tables effectively created.
 .Bd -literal
 struct pfr_table {
-        char                     pfrt_anchor[PF_ANCHOR_NAME_SIZE];
-        char                     pfrt_ruleset[PF_RULESET_NAME_SIZE];
-        char                     pfrt_name[PF_TABLE_NAME_SIZE];
-        u_int32_t                pfrt_flags;
-        u_int8_t                 pfrt_fback;
+       char            pfrt_anchor[MAXPATHLEN];
+       char            pfrt_name[PF_TABLE_NAME_SIZE];
+       u_int32_t       pfrt_flags;
+       u_int8_t        pfrt_fback;
 };
 .Ed
-.It Dv DIOCRDELTABLES Fa "struct pfioc_table"
-Deletes one or more tables.
-On entry, pfrio_buffer[pfrio_size] contains a table of pfr_table structures.
-On exit, pfrio_nadd contains the number of tables effectively deleted.
-.It Dv DIOCRGETTABLES Fa "struct pfioc_table"
+.It Dv DIOCRDELTABLES Fa "struct pfioc_table *io"
+Delete one or more tables.
+On entry,
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_table
+containing at least
+.Vt pfrio_size
+elements.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_table .
+On exit,
+.Va pfrio_ndel
+contains the number of tables effectively deleted.
+.It Dv DIOCRGETTABLES Fa "struct pfioc_table *io"
 Get the list of all tables.
-On entry, pfrio_buffer[pfrio_size] contains a valid writeable buffer for
-pfr_table structures.
-On exit, pfrio_size contains the number of tables written into the buffer.
+On entry,
+.Va pfrio_buffer[pfrio_size]
+contains a valid writeable buffer for
+.Vt pfr_table
+structures.
+On exit,
+.Va pfrio_size
+contains the number of tables written into the buffer.
 If the buffer is too small, the kernel does not store anything but just
 returns the required buffer size, without error.
-.It Dv DIOCRGETTSTATS Fa "struct pfioc_table"
-Like
-.Dv DIOCRGETTABLES ,
-but returns an array of pfr_tstats structures.
+.It Dv DIOCRGETTSTATS Fa "struct pfioc_table *io"
+This call is like
+.Dv DIOCRGETTABLES
+but is used to get an array of
+.Vt pfr_tstats
+structures.
 .Bd -literal
 struct pfr_tstats {
-        struct pfr_table pfrts_t;
-        u_int64_t        pfrts_packets
-                             [PFR_DIR_MAX][PFR_OP_TABLE_MAX];
-        u_int64_t        pfrts_bytes
-                             [PFR_DIR_MAX][PFR_OP_TABLE_MAX];
-        u_int64_t        pfrts_match;
-        u_int64_t        pfrts_nomatch;
-        long             pfrts_tzero;
-        int              pfrts_cnt;
-        int              pfrts_refcnt[PFR_REFCNT_MAX];
+       struct pfr_table pfrts_t;
+       u_int64_t        pfrts_packets
+                            [PFR_DIR_MAX][PFR_OP_TABLE_MAX];
+       u_int64_t        pfrts_bytes
+                            [PFR_DIR_MAX][PFR_OP_TABLE_MAX];
+       u_int64_t        pfrts_match;
+       u_int64_t        pfrts_nomatch;
+       long             pfrts_tzero;
+       int              pfrts_cnt;
+       int              pfrts_refcnt[PFR_REFCNT_MAX];
 };
-#define pfrts_name      pfrts_t.pfrt_name
-#define pfrts_flags     pfrts_t.pfrt_flags
+#define pfrts_name      pfrts_t.pfrt_name
+#define pfrts_flags     pfrts_t.pfrt_flags
 .Ed
-.It Dv DIOCRCLRTSTATS Fa "struct pfioc_table"
-Clears the statistics of one or more tables.
-On entry, pfrio_buffer[pfrio_size] contains a table of pfr_table structures.
-On exit, pfrio_nzero contains the number of tables effectively cleared.
-.It Dv DIOCRCLRADDRS Fa "struct pfioc_table"
+.It Dv DIOCRCLRTSTATS Fa "struct pfioc_table *io"
+Clear the statistics of one or more tables.
+On entry,
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_table
+containing at least
+.Vt pfrio_size
+elements.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_table .
+On exit,
+.Va pfrio_nzero
+contains the number of tables effectively cleared.
+.It Dv DIOCRCLRADDRS Fa "struct pfioc_table *io"
 Clear all addresses in a table.
-On entry, pfrio_table contains the table to clear.
-On exit, pfrio_ndel contains the number of addresses removed.
-.It Dv DIOCRADDADDRS Fa "struct pfioc_table"
+On entry,
+.Va pfrio_table
+contains the table to clear.
+On exit,
+.Va pfrio_ndel
+contains the number of addresses removed.
+.It Dv DIOCRADDADDRS Fa "struct pfioc_table *io"
 Add one or more addresses to a table.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains the list of pfr_addr structures to add.
-On exit, pfrio_nadd contains the number of addresses effectively added.
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_addr
+containing at least
+.Vt pfrio_size
+elements to add to the table.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_addr .
+On exit,
+.Va pfrio_nadd
+contains the number of addresses effectively added.
 .Bd -literal
 struct pfr_addr {
-        union {
-                struct in_addr   _pfra_ip4addr;
-                struct in6_addr  _pfra_ip6addr;
-        }                pfra_u;
-        u_int8_t         pfra_af;
-        u_int8_t         pfra_net;
-        u_int8_t         pfra_not;
-        u_int8_t         pfra_fback;
+       union {
+               struct in_addr   _pfra_ip4addr;
+               struct in6_addr  _pfra_ip6addr;
+       }                pfra_u;
+       u_int8_t         pfra_af;
+       u_int8_t         pfra_net;
+       u_int8_t         pfra_not;
+       u_int8_t         pfra_fback;
 };
 #define pfra_ip4addr    pfra_u._pfra_ip4addr
 #define pfra_ip6addr    pfra_u._pfra_ip6addr
 .Ed
-.It Dv DIOCRDELADDRS Fa "struct pfioc_table"
+.It Dv DIOCRDELADDRS Fa "struct pfioc_table *io"
 Delete one or more addresses from a table.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains the list of pfr_addr structures to delete.
-On exit, pfrio_ndel contains the number of addresses effectively deleted.
-.It Dv DIOCRSETADDRS Fa "struct pfioc_table"
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_addr
+containing at least
+.Vt pfrio_size
+elements to delete from the table.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_addr .
+On exit,
+.Va pfrio_ndel
+contains the number of addresses effectively deleted.
+.It Dv DIOCRSETADDRS Fa "struct pfioc_table *io"
 Replace the content of a table by a new address list.
 This is the most complicated command, which uses all the structure members.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains the new list of pfr_addr structures.
-In addition to that, if size2 is nonzero, pfrio_buffer[pfrio_size..pfrio_size2]
-must be a writeable buffer, into which the kernel can copy the addresses that
-have been deleted during the replace operation.
-On exit, pfrio_ndel, pfrio_nadd and pfrio_nchange contain the number of
-addresses deleted, added and changed by the kernel.
-If pfrio_size2 was set on
-entry, pfrio_size2 will point to the size of the buffer used, exactly like
+.Pp
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_addr
+containing at least
+.Vt pfrio_size
+elements which become the new contents of the table.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_addr .
+Additionally, if
+.Va pfrio_size2
+is non-zero,
+.Va pfrio_buffer[pfrio_size..pfrio_size2]
+must be a writeable buffer, into which the kernel can copy the
+addresses that have been deleted during the replace operation.
+On exit,
+.Va pfrio_ndel ,
+.Va pfrio_nadd ,
+and
+.Va pfrio_nchange
+contain the number of addresses deleted, added, and changed by the
+kernel.
+If
+.Va pfrio_size2
+was set on entry,
+.Va pfrio_size2
+will point to the size of the buffer used, exactly like
 .Dv DIOCRGETADDRS .
-.It Dv DIOCRGETADDRS Fa "struct pfioc_table"
+.It Dv DIOCRGETADDRS Fa "struct pfioc_table *io"
 Get all the addresses of a table.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains a valid writeable buffer for pfr_addr structures.
-On exit, pfrio_size contains the number of addresses written into the buffer.
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer[pfrio_size]
+contains a valid writeable buffer for
+.Vt pfr_addr
+structures.
+On exit,
+.Va pfrio_size
+contains the number of addresses written into the buffer.
 If the buffer was too small, the kernel does not store anything but just
-return the required buffer size, without returning an error.
-.It Dv DIOCRGETASTATS Fa "struct pfioc_table"
-Like
-.Dv DIOCRGETADDRS ,
-but returns an array of pfr_astats structures.
+returns the required buffer size, without returning an error.
+.It Dv DIOCRGETASTATS Fa "struct pfioc_table *io"
+This call is like
+.Dv DIOCRGETADDRS
+but is used to get an array of
+.Vt pfr_astats
+structures.
 .Bd -literal
 struct pfr_astats {
-        struct pfr_addr  pfras_a;
-        u_int64_t        pfras_packets
-                             [PFR_DIR_MAX][PFR_OP_ADDR_MAX];
-        u_int64_t        pfras_bytes
-                             [PFR_DIR_MAX][PFR_OP_ADDR_MAX];
-        long             pfras_tzero;
+       struct pfr_addr  pfras_a;
+       u_int64_t        pfras_packets
+                            [PFR_DIR_MAX][PFR_OP_ADDR_MAX];
+       u_int64_t        pfras_bytes
+                            [PFR_DIR_MAX][PFR_OP_ADDR_MAX];
+       long             pfras_tzero;
 };
 .Ed
-.It Dv DIOCRCLRASTATS Fa "struct pfioc_table"
-Clears the statistics of one or more addresses.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains a table of pfr_addr structures to clear.
-On exit, pfrio_nzero contains the number of addresses effectively cleared.
-.It Dv DIOCRTSTADDRS Fa "struct pfioc_table"
+.It Dv DIOCRCLRASTATS Fa "struct pfioc_table *io"
+Clear the statistics of one or more addresses.
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_addr
+containing at least
+.Vt pfrio_size
+elements to be cleared from the table.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_addr .
+On exit,
+.Va pfrio_nzero
+contains the number of addresses effectively cleared.
+.It Dv DIOCRTSTADDRS Fa "struct pfioc_table *io"
 Test if the given addresses match a table.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains a table of pfr_addr structures to test.
-On exit, the kernel updates the pfr_addr table by setting the pfra_fback
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_addr
+containing at least
+.Vt pfrio_size
+elements, each of which will be tested for a match in the table.
+.Vt pfrio_esize
+must be the size of
+.Vt struct pfr_addr .
+On exit, the kernel updates the
+.Vt pfr_addr
+array by setting the
+.Va pfra_fback
 member appropriately.
-.It Dv DIOCRSETTFLAGS Fa "struct pfioc_table"
+.It Dv DIOCRSETTFLAGS Fa "struct pfioc_table *io"
 Change the
-.Va const
+.Dv PFR_TFLAG_CONST
 or
-.Va persist
-flag of a table.
-On entry, pfrio_buffer[pfrio_size] contains a table of pfr_table structures,
-and pfrio_setflag contains the flags to add, while pfrio_clrflag contains the
-flags to remove.
-On exit, pfrio_nchange and pfrio_ndel contain the number of tables altered
-or deleted by the kernel.
+.Dv PFR_TFLAG_PERSIST
+flags of a table.
+On entry,
+.Va pfrio_buffer
+must point to an array of
+.Vt struct pfr_table
+containing at least
+.Vt pfrio_size
+elements.
+.Va pfrio_esize
+must be the size of
+.Vt struct pfr_table .
+.Va pfrio_setflag
+must contain the flags to add, while
+.Va pfrio_clrflag
+must contain the flags to remove.
+On exit,
+.Va pfrio_nchange
+and
+.Va pfrio_ndel
+contain the number of tables altered or deleted by the kernel.
 Yes, tables can be deleted if one removes the
-.Va persist
+.Dv PFR_TFLAG_PERSIST
 flag of an unreferenced table.
-.It Dv DIOCRINADEFINE Fa "struct pfioc_table"
+.It Dv DIOCRINADEFINE Fa "struct pfioc_table *io"
 Defines a table in the inactive set.
-On entry, pfrio_table contains the table id and pfrio_buffer[pfrio_size]
-contains the list of pfr_addr structures to put in the table.
-A valid ticket must also be supplied to pfrio_ticket.
-On exit, pfrio_nadd contains 0 if the table was already defined in the
-inactive list, or 1 if a new table has been created.
-pfrio_naddr contains the number of addresses effectively put in the table.
-.It Dv DIOCXBEGIN Fa "struct pfioc_trans"
+On entry,
+.Va pfrio_table
+contains the table ID and
+.Va pfrio_buffer[pfrio_size]
+contains an array of
+.Vt pfr_addr
+structures to put in the table.
+A valid ticket must also be supplied to
+.Va pfrio_ticket .
+On exit,
+.Va pfrio_nadd
+contains 0 if the table was already defined in the inactive list
+or 1 if a new table has been created.
+.Va pfrio_naddr
+contains the number of addresses effectively put in the table.
+.It Dv DIOCXBEGIN Fa "struct pfioc_trans *io"
 .Bd -literal
-#define PF_RULESET_ALTQ         (PF_RULESET_MAX)
-#define PF_RULESET_TABLE        (PF_RULESET_MAX+1)
 struct pfioc_trans {
-        int              size;  /* number of elements */
-        int              esize; /* size of each element in bytes */
-        struct pfioc_trans_e {
-                int             rs_num;
-                char            anchor[PF_ANCHOR_NAME_SIZE];
-                char            ruleset[PF_RULESET_NAME_SIZE];
-                u_int32_t       ticket;
-        }               *array;
+       int              size;  /* number of elements */
+       int              esize; /* size of each element in bytes */
+       struct pfioc_trans_e {
+               int             rs_num;
+               char            anchor[MAXPATHLEN];
+               u_int32_t       ticket;
+       }               *array;
 };
 .Ed
 .Pp
-Clears all the inactive rulesets specified in the
-.Fa "struct pfioc_trans_e"
+Clear all the inactive rulesets specified in the
+.Vt pfioc_trans_e
 array.
-For each ruleset, a ticket is returned for subsequent "add rule" IOCTLs,
+For each ruleset, a ticket is returned for subsequent "add rule" ioctls,
 as well as for the
 .Dv DIOCXCOMMIT
 and
 .Dv DIOCXROLLBACK
 calls.
-.It Dv DIOCXCOMMIT Fa "struct pfioc_trans"
+.Pp
+Ruleset types, identified by
+.Va rs_num ,
+include the following:
+.Pp
+.Bl -tag -width PF_RULESET_FILTER -offset ind -compact
+.It Dv PF_RULESET_SCRUB
+Scrub (packet normalization) rules.
+.It Dv PF_RULESET_FILTER
+Filter rules.
+.It Dv PF_RULESET_NAT
+NAT (Network Address Translation) rules.
+.It Dv PF_RULESET_BINAT
+Bidirectional NAT rules.
+.It Dv PF_RULESET_RDR
+Redirect rules.
+.It Dv PF_RULESET_ALTQ
+ALTQ disciplines.
+.It Dv PF_RULESET_TABLE
+Address tables.
+.El
+.It Dv DIOCXCOMMIT Fa "struct pfioc_trans *io"
 Atomically switch a vector of inactive rulesets to the active rulesets.
-Implemented as a standard 2-phase commit, which will either fail for all
-rulesets or completely succeed.
+This call is implemented as a standard two-phase commit, which will either
+fail for all rulesets or completely succeed.
 All tickets need to be valid.
-Returns
+This ioctl returns
 .Er EBUSY
-if a concurrent process is trying to update some of the same rulesets
-concurrently.
-.It Dv DIOCXROLLBACK Fa "struct pfioc_trans"
+if another process is concurrently updating some of the same rulesets.
+.It Dv DIOCXROLLBACK Fa "struct pfioc_trans *io"
 Clean up the kernel by undoing all changes that have taken place on the
 inactive rulesets since the last
 .Dv DIOCXBEGIN .
 .Dv DIOCXROLLBACK
 will silently ignore rulesets for which the ticket is invalid.
-.It Dv DIOCFPFLUSH
+.It Dv DIOCSETHOSTID Fa "u_int32_t *hostid"
+Set the host ID, which is used by
+.Xr pfsync 4
+to identify which host created state table entries.
+.It Dv DIOCOSFPFLUSH
 Flush the passive OS fingerprint table.
-.It Dv DIOCFPADD Fa "struct pf_osfp_ioctl"
+.It Dv DIOCOSFPADD Fa "struct pf_osfp_ioctl *io"
 .Bd -literal
 struct pf_osfp_ioctl {
        struct pf_osfp_entry {
@@ -612,12 +880,14 @@ struct pf_osfp_ioctl {
                char                    fp_version_nm[PF_OSFP_LEN];
                char                    fp_subtype_nm[PF_OSFP_LEN];
        }                       fp_os;
-       u_int16_t               fp_mss;
+       pf_tcpopts_t            fp_tcpopts;
        u_int16_t               fp_wsize;
        u_int16_t               fp_psize;
-       u_int8_t                fp_ttl;
+       u_int16_t               fp_mss;
+       u_int16_t               fp_flags;
+       u_int8_t                fp_optcnt;
        u_int8_t                fp_wscale;
-       u_int8_t                fp_flags;
+       u_int8_t                fp_ttl;
        int                     fp_getnum;
 };
 .Ed
@@ -637,41 +907,42 @@ The members
 .Va fp_wsize ,
 .Va fp_psize ,
 .Va fp_ttl ,
+.Va fp_optcnt ,
 and
 .Va fp_wscale
-are set to the TCP MSS, the TCP window size, the IP length and the IP TTL of
-the TCP SYN packet respectively.
+are set to the TCP MSS, the TCP window size, the IP length, the IP TTL,
+the number of TCP options, and the TCP window scaling constant of the
+TCP SYN packet, respectively.
+.Pp
 The
 .Va fp_flags
 member is filled according to the
-.In net/pf/pfvar.h
-include file PF_OSFP_* defines.
+.Aq Pa net/pf/pfvar.h
+include file
+.Dv PF_OSFP_*
+defines.
+The
+.Va fp_tcpopts
+member contains packed TCP options.
+Each option uses
+.Dv PF_OSFP_TCPOPT_BITS
+bits in the packed value.
+Options include any of
+.Dv PF_OSFP_TCPOPT_NOP ,
+.Dv PF_OSFP_TCPOPT_SACK ,
+.Dv PF_OSFP_TCPOPT_WSCALE ,
+.Dv PF_OSFP_TCPOPT_MSS ,
+or
+.Dv PF_OSFP_TCPOPT_TS .
+.Pp
 The
 .Va fp_getnum
-is not used with this ioctl.
+member is not used with this ioctl.
 .Pp
-The structure's slack space must be zeroed for correct operation; memset
+The structure's slack space must be zeroed for correct operation;
+.Xr memset 3
 the whole structure to zero before filling and sending to the kernel.
-.It Dv DIOCFPGET Fa "struct pf_osfp_ioctl"
-.Bd -literal
-struct pf_osfp_ioctl {
-       struct pf_osfp_entry {
-               SLIST_ENTRY(pf_osfp_entry) fp_entry;
-               pf_osfp_t               fp_os;
-               char                    fp_class_nm[PF_OSFP_LEN];
-               char                    fp_version_nm[PF_OSFP_LEN];
-               char                    fp_subtype_nm[PF_OSFP_LEN];
-       }                       fp_os;
-       u_int16_t               fp_mss;
-       u_int16_t               fp_wsize;
-       u_int16_t               fp_psize;
-       u_int8_t                fp_ttl;
-       u_int8_t                fp_wscale;
-       u_int8_t                fp_flags;
-       int                     fp_getnum;
-};
-.Ed
-.Pp
+.It Dv DIOCOSFPGET Fa "struct pf_osfp_ioctl *io"
 Get the passive OS fingerprint number
 .Va fp_getnum
 from the kernel's fingerprint list.
@@ -680,24 +951,21 @@ Get the whole list by repeatedly incrementing the
 .Va fp_getnum
 number until the ioctl returns
 .Er EBUSY .
-.It Dv DIOCGETSRCNODES Fa "struct pfioc_src_nodes"
+.It Dv DIOCGETSRCNODES Fa "struct pfioc_src_nodes *psn"
 .Bd -literal
 struct pfioc_src_nodes {
-        int     psn_len;
-        union {
-                caddr_t          psu_buf;
-                struct pf_src_node      *psu_src_nodes;
-        } psn_u;
-#define psn_buf         psn_u.psu_buf
-#define psn_src_nodes   psn_u.psu_src_nodes
+       int     psn_len;
+       union {
+               caddr_t         psu_buf;
+               struct pf_src_node      *psu_src_nodes;
+       } psn_u;
+#define psn_buf                psn_u.psu_buf
+#define psn_src_nodes  psn_u.psu_src_nodes
 };
 .Ed
 .Pp
-Get the list of source nodes kept by the
-.Ar sticky-address
-and
-.Ar source-track
-options.
+Get the list of source nodes kept by sticky addresses and source
+tracking.
 The ioctl must be called once with
 .Va psn_len
 set to 0.
@@ -711,29 +979,25 @@ placed in
 .Va psn_buf .
 The ioctl must then be called again to fill this buffer with the actual
 source node data.
-After the ioctl call
+After that call,
 .Va psn_len
 will be set to the length of the buffer actually used.
-.It Dv DIOCCLRSRCNODES Fa "struct pfioc_table"
+.It Dv DIOCCLRSRCNODES
 Clear the tree of source tracking nodes.
-.It Dv DIOCIGETIFACES Fa "struct pfioc_iface"
-Gets the list of interfaces and interface drivers known to
+.It Dv DIOCIGETIFACES Fa "struct pfioc_iface *io"
+Get the list of interfaces and interface drivers known to
 .Nm .
-All the IOCTLs that manipulate interfaces
+All the ioctls that manipulate interfaces
 use the same structure described below:
 .Bd -literal
 struct pfioc_iface {
-        char                     pfiio_name[IFNAMSIZ];
-        void                    *pfiio_buffer;
-        int                      pfiio_esize;
-        int                      pfiio_size;
-        int                      pfiio_nzero;
-        int                      pfiio_flags;
+       char                     pfiio_name[IFNAMSIZ];
+       void                    *pfiio_buffer;
+       int                      pfiio_esize;
+       int                      pfiio_size;
+       int                      pfiio_nzero;
+       int                      pfiio_flags;
 };
-
-#define PFI_FLAG_GROUP     0x0001  /* gets groups of interfaces */
-#define PFI_FLAG_INSTANCE  0x0002  /* gets single interfaces */
-#define PFI_FLAG_ALLMASK   0x0003
 .Ed
 .Pp
 If not empty,
@@ -743,57 +1007,60 @@ can be used to restrict the search to a specific interface or driver.
 is the user-supplied buffer for returning the data.
 On entry,
 .Va pfiio_size
-represents the number of
-.Va pfi_if
+contains the number of
+.Vt pfi_kif
 entries that can fit into the buffer.
 The kernel will replace this value by the real number of entries it wants
 to return.
 .Va pfiio_esize
-should be set to sizeof(struct pfi_if).
-.Va pfiio_flags
 should be set to
-.Dv PFI_FLAG_GROUP , PFI_FLAG_INSTANCE ,
-or both to tell the kernel to return a group of interfaces
-(drivers, like "fxp"), real interface instances (like "fxp1") or both.
+.Li sizeof(struct pfi_kif) .
+.Pp
 The data is returned in the
-.Va pfi_if
+.Vt pfi_kif
 structure described below:
 .Bd -literal
-struct pfi_if {
-        char                             pfif_name[IFNAMSIZ];
-        u_int64_t                        pfif_packets[2][2][2];
-        u_int64_t                        pfif_bytes[2][2][2];
-        u_int64_t                        pfif_addcnt;
-        u_int64_t                        pfif_delcnt;
-        long                             pfif_tzero;
-        int                              pfif_states;
-        int                              pfif_rules;
-        int                              pfif_flags;
+struct pfi_kif {
+       RB_ENTRY(pfi_kif)                pfik_tree;
+       char                             pfik_name[IFNAMSIZ];
+       u_int64_t                        pfik_packets[2][2][2];
+       u_int64_t                        pfik_bytes[2][2][2];
+       u_int32_t                        pfik_tzero;
+       int                              pfik_flags;
+       struct pf_state_tree_lan_ext     pfik_lan_ext;
+       struct pf_state_tree_ext_gwy     pfik_ext_gwy;
+       TAILQ_ENTRY(pfi_kif)             pfik_w_states;
+       void                            *pfik_ah_cookie;
+       struct ifnet                    *pfik_ifp;
+       struct ifg_group                *pfik_group;
+       int                              pfik_states;
+       int                              pfik_rules;
+       TAILQ_HEAD(, pfi_dynaddr)        pfik_dynaddrs;
 };
-
-#define PFI_IFLAG_GROUP         0x0001  /* group of interfaces */
-#define PFI_IFLAG_INSTANCE      0x0002  /* single instance */
-#define PFI_IFLAG_CLONABLE      0x0010  /* clonable group */
-#define PFI_IFLAG_DYNAMIC       0x0020  /* dynamic group */
-#define PFI_IFLAG_ATTACHED      0x0040  /* interface attached */
-#define PFI_IFLAG_REFERENCED    0x0080  /* referenced by rules */
 .Ed
-.It Dv DIOCICLRISTATS Fa "struct pfioc_iface"
-Clear the statistics counters of one or more interfaces.
-.Va pfiio_name
-and
-.Va pfrio_flags
-can be used to select which interfaces need to be cleared.
+.It Dv DIOCSETIFFLAG Fa "struct pfioc_iface *io"
+Set the user-settable flags (listed below) of the
+.Nm
+internal interface description.
 The filtering process is the same as for
 .Dv DIOCIGETIFACES .
-.Va pfiio_nzero
-will be set by the kernel to the number of interfaces and drivers
-that have been cleared.
+.Bd -literal
+#define PFI_IFLAG_SKIP         0x0100  /* skip filtering on interface */
+.Ed
+.It Dv DIOCCLRIFFLAG Fa "struct pfioc_iface *io"
+Works as
+.Dv DIOCSETIFFLAG
+above but clears the flags.
+.El
+.Sh FILES
+.Bl -tag -width /dev/pf -compact
+.It Pa /dev/pf
+packet filtering device.
 .El
 .Sh EXAMPLES
 The following example demonstrates how to use the
 .Dv DIOCNATLOOK
-command to find the internal host/port of a NATed connection.
+command to find the internal host/port of a NATed connection:
 .Bd -literal
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -862,7 +1129,8 @@ main(int argc, char *argv[])
 .Xr bridge 4 ,
 .Xr pflog 4 ,
 .Xr pfsync 4 ,
-.Xr pfctl 8
+.Xr pfctl 8 ,
+.Xr altq 9
 .Sh HISTORY
 The
 .Nm
index 5268c4a..1895768 100644 (file)
@@ -1,5 +1,4 @@
-.\"    $OpenBSD: pflog.4,v 1.7 2004/03/21 19:47:59 miod Exp $
-.\"    $DragonFly: src/share/man/man4/pflog.4,v 1.4 2007/07/29 17:27:45 swildner Exp $
+.\"    $OpenBSD: pflog.4,v 1.9 2006/10/25 12:51:31 jmc Exp $
 .\"
 .\" Copyright (c) 2001 Tobias Weingartner
 .\" All rights reserved.
@@ -46,6 +45,14 @@ on the
 interface, or stored to disk using
 .Xr pflogd 8 .
 .Pp
+The pflog0 interface is created automatically at boot if both
+.Xr pf 4
+and
+.Xr pflogd 8
+are enabled;
+further instances can be created using
+.Xr ifconfig 8 .
+.Pp
 Each packet retrieved on this interface has a header associated
 with it of length
 .Dv PFLOG_HDRLEN .
@@ -64,23 +71,31 @@ struct pfloghdr {
        char            ruleset[PF_RULESET_NAME_SIZE];
        u_int32_t       rulenr;
        u_int32_t       subrulenr;
+       uid_t           uid;
+       pid_t           pid;
+       uid_t           rule_uid;
+       pid_t           rule_pid;
        u_int8_t        dir;
        u_int8_t        pad[3];
 };
 .Ed
 .Sh EXAMPLES
+Create a
+.Nm
+interface
+and monitor all packets logged on it:
 .Bd -literal -offset indent
 # ifconfig pflog0 up
 # tcpdump -n -e -ttt -i pflog0
 .Ed
 .Sh SEE ALSO
-.Xr tcpdump 1 ,
 .Xr inet 4 ,
 .Xr inet6 4 ,
 .Xr netintro 4 ,
 .Xr pf 4 ,
 .Xr ifconfig 8 ,
-.Xr pflogd 8
+.Xr pflogd 8 ,
+.Xr tcpdump 1
 .Sh HISTORY
 The
 .Nm
index 127249b..358ba74 100644 (file)
@@ -1,7 +1,7 @@
-.\"    $OpenBSD: pfsync.4,v 1.14 2004/03/21 19:47:59 miod Exp $
-.\"    $DragonFly: src/share/man/man4/pfsync.4,v 1.6 2007/11/03 18:37:42 swildner Exp $
+.\"    $OpenBSD: pfsync.4,v 1.24 2006/10/23 07:05:49 jmc Exp $
 .\"
 .\" Copyright (c) 2002 Michael Shalayeff
+.\" Copyright (c) 2003-2004 Ryan McBride
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd April 9, 2007
+.Dd August 5, 2010
 .Dt PFSYNC 4
 .Os
 .Sh NAME
 .Nm pfsync
-.Nd packet filter states table logging interface
+.Nd packet filter state table logging interface
 .Sh SYNOPSIS
 .Cd "device pfsync"
 .Sh DESCRIPTION
@@ -69,20 +69,20 @@ state into one message where possible.
 The maximum number of times this can be done before the update is sent out
 is controlled by the
 .Ar maxupd
-to ifconfig.
+parameter to ifconfig
 (see
 .Xr ifconfig 8
-and the example below for more details)
+and the example below for more details).
 .Pp
 Each packet retrieved on this interface has a header associated
 with it of length
 .Dv PFSYNC_HDRLEN .
 The header indicates the version of the protocol, address family,
-action taken on the following states and the number of state
+action taken on the following states, and the number of state
 table entries attached in this packet.
-This structure, defined in
-.In net/pf/if_pfsync.h
-looks like:
+This structure is defined in
+.Aq Pa net/pf/if_pfsync.h
+as:
 .Bd -literal -offset indent
 struct pfsync_header {
        u_int8_t version;
@@ -96,21 +96,35 @@ States can be synchronised between two or more firewalls using this
 interface, by specifying a synchronisation interface using
 .Xr ifconfig 8 .
 For example, the following command sets fxp0 as the synchronisation
-interface.
+interface:
 .Bd -literal -offset indent
-# ifconfig pfsync0 syncif fxp0
+# ifconfig pfsync0 syncdev fxp0
 .Ed
 .Pp
-State change messages are sent out on the synchronisation
+By default, state change messages are sent out on the synchronisation
 interface using IP multicast packets.
 The protocol is IP protocol 240, PFSYNC, and the multicast group
 used is 224.0.0.240.
+When a peer address is specified using the
+.Ic syncpeer
+keyword, the peer address is used as a destination for the pfsync traffic,
+and the traffic can then be protected using
+.Xr ipsec 4 .
+In such a configuration, the syncdev should be set to the
+.Xr enc 4
+interface, as this is where the traffic arrives when it is decapsulated,
+e.g.:
+.Bd -literal -offset indent
+# ifconfig pfsync0 syncpeer 10.0.0.2 syncdev enc0
+.Ed
 .Pp
-It is important that the synchronisation interface be on a trusted
-network as there is no authentication on the protocol and it would
+It is important that the pfsync traffic be well secured
+as there is no authentication on the protocol and it would
 be trivial to spoof packets which create states, bypassing the pf ruleset.
-Ideally, this is a network dedicated to pfsync messages,
-i.e. a crossover cable between two firewalls.
+Either run the pfsync protocol on a trusted network \- ideally a network
+dedicated to pfsync messages such as a crossover cable between two firewalls,
+or specify a peer address and protect the traffic with
+.Xr ipsec 4 .
 .Pp
 There is a one-to-one correspondence between packets seen by
 .Xr bpf 4
@@ -175,7 +189,7 @@ The following should be added to the top of
 .Pa /etc/pf.conf :
 .Bd -literal -offset indent
 pass quick on { sis2 } proto pfsync
-pass quick on { sis0 sis1 } proto carp keep state
+pass on { sis0 sis1 } proto carp
 .Ed
 .Pp
 If it is preferable that one firewall handle the traffic,
@@ -197,16 +211,18 @@ The following must also be added to
 net.inet.carp.preempt=1
 .Ed
 .Sh SEE ALSO
-.Xr tcpdump 1 ,
 .Xr bpf 4 ,
 .Xr carp 4 ,
 .Xr inet 4 ,
 .Xr inet6 4 ,
+.Xr ipsec 4 ,
 .Xr netintro 4 ,
 .Xr pf 4 ,
 .Xr pf.conf 5 ,
 .Xr protocols 5 ,
-.Xr ifconfig 8
+.Xr ifconfig 8 ,
+.Xr ifstated 8 ,
+.Xr tcpdump 1
 .Sh HISTORY
 The
 .Nm
index 8f6b522..e6a816b 100644 (file)
@@ -1849,7 +1849,6 @@ libkern/rindex.c                  standard
 libkern/scanc.c                                standard
 libkern/skpc.c                         standard
 libkern/strcat.c                       standard
-libkern/strchr.c                       standard
 libkern/strcmp.c                       standard
 libkern/strcasecmp.c                   standard
 libkern/fnmatch.c                      standard
diff --git a/sys/libkern/strchr.c b/sys/libkern/strchr.c
deleted file mode 100644 (file)
index 4e7cf69..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993
- *     The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *     This product includes software developed by the University of
- *     California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
-#include <sys/libkern.h>
-#include <sys/_null.h>
-
-char *
-strchr(const char *str, int c)
-{
-       const char *s;
-
-       for (s = str; (*s) && (*s != (char)c); ++s);
-       return ((*s) ? __DECONST(char *, s) : NULL);
-}
index 2168494..bbe5e75 100644 (file)
@@ -485,6 +485,7 @@ cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
        cbq_state_t     *cbqp = (cbq_state_t *)ifq->altq_disc;
        struct rm_class *cl;
+       struct pf_mtag *pf;
        int len;
 
        /* grab class set by classifier */
@@ -494,8 +495,9 @@ cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
                m_freem(m);
                return (ENOBUFS);
        }
-       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
-               cl = clh_to_clp(cbqp, m->m_pkthdr.altq_qid);
+
+       if ((pf = altq_find_pftag(m)) != NULL)
+               cl = clh_to_clp(cbqp, pf->qid);
        else
                cl = NULL;
        if (cl == NULL) {
index e171aff..4c79581 100644 (file)
@@ -66,8 +66,8 @@
 
 /*
  * FAIRQ - take traffic classified by keep state (hashed into
- * mbuf->m_pkthdr.altq_state_hash) and bucketize it.  Fairly extract
- * the first packet from each bucket in a round-robin fashion.
+ *        pf->state_hash) and bucketize it.  Fairly extract
+ *        the first packet from each bucket in a round-robin fashion.
  *
  * TODO - better overall qlimit support (right now it is per-bucket).
  *     - NOTE: red etc is per bucket, not overall.
@@ -123,7 +123,7 @@ static int  fairq_class_destroy(struct fairq_class *);
 static int     fairq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
 static struct mbuf *fairq_dequeue(struct ifaltq *, struct mbuf *, int);
 
-static int     fairq_addq(struct fairq_class *, struct mbuf *);
+static int     fairq_addq(struct fairq_class *, struct mbuf *, struct pf_mtag *);
 static struct mbuf *fairq_getq(struct fairq_class *, uint64_t);
 static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *);
 static fairq_bucket_t *fairq_selectq(struct fairq_class *, int);
@@ -498,6 +498,7 @@ fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
        struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
        struct fairq_class *cl;
+       struct pf_mtag *pf;
        int error;
        int len;
 
@@ -512,8 +513,8 @@ fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
                goto done;
        }
 
-       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
-               cl = clh_to_clp(pif, m->m_pkthdr.altq_qid);
+       if ((pf = altq_find_pftag(m)) != NULL)
+               cl = clh_to_clp(pif, pf->qid);
        else
                cl = NULL;
        if (cl == NULL) {
@@ -527,7 +528,7 @@ fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
        cl->cl_flags |= FARF_HAS_PACKETS;
        cl->cl_pktattr = NULL;
        len = m_pktlen(m);
-       if (fairq_addq(cl, m) != 0) {
+       if (fairq_addq(cl, m, pf) != 0) {
                /* drop occurred.  mbuf was freed in fairq_addq. */
                PKTCNTR_ADD(&cl->cl_dropcnt, len);
                error = ENOBUFS;
@@ -626,7 +627,7 @@ fairq_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
 }
 
 static int
-fairq_addq(struct fairq_class *cl, struct mbuf *m)
+fairq_addq(struct fairq_class *cl, struct mbuf *m, struct pf_mtag *pf)
 {
        fairq_bucket_t *b;
        u_int hindex;
@@ -636,13 +637,13 @@ fairq_addq(struct fairq_class *cl, struct mbuf *m)
         * If the packet doesn't have any keep state put it on the end of
         * our queue.  XXX this can result in out of order delivery.
         */
-       if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_STATE_HASHED) == 0) {
+       if (pf == NULL || (pf->flags & PF_TAG_STATE_HASHED) == 0) {
                if (cl->cl_head)
                        b = cl->cl_head->prev;
                else
                        b = &cl->cl_buckets[0];
        } else {
-               hindex = m->m_pkthdr.altq_state_hash & cl->cl_nbucket_mask;
+               hindex = pf->state_hash & cl->cl_nbucket_mask;
                b = &cl->cl_buckets[hindex];
        }
 
index 45c7b48..43609de 100644 (file)
@@ -645,6 +645,7 @@ hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
        struct hfsc_if  *hif = (struct hfsc_if *)ifq->altq_disc;
        struct hfsc_class *cl;
+       struct pf_mtag *pf;
        int len;
 
        /* grab class set by classifier */
@@ -655,8 +656,8 @@ hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
                return (ENOBUFS);
        }
        crit_enter();
-       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
-               cl = clh_to_clp(hif, m->m_pkthdr.altq_qid);
+       if ((pf = altq_find_pftag(m)) != NULL)
+               cl = clh_to_clp(hif, pf->qid);
        else
                cl = NULL;
        if (cl == NULL || is_a_parent_class(cl)) {
index 0ca183e..00dd4e1 100644 (file)
@@ -415,6 +415,7 @@ priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
        struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
        struct priq_class *cl;
+       struct pf_mtag *pf;
        int error;
        int len;
 
@@ -429,8 +430,8 @@ priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
                goto done;
        }
 
-       if (m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED)
-               cl = clh_to_clp(pif, m->m_pkthdr.altq_qid);
+       if ((pf = altq_find_pftag(m)) != NULL)
+               cl = clh_to_clp(pif, pf->qid);
        else
                cl = NULL;
        if (cl == NULL) {
index 3163ea2..1a4dade 100644 (file)
@@ -419,14 +419,16 @@ drop_early(int fp_len, int fp_probd, int count)
 int
 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
 {
+       struct pf_mtag *pf;
        struct mbuf *m0;
        void *hdr;
        int  af;
 
-       if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) == 0)
+       pf = altq_find_pftag(m);
+       if (pf == NULL)
                return (0);
-       af = m->m_pkthdr.ecn_af;
-       hdr = m->m_pkthdr.header;
+       af = pf->af;
+       hdr = pf->hdr;
 
        if (af != AF_INET && af != AF_INET6)
                return (0);
index ea70e53..9b39768 100644 (file)
@@ -877,3 +877,15 @@ read_machclk(void)
        }
        return (val);
 }
+
+/* Return m's pf tag payload (stored right after the m_tag), or NULL. */
+struct pf_mtag *
+altq_find_pftag(struct mbuf *m)
+{
+       struct m_tag *tag = m_tag_find(m, PF_MBUF_TAGGED, NULL);
+
+       if (tag == NULL)
+               return (NULL);
+       return ((struct pf_mtag *)(tag + 1));
+}
+
index e5a54ba..c5db2cc 100644 (file)
@@ -64,8 +64,10 @@ extern int pfaltq_running;
 struct ifnet;
 struct mbuf;
 struct pf_altq;
+struct pf_mtag;
 
 void   *altq_lookup(const char *, int);
+struct pf_mtag *altq_find_pftag(struct mbuf *m);
 uint8_t        read_dsfield(struct mbuf *, struct altq_pktattr *);
 void   write_dsfield(struct mbuf *, struct altq_pktattr *, uint8_t);
 int    tbr_set(struct ifaltq *, struct tb_profile *);
index 297e8dc..0e3234e 100644 (file)
@@ -1249,6 +1249,28 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
        rel_mplock();
 }
 
+/*
+ * Tap a packet whose leading header lives in a separate linear buffer:
+ * `data'/`dlen' describe that header, `m' is the mbuf chain following it.
+ *
+ * A bare struct m_hdr on the stack stands in for the first mbuf.  Only
+ * mh_flags, mh_next, mh_len and mh_data are initialized, so bpf_mtap()
+ * must not read any other mbuf field, free the header, or keep a
+ * pointer to it.  `direction' is currently unused.
+ */
+void
+bpf_mtap_hdr(struct bpf_if *arg, caddr_t data, u_int dlen, struct mbuf *m, u_int direction)
+{
+       struct m_hdr mh;
+
+       mh.mh_flags = 0;
+       mh.mh_next = m;
+       mh.mh_len = dlen;
+       mh.mh_data = data;
+
+       bpf_mtap(arg, (struct mbuf *)&mh);      /* void: nothing to return */
+}
+
 void
 bpf_mtap_family(struct bpf_if *bp, struct mbuf *m, sa_family_t family)
 {
index d8b86c6..d205afe 100644 (file)
@@ -249,6 +249,7 @@ int  bpf_validate(const struct bpf_insn *, int);
 void    bpf_tap(struct bpf_if *, u_char *, u_int);
 void    bpf_mtap(struct bpf_if *, struct mbuf *);
 void    bpf_mtap_family(struct bpf_if *, struct mbuf *m, __uint8_t family);
+void    bpf_mtap_hdr(struct bpf_if *, caddr_t, u_int, struct mbuf *, u_int);
 void    bpf_ptap(struct bpf_if *, struct mbuf *, const void *, u_int);
 void    bpfattach(struct ifnet *, u_int, u_int);
 void    bpfattach_dlt(struct ifnet *, u_int, u_int, struct bpf_if **);
index e4c25c2..a422f2a 100644 (file)
@@ -322,6 +322,36 @@ struct     ifconf {
 #define        ifc_req ifc_ifcu.ifcu_req       /* array of structures returned */
 };
 
+/*
+ * interface groups
+ */
+
+#define        IFG_ALL         "all"           /* group contains all interfaces */
+/* XXX: will we implement this? */
+#define        IFG_EGRESS      "egress"        /* if(s) default route(s) point to */
+
+struct ifg_req {        /* one IFNAMSIZ-byte string, viewed as group or member */
+       union {         /* both arrays overlay the same storage */
+               char                     ifgrqu_group[IFNAMSIZ];
+               char                     ifgrqu_member[IFNAMSIZ];
+       } ifgrq_ifgrqu;
+#define        ifgrq_group     ifgrq_ifgrqu.ifgrqu_group
+#define        ifgrq_member    ifgrq_ifgrqu.ifgrqu_member
+};
+
+/*
+ * Used to look up groups for an interface
+ */
+struct ifgroupreq {
+       char    ifgr_name[IFNAMSIZ];    /* presumably the interface name -- confirm */
+       u_int   ifgr_len;               /* presumably size of ifgr_groups -- confirm */
+       union {                         /* one inline name, or a pointer */
+               char    ifgru_group[IFNAMSIZ];
+               struct  ifg_req *ifgru_groups;  /* points at struct ifg_req */
+       } ifgr_ifgru;
+#define ifgr_group     ifgr_ifgru.ifgru_group
+#define ifgr_groups    ifgr_ifgru.ifgru_groups
+};
 
 /*
  * Structure for SIOC[AGD]LIFADDR
index 529ca73..e880bfc 100644 (file)
 #define        IFT_PVC         0xf1
 #define        IFT_FAITH       0xf2
 #define        IFT_STF         0xf3
+#define        IFT_ENC         0xf4
 #define        IFT_PFLOG       0xf5            /* Packet filter logging */
 #define        IFT_PFSYNC      0xf6            /* Packet filter state syncing */
 #define        IFT_CARP        0xf8            /* Common Address Redundancy Protocol */
index 8259fad..c016aa8 100644 (file)
@@ -233,6 +233,8 @@ struct ifnet {
                (struct ifnet *, struct sockaddr **, struct sockaddr *);
        int     (*if_start_cpuid)       /* cpuid to run if_start */
                (struct ifnet *);
+       TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */
+                                       /* protected by if_addr_mtx */
 #ifdef DEVICE_POLLING
        void    (*if_poll)              /* IFF_POLLING support */
                (struct ifnet *, enum poll_cmd, int);
@@ -272,6 +274,7 @@ struct ifnet {
        struct lwkt_serialize if_default_serializer; /* if not supplied */
        int     if_cpuid;
        struct netmsg *if_start_nmsg; /* percpu messages to schedule if_start */
+       void    *if_pf_kif; /* pf interface abstraction */
 };
 typedef void if_init_f_t (void *);
 
@@ -599,6 +602,38 @@ EVENTHANDLER_DECLARE(ifnet_attach_event, ifnet_attach_event_handler_t);
 typedef void (*ifnet_detach_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_detach_event, ifnet_detach_event_handler_t);
 
+/*
+ * interface groups
+ *
+ * struct ifg_group describes one group: its IFNAMSIZ-byte name, a
+ * reference count, an opaque pointer reserved for pf (ifg_pf_kif), the
+ * tail queue of its members (struct ifg_member) and the linkage that
+ * chains groups together (ifg_next).
+ */
+struct ifg_group {
+       char                             ifg_group[IFNAMSIZ];
+       u_int                            ifg_refcnt;
+       void                            *ifg_pf_kif;
+       TAILQ_HEAD(, ifg_member)         ifg_members;
+       TAILQ_ENTRY(ifg_group)           ifg_next;
+};
+
+/*
+ * Element of a group's ifg_members queue; records one interface (ifgm_ifp)
+ * that belongs to the group.
+ */
+struct ifg_member {
+       TAILQ_ENTRY(ifg_member)  ifgm_next;
+       struct ifnet            *ifgm_ifp;
+};
+
+/*
+ * Element of an interface's if_groups queue (see struct ifnet); records one
+ * group (ifgl_group) that the interface belongs to.
+ */
+struct ifg_list {
+       struct ifg_group        *ifgl_group;
+       TAILQ_ENTRY(ifg_list)    ifgl_next;
+};
+
+/* group attach event */
+typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
+EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
+/* group detach event */
+typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
+EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
+/* group change event */
+typedef void (*group_change_event_handler_t)(void *, const char *);
+EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
+
+
 #ifdef INVARIANTS
 #define ASSERT_IFAC_VALID(ifac)        do { \
        KKASSERT((ifac)->ifa_magic == IFA_CONTAINER_MAGIC); \
index 106e337..91bf512 100644 (file)
@@ -2,7 +2,7 @@
 
 KMOD=  pf
 SRCS=  if_pflog.c pf.c pf_if.c pf_ioctl.c pf_norm.c pf_osfp.c pf_subr.c
-SRCS+= pf_table.c
+SRCS+= pf_table.c pf_ruleset.c
 SRCS+= use_pflog.h use_pfsync.h opt_inet.h opt_inet6.h
 SRCS+= opt_icmp_bandlim.h
 
index 1ed7bbe..7b6da40 100644 (file)
@@ -1,7 +1,7 @@
 /*     $FreeBSD: src/sys/contrib/pf/net/if_pflog.c,v 1.9 2004/06/22 20:13:24 brooks Exp $      */
 /*     $OpenBSD: if_pflog.c,v 1.11 2003/12/31 11:18:25 cedric Exp $    */
 /*     $DragonFly: src/sys/net/pf/if_pflog.c,v 1.6 2006/12/22 23:44:57 swildner Exp $ */
-
+/*     $OpenBSD: if_pflog.c,v 1.22 2006/12/15 09:31:20 otto Exp $      */
 /*
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and 
@@ -45,6 +45,7 @@
 #include <sys/systm.h>
 #include <sys/in_cksum.h>
 #include <sys/mbuf.h>
+#include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #define DPRINTF(x)
 #endif
 
-
-static void    pflog_clone_destroy(struct ifnet *);
-static int     pflog_clone_create(struct if_clone *, int, caddr_t);
+void   pflogattach(int);
 int    pflogoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
                       struct rtentry *);
 int    pflogioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
 void   pflogrtrequest(int, struct rtentry *, struct sockaddr *);
 void   pflogstart(struct ifnet *);
 
-static MALLOC_DEFINE(M_PFLOG, PFLOGNAME, "Packet Filter Logging Interface");
-static LIST_HEAD(pflog_list, pflog_softc) pflog_list;
-struct if_clone pflog_cloner = IF_CLONE_INITIALIZER("pflog", pflog_clone_create,
-           pflog_clone_destroy, 1, 1);
-
-static void
-pflog_clone_destroy(struct ifnet *ifp)
-{
-       struct pflog_softc *sc;
+int    pflog_clone_create(struct if_clone *, int, caddr_t);
+void pflog_clone_destroy(struct ifnet *);
 
-       sc = ifp->if_softc;
+LIST_HEAD(, pflog_softc)       pflogif_list;
+struct if_clone        pflog_cloner =
+    IF_CLONE_INITIALIZER("pflog", pflog_clone_create, pflog_clone_destroy, 1, 1);
 
-       /*
-        * Do we really need this?
-        */
-       IF_DRAIN(&ifp->if_snd);
+struct ifnet   *pflogifs[PFLOGIFS_MAX];        /* for fast access */
 
-       bpfdetach(ifp);
-       if_detach(ifp);
-       LIST_REMOVE(sc, sc_next);
-       kfree(sc, M_PFLOG);
+/*
+ * Pseudo-device attach hook for pflog.
+ *
+ * Resets the softc list and every slot of the pflogifs[] lookup table,
+ * creates unit 0 by hand (its return value is deliberately ignored) and
+ * then registers the cloner.  The npflog argument is not used.
+ */
+void
+pflogattach(int npflog)
+{
+       int     slot;
+
+       LIST_INIT(&pflogif_list);
+       for (slot = 0; slot < PFLOGIFS_MAX; slot++) {
+               pflogifs[slot] = NULL;
+       }
+
+       (void) pflog_clone_create(&pflog_cloner, 0, NULL);
+       if_clone_attach(&pflog_cloner);
 }
 
-static int
+/*
+ * Create and attach the pflog interface for one clone unit.
+ *
+ * ifc   - cloner requesting the interface (not read here)
+ * unit  - unit number; also the slot used in pflogifs[]
+ * param - unused
+ *
+ * Returns 0 on success, EINVAL when unit does not fit in pflogifs[], or
+ * ENOMEM when the softc cannot be allocated.
+ */
+int
 pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
 {
-       struct pflog_softc *sc;
+       struct ifnet *ifp;
+       struct pflog_softc *pflogif;
 
-       MALLOC(sc, struct pflog_softc *, sizeof(*sc), M_PFLOG, M_WAITOK|M_ZERO);
+       /* unit indexes pflogifs[] below, so it must lie inside the table */
+       if (unit < 0 || unit >= PFLOGIFS_MAX)
+               return (EINVAL);
+
+       /* M_ZERO hands back a cleared softc, no separate bzero() needed */
+       pflogif = kmalloc(sizeof(*pflogif), M_DEVBUF, M_WAITOK | M_ZERO);
+       if (pflogif == NULL)
+               return (ENOMEM);
+
+       pflogif->sc_unit = unit;
+       ifp = &pflogif->sc_if;
+       ksnprintf(ifp->if_xname, sizeof ifp->if_xname, "pflog%d", unit);
+       ifp->if_softc = pflogif;
+       ifp->if_mtu = PFLOGMTU;
+       ifp->if_ioctl = pflogioctl;
+       ifp->if_output = pflogoutput;
+       ifp->if_start = pflogstart;
+       ifp->if_type = IFT_PFLOG;
+       ifp->if_snd.ifq_maxlen = ifqmaxlen;
+       ifp->if_hdrlen = PFLOG_HDRLEN;
+       if_attach(ifp, NULL);
+
+       bpfattach(&pflogif->sc_if, DLT_PFLOG, PFLOG_HDRLEN);
+
+       /* publish the interface: list membership plus the lookup slot */
+       crit_enter();
+       LIST_INSERT_HEAD(&pflogif_list, pflogif, sc_list);
+       pflogifs[unit] = ifp;
+       crit_exit();
 
-       if_initname(&sc->sc_if, ifc->ifc_name, unit);
-        sc->sc_if.if_mtu = PFLOGMTU;
-        sc->sc_if.if_ioctl = pflogioctl;
-        sc->sc_if.if_output = pflogoutput;
-        sc->sc_if.if_start = pflogstart;
-        sc->sc_if.if_type = IFT_PFLOG;
-        sc->sc_if.if_snd.ifq_maxlen = ifqmaxlen;
-        sc->sc_if.if_hdrlen = PFLOG_HDRLEN;
-        sc->sc_if.if_softc = sc;
-        if_attach(&sc->sc_if, NULL);
+       return (0);
+}
+
+/*
+ * Tear down a pflog interface created by pflog_clone_create().
+ *
+ * The interface is first removed from pflogifs[] and pflogif_list inside a
+ * critical section, then detached from bpf and the interface layer, and
+ * finally its softc is freed.
+ */
+void
+pflog_clone_destroy(struct ifnet *ifp)
+{
+       struct pflog_softc      *pflogif = ifp->if_softc;
 
-        LIST_INSERT_HEAD(&pflog_list, sc, sc_next);
-       bpfattach(&sc->sc_if, DLT_PFLOG, PFLOG_HDRLEN);
+       crit_enter();
+       pflogifs[pflogif->sc_unit] = NULL;
+       LIST_REMOVE(pflogif, sc_list);
+       crit_exit();
 
-        return (0);
+       /*
+        * pflog_clone_create() calls bpfattach() unconditionally, so the
+        * detach must be unconditional as well; guarding it with an
+        * NBPFILTER test would skip it whenever that macro is undefined.
+        */
+       bpfdetach(ifp);
+       if_detach(ifp);
+       kfree(pflogif, M_DEVBUF);
 }
 
 /*
@@ -147,10 +172,19 @@ pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
 void
 pflogstart(struct ifnet *ifp)
 {
-       crit_enter();
-       IF_DROP(&ifp->if_snd);
-       IF_DRAIN(&ifp->if_snd);
-       crit_exit();
+       struct mbuf *m;
+
+       for (;;) {
+               crit_enter();
+               IF_DROP(&ifp->if_snd);
+               IF_DEQUEUE(&ifp->if_snd, m);
+               crit_exit();
+
+               if (m == NULL)
+                       return;
+               else
+                       m_freem(m);
+       }
 }
 
 int
@@ -193,15 +227,17 @@ pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
 int
 pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
     u_int8_t reason, struct pf_rule *rm, struct pf_rule *am,
-    struct pf_ruleset *ruleset)
+    struct pf_ruleset *ruleset, struct pf_pdesc *pd)
 {
-       struct ifnet *ifn;
+       struct ifnet *ifn = NULL;
        struct pfloghdr hdr;
-       struct mbuf m1;
 
        if (kif == NULL || m == NULL || rm == NULL)
                return (-1);
 
+       if ((ifn = pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
+               return (0);
+
        bzero(&hdr, sizeof(hdr));
        hdr.length = PFLOG_REAL_HDRLEN;
        hdr.af = af;
@@ -215,36 +251,42 @@ pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
        } else {
                hdr.rulenr = htonl(am->nr);
                hdr.subrulenr = htonl(rm->nr);
-               if (ruleset != NULL)
-                       memcpy(hdr.ruleset, ruleset->name,
+               if (ruleset != NULL && ruleset->anchor != NULL) {
+                       strlcpy(hdr.ruleset, ruleset->anchor->name,
                            sizeof(hdr.ruleset));
-
-                       
+               }
        }
+       if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done)
+               pd->lookup.done = pf_socket_lookup(dir, pd, NULL);
+       if (pd->lookup.done > 0) {
+               hdr.uid = pd->lookup.uid;
+               hdr.pid = pd->lookup.pid;
+       } else {
+               hdr.uid = UID_MAX;
+               hdr.pid = NO_PID;
+       }
+       hdr.rule_uid = rm->cuid;
+       hdr.rule_pid = rm->cpid;
        hdr.dir = dir;
 
 #ifdef INET
        if (af == AF_INET) {
-               struct ip *ip = mtod(m, struct ip *);
-
+               struct ip *ip;
+               ip = mtod(m, struct ip *);      
                ip->ip_len = htons(ip->ip_len);
                ip->ip_off = htons(ip->ip_off);
 
                if (dir == PF_OUT) {
                        ip->ip_sum = 0;
                        ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
-               }
+               }                               
        }
 #endif /* INET */
 
-       m1.m_next = m;
-       m1.m_len = PFLOG_HDRLEN;
-       m1.m_data = (char *) &hdr;
-
-       KASSERT((!LIST_EMPTY(&pflog_list)), ("pflog: no interface"));
-       ifn = &LIST_FIRST(&pflog_list)->sc_if;
-
-       BPF_MTAP(ifn, &m1);
+       ifn->if_opackets++;
+       ifn->if_obytes += m->m_pkthdr.len;
+       bpf_mtap_hdr(ifn->if_bpf, (char *)&hdr, PFLOG_HDRLEN, m,
+           BPF_DIRECTION_OUT);
 
 #ifdef INET
        if (af == AF_INET) {
@@ -265,15 +307,16 @@ pflog_modevent(module_t mod, int type, void *data)
 
        switch (type) {
        case MOD_LOAD:
-               LIST_INIT(&pflog_list);
+               LIST_INIT(&pflogif_list);
                if_clone_attach(&pflog_cloner);
                break;
 
        case MOD_UNLOAD:
                if_clone_detach(&pflog_cloner);
-               while (!LIST_EMPTY(&pflog_list))
+               while (!LIST_EMPTY(&pflogif_list)) {
                        pflog_clone_destroy(
-                               &LIST_FIRST(&pflog_list)->sc_if);
+                               &LIST_FIRST(&pflogif_list)->sc_if);
+               }
                break;
 
        default:
index 6eeb1b0..7b3a8ed 100644 (file)
@@ -1,7 +1,7 @@
 /* $FreeBSD: src/sys/contrib/pf/net/if_pflog.h,v 1.4 2004/06/16 23:24:00 mlaier Exp $ */
 /* $OpenBSD: if_pflog.h,v 1.10 2004/03/19 04:52:04 frantzen Exp $ */
 /* $DragonFly: src/sys/net/pf/if_pflog.h,v 1.1 2004/09/19 22:32:47 joerg Exp $ */
-
+/* $OpenBSD: if_pflog.h,v 1.14 2006/10/25 11:27:01 henning Exp $ */
 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
  *
 #ifndef _NET_IF_PFLOG_H_
 #define _NET_IF_PFLOG_H_
 
+#define        PFLOGIFS_MAX    16
+
 struct pflog_softc {
-       struct ifnet    sc_if;  /* the interface */
-       LIST_ENTRY(pflog_softc) sc_next;
+       struct ifnet            sc_if;  /* the interface */
+       int                     sc_unit;        /* unit number; slot in pflogifs[] */
+       LIST_ENTRY(pflog_softc) sc_list;        /* linkage on pflogif_list */
 };
 
-/* XXX keep in sync with pfvar.h */
-#ifndef PF_RULESET_NAME_SIZE
-#define PF_RULESET_NAME_SIZE    16
-#endif
+#define PFLOG_RULESET_NAME_SIZE        16
 
 struct pfloghdr {
        u_int8_t        length;
@@ -48,9 +48,13 @@ struct pfloghdr {
        u_int8_t        action;
        u_int8_t        reason;
        char            ifname[IFNAMSIZ];
-       char            ruleset[PF_RULESET_NAME_SIZE];
+       char            ruleset[PFLOG_RULESET_NAME_SIZE];
        u_int32_t       rulenr;
        u_int32_t       subrulenr;
+       uid_t           uid;
+       pid_t           pid;
+       uid_t           rule_uid;
+       pid_t           rule_pid;
        u_int8_t        dir;
        u_int8_t        pad[3];
 };
@@ -75,9 +79,9 @@ struct old_pfloghdr {
 #include "use_pflog.h"
 
 #if NPFLOG > 0
-#define        PFLOG_PACKET(i,x,a,b,c,d,e,f,g) pflog_packet(i,a,b,c,d,e,f,g)
+#define        PFLOG_PACKET(i,x,a,b,c,d,e,f,g,h) pflog_packet(i,a,b,c,d,e,f,g,h)
 #else
-#define        PFLOG_PACKET(i,x,a,b,c,d,e,f,g) ((void)0)
+#define        PFLOG_PACKET(i,x,a,b,c,d,e,f,g,h) ((void)0)
 #endif /* NPFLOG > 0 */
 #endif /* _KERNEL */
 #endif /* _NET_IF_PFLOG_H_ */
index 17a4790..8a78505 100644 (file)
@@ -1,10 +1,6 @@
-/*     $FreeBSD: src/sys/contrib/pf/net/if_pfsync.c,v 1.11 2004/08/14 15:32:40 dwmalone Exp $  */
-/*     $OpenBSD: if_pfsync.c,v 1.26 2004/03/28 18:14:20 mcbride Exp $  */
-/*     $DragonFly: src/sys/net/pf/if_pfsync.c,v 1.8 2008/04/12 17:39:41 dillon Exp $ */
+/*     $OpenBSD: if_pfsync.c,v 1.73 2006/11/16 13:13:38 henning Exp $  */
 
 /*
- * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
- *
  * Copyright (c) 2002 Michael Shalayeff
  * All rights reserved.
  *
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "opt_inet.h"
-#include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
-#include <sys/priv.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/timeout.h>
 #include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/sockio.h>
-#include <sys/thread2.h>
-#include <vm/vm_zone.h>
-
-#include <machine/inttypes.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/bpf.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
 
 #ifdef INET
-#include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #endif
 
 #ifdef INET6
-#ifndef INET
-#include <netinet/in.h>
-#endif
 #include <netinet6/nd6.h>
 #endif /* INET6 */
 
-#include <net/pf/pfvar.h>
-#include <net/pf/if_pfsync.h>
+#include "carp.h"
+#if NCARP > 0
+#include <netinet/ip_carp.h>
+#endif
 
-#define        PFSYNCNAME      "pfsync"
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+
+#include "bpfilter.h"
+#include "pfsync.h"
 
 #define PFSYNC_MINMTU  \
     (sizeof(struct pfsync_header) + sizeof(struct pf_state))
 
 #ifdef PFSYNCDEBUG
-#define DPRINTF(x)    do { if (pfsyncdebug) kprintf x ; } while (0)
+#define DPRINTF(x)    do { if (pfsyncdebug) printf x ; } while (0)
 int pfsyncdebug;
 #else
 #define DPRINTF(x)
 #endif
 
-int                    pfsync_sync_ok;
-struct pfsyncstats     pfsyncstats;
+struct pfsync_softc    *pfsyncif = NULL;
+struct pfsyncstats      pfsyncstats;
 
-static void    pfsync_clone_destroy(struct ifnet *);
-static int     pfsync_clone_create(struct if_clone *, int, caddr_t);
+void   pfsyncattach(int);
+int    pfsync_clone_create(struct if_clone *, int);
+int    pfsync_clone_destroy(struct ifnet *);
 void   pfsync_setmtu(struct pfsync_softc *, int);
-int    pfsync_insert_net_state(struct pfsync_state *);
+int    pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
+           struct pf_state_peer *);
+int    pfsync_insert_net_state(struct pfsync_state *, u_int8_t);
+void   pfsync_update_net_tdb(struct pfsync_tdb *);
 int    pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
            struct rtentry *);
-int    pfsyncioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
+int    pfsyncioctl(struct ifnet *, u_long, caddr_t);
 void   pfsyncstart(struct ifnet *);
 
 struct mbuf *pfsync_get_mbuf(struct pfsync_softc *, u_int8_t, void **);
 int    pfsync_request_update(struct pfsync_state_upd *, struct in_addr *);
 int    pfsync_sendout(struct pfsync_softc *);
+int    pfsync_tdb_sendout(struct pfsync_softc *);
+int    pfsync_sendout_mbuf(struct pfsync_softc *, struct mbuf *);
 void   pfsync_timeout(void *);
+void   pfsync_tdb_timeout(void *);
 void   pfsync_send_bus(struct pfsync_softc *, u_int8_t);
 void   pfsync_bulk_update(void *);
 void   pfsync_bulkfail(void *);
 
-static MALLOC_DEFINE(M_PFSYNC, PFSYNCNAME, "Packet Filter State Sync. Interface");
-static LIST_HEAD(pfsync_list, pfsync_softc) pfsync_list;
-struct if_clone pfsync_cloner = IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create,
-    pfsync_clone_destroy, 1, 1);
-
-static void
-pfsync_clone_destroy(struct ifnet *ifp)
-{
-       struct pfsync_softc *sc;
+int    pfsync_sync_ok;
+extern int ifqmaxlen;
 
-       sc = ifp->if_softc;
-       callout_stop(&sc->sc_tmo);
-       callout_stop(&sc->sc_bulk_tmo);
-       callout_stop(&sc->sc_bulkfail_tmo);
+struct if_clone        pfsync_cloner =
+    IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
 
-       bpfdetach(ifp);
-       if_detach(ifp);
-       LIST_REMOVE(sc, sc_next);
-       kfree(sc, M_PFSYNC);
+/*
+ * Pseudo-device attach hook for pfsync: registers the pfsync cloner.
+ * The npfsync argument is not used.
+ */
+void
+pfsyncattach(int npfsync)
+{
+       if_clone_attach(&pfsync_cloner);
 }
-
-static int
-pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
+/*
+ * Create the single pfsync interface.
+ *
+ * Only unit 0 is accepted.  The softc is allocated into the global
+ * pfsyncif, zeroed, given its defaults, wired up with four timeouts and
+ * attached as an interface; the "carp" group membership and the bpf
+ * attachment are compiled in only when NCARP / NBPFILTER are positive.
+ *
+ * Returns 0 on success, EINVAL for any unit other than 0, or ENOMEM when
+ * the allocation fails.
+ *
+ * NOTE(review): pfsync_sync_ok is set to 1 before the allocation and is
+ * not reset on the ENOMEM path -- confirm that is intended.
+ */
+int
+pfsync_clone_create(struct if_clone *ifc, int unit)
 {
-       struct pfsync_softc *sc;
        struct ifnet *ifp;
 
-       MALLOC(sc, struct pfsync_softc *, sizeof(*sc), M_PFSYNC,
-           M_WAITOK|M_ZERO);
+       if (unit != 0)
+               return (EINVAL);
 
        pfsync_sync_ok = 1;
-       sc->sc_mbuf = NULL;
-       sc->sc_mbuf_net = NULL;
-       sc->sc_statep.s = NULL;
-       sc->sc_statep_net.s = NULL;
-       sc->sc_maxupdates = 128;
-       sc->sc_sendaddr.s_addr = htonl(INADDR_PFSYNC_GROUP);
-       sc->sc_ureq_received = 0;
-       sc->sc_ureq_sent = 0;
-
-       ifp = &sc->sc_if;
-       if_initname(ifp, ifc->ifc_name, unit);
+       if ((pfsyncif = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK)) == NULL)
+               return (ENOMEM);
+       bzero(pfsyncif, sizeof(*pfsyncif));
+       pfsyncif->sc_mbuf = NULL;
+       pfsyncif->sc_mbuf_net = NULL;
+       pfsyncif->sc_mbuf_tdb = NULL;
+       pfsyncif->sc_statep.s = NULL;
+       pfsyncif->sc_statep_net.s = NULL;
+       pfsyncif->sc_statep_tdb.t = NULL;
+       pfsyncif->sc_maxupdates = 128;
+       /*
+        * NOTE(review): the removed code stored htonl(INADDR_PFSYNC_GROUP);
+        * confirm the byte order of the constant on this platform.
+        */
+       pfsyncif->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
+       pfsyncif->sc_sendaddr.s_addr = INADDR_PFSYNC_GROUP;
+       pfsyncif->sc_ureq_received = 0;
+       pfsyncif->sc_ureq_sent = 0;
+       pfsyncif->sc_bulk_send_next = NULL;
+       pfsyncif->sc_bulk_terminator = NULL;
+       ifp = &pfsyncif->sc_if;
+       snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
+       ifp->if_softc = pfsyncif;
        ifp->if_ioctl = pfsyncioctl;
        ifp->if_output = pfsyncoutput;
        ifp->if_start = pfsyncstart;
        ifp->if_type = IFT_PFSYNC;
        ifp->if_snd.ifq_maxlen = ifqmaxlen;
        ifp->if_hdrlen = PFSYNC_HDRLEN;
-       ifp->if_baudrate = IF_Mbps(100);
-       ifp->if_softc = sc;
-       pfsync_setmtu(sc, MCLBYTES);
-       callout_init(&sc->sc_tmo);
-       callout_init(&sc->sc_bulk_tmo);
-       callout_init(&sc->sc_bulkfail_tmo);
-       if_attach(&sc->sc_if, NULL);
+       pfsync_setmtu(pfsyncif, ETHERMTU);
+       /* every timeout handler receives the softc as its argument */
+       timeout_set(&pfsyncif->sc_tmo, pfsync_timeout, pfsyncif);
+       timeout_set(&pfsyncif->sc_tdb_tmo, pfsync_tdb_timeout, pfsyncif);
+       timeout_set(&pfsyncif->sc_bulk_tmo, pfsync_bulk_update, pfsyncif);
+       timeout_set(&pfsyncif->sc_bulkfail_tmo, pfsync_bulkfail, pfsyncif);
+       if_attach(ifp);
+       if_alloc_sadl(ifp);
+
+#if NCARP > 0
+       if_addgroup(ifp, "carp");
+#endif
+
+#if NBPFILTER > 0
+       bpfattach(&pfsyncif->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
+#endif
 
-       LIST_INSERT_HEAD(&pfsync_list, sc, sc_next);
-       bpfattach(&sc->sc_if, DLT_PFSYNC, PFSYNC_HDRLEN);
+       return (0);
+}
 
+/*
+ * Destroy the pfsync interface created by pfsync_clone_create().
+ *
+ * Cancels the four timeouts registered against the softc, detaches the
+ * interface from bpf (when compiled in) and from the interface layer,
+ * then frees the softc and clears the global pfsyncif pointer.
+ * Always returns 0.
+ */
+int
+pfsync_clone_destroy(struct ifnet *ifp)
+{
+       /*
+        * The timeout handlers take the softc as their argument; cancel
+        * them first so none can fire against freed memory.
+        */
+       timeout_del(&pfsyncif->sc_tmo);
+       timeout_del(&pfsyncif->sc_tdb_tmo);
+       timeout_del(&pfsyncif->sc_bulk_tmo);
+       timeout_del(&pfsyncif->sc_bulkfail_tmo);
+
+#if NBPFILTER > 0
+       bpfdetach(ifp);
+#endif
+       if_detach(ifp);
+       free(pfsyncif, M_DEVBUF);
+       pfsyncif = NULL;
        return (0);
 }
 
@@ -172,51 +187,92 @@ pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
 void
 pfsyncstart(struct ifnet *ifp)
 {
-       crit_enter();
-       IF_DROP(&ifp->if_snd);
-       IF_DRAIN(&ifp->if_snd);
-       crit_exit();
+       struct mbuf *m;
+       int s;
+
+       for (;;) {
+               s = splnet();
+               IF_DROP(&ifp->if_snd);
+               IF_DEQUEUE(&ifp->if_snd, m);
+               splx(s);
+
+               if (m == NULL)
+                       return;
+               else
+                       m_freem(m);
+       }
+}
+
+/*
+ * Make sure d has a scrub buffer when the wire-format peer s asks for one.
+ *
+ * When s->scrub.scrub_flag is set and d->scrub is still NULL, a buffer is
+ * taken from pf_state_scrub_pl without waiting and zeroed.  Returns 0 on
+ * success or when nothing had to be done, ENOMEM when the pool is empty.
+ */
+int
+pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
+    struct pf_state_peer *d)
+{
+       /* no scrub data requested, or a buffer is already attached */
+       if (!s->scrub.scrub_flag || d->scrub != NULL)
+               return (0);
+
+       d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
+       if (d->scrub == NULL)
+               return (ENOMEM);
+
+       bzero(d->scrub, sizeof(*d->scrub));
+       return (0);
 }
 
 int
-pfsync_insert_net_state(struct pfsync_state *sp)
+pfsync_insert_net_state(struct pfsync_state *sp, u_int8_t chksum_flag)
 {
        struct pf_state *st = NULL;
        struct pf_rule *r = NULL;
        struct pfi_kif  *kif;
 
        if (sp->creatorid == 0 && pf_status.debug >= PF_DEBUG_MISC) {
-               kprintf("pfsync_insert_net_state: invalid creator id:"
-                   " %08" PRIx32 "\n", ntohl(sp->creatorid));
+               printf("pfsync_insert_net_state: invalid creator id:"
+                   " %08x\n", ntohl(sp->creatorid));
                return (EINVAL);
        }
 
-       kif = pfi_lookup_create(sp->ifname);
+       kif = pfi_kif_get(sp->ifname);
        if (kif == NULL) {
                if (pf_status.debug >= PF_DEBUG_MISC)
-                       kprintf("pfsync_insert_net_state: "
+                       printf("pfsync_insert_net_state: "
                            "unknown interface: %s\n", sp->ifname);
                /* skip this state */
                return (0);
        }
 
        /*
-        * Just use the default rule until we have infrastructure to find the
-        * best matching rule.
+        * If the ruleset checksums match, it's safe to associate the state
+        * with the rule of that number.
         */
-       r = &pf_default_rule;
+       if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && chksum_flag)
+               r = pf_main_ruleset.rules[
+                   PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
+       else
+               r = &pf_default_rule;
 
        if (!r->max_states || r->states < r->max_states)
                st = pool_get(&pf_state_pl, PR_NOWAIT);
        if (st == NULL) {
-               pfi_maybe_destroy(kif);
+               pfi_kif_unref(kif, PFI_KIF_REF_NONE);
                return (ENOMEM);
        }
        bzero(st, sizeof(*st));
 
+       /* allocate memory for scrub info */
+       if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
+           pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) {
+               pfi_kif_unref(kif, PFI_KIF_REF_NONE);
+               if (st->src.scrub)
+                       pool_put(&pf_state_scrub_pl, st->src.scrub);
+               pool_put(&pf_state_pl, st);
+               return (ENOMEM);
+       }
+
        st->rule.ptr = r;
        /* XXX get pointers to nat_rule and anchor */
 
+       /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
+       r->states++;
+
        /* fill in the rest of the state entry */
        pf_state_host_ntoh(&sp->lan, &st->lan);
        pf_state_host_ntoh(&sp->gwy, &st->gwy);
@@ -226,8 +282,7 @@ pfsync_insert_net_state(struct pfsync_state *sp)
        pf_state_peer_ntoh(&sp->dst, &st->dst);
 
        bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
-       st->hash = pf_state_hash(st);
-       st->creation = ntohl(sp->creation) + time_second;
+       st->creation = time_second - ntohl(sp->creation);
        st->expire = ntohl(sp->expire) + time_second;
 
        st->af = sp->af;
@@ -239,11 +294,16 @@ pfsync_insert_net_state(struct pfsync_state *sp)
 
        bcopy(sp->id, &st->id, sizeof(st->id));
        st->creatorid = sp->creatorid;
-       st->sync_flags = sp->sync_flags | PFSTATE_FROMSYNC;
-
+       st->sync_flags = PFSTATE_FROMSYNC;
 
        if (pf_insert_state(kif, st)) {
-               pfi_maybe_destroy(kif);
+               pfi_kif_unref(kif, PFI_KIF_REF_NONE);
+               /* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
+               r->states--;
+               if (st->dst.scrub)
+                       pool_put(&pf_state_scrub_pl, st->dst.scrub);
+               if (st->src.scrub)
+                       pool_put(&pf_state_scrub_pl, st->src.scrub);
                pool_put(&pf_state_pl, st);
                return (EINVAL);
        }
@@ -256,22 +316,25 @@ pfsync_input(struct mbuf *m, ...)
 {
        struct ip *ip = mtod(m, struct ip *);
        struct pfsync_header *ph;
-       struct pfsync_softc *sc = LIST_FIRST(&pfsync_list);
-       struct pf_state *st, key;
+       struct pfsync_softc *sc = pfsyncif;
+       struct pf_state *st;
+       struct pf_state_cmp key;
        struct pfsync_state *sp;
        struct pfsync_state_upd *up;
        struct pfsync_state_del *dp;
        struct pfsync_state_clr *cp;
        struct pfsync_state_upd_req *rup;
        struct pfsync_state_bus *bus;
+       struct pfsync_tdb *pt;
        struct in_addr src;
        struct mbuf *mp;
-       int iplen, action, error, i, count, offp;
+       int iplen, action, error, i, s, count, offp, sfail, stale = 0;
+       u_int8_t chksum_flag = 0;
 
        pfsyncstats.pfsyncs_ipackets++;
 
        /* verify that we have a sync interface configured */
-       if (!sc->sc_sync_ifp || !pf_status.running)
+       if (!sc || !sc->sc_sync_ifp || !pf_status.running)
                goto done;
 
        /* verify that the packet came in on the right interface */
@@ -320,8 +383,12 @@ pfsync_input(struct mbuf *m, ...)
        /* Cheaper to grab this now than having to mess with mbufs later */
        src = ip->ip_src;
 
+       if (!bcmp(&ph->pf_chksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
+               chksum_flag++;
+
        switch (action) {
        case PFSYNC_ACT_CLR: {
+               struct pf_state *nexts;
                struct pfi_kif  *kif;
                u_int32_t creatorid;
                if ((mp = m_pulldown(m, iplen + sizeof(*ph),
@@ -332,29 +399,32 @@ pfsync_input(struct mbuf *m, ...)
                cp = (struct pfsync_state_clr *)(mp->m_data + offp);
                creatorid = cp->creatorid;
 
-               crit_enter();
+               s = splsoftnet();
                if (cp->ifname[0] == '\0') {
-                       RB_FOREACH(st, pf_state_tree_id, &tree_id) {
-                               if (st->creatorid == creatorid)
-                                       st->timeout = PFTM_PURGE;
+                       for (st = RB_MIN(pf_state_tree_id, &tree_id);
+                           st; st = nexts) {
+                               nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
+                               if (st->creatorid == creatorid) {
+                                       st->sync_flags |= PFSTATE_FROMSYNC;
+                                       pf_unlink_state(st);
+                               }
                        }
                } else {
-                       kif = pfi_lookup_if(cp->ifname);
-                       if (kif == NULL) {
-                               if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync_input: PFSYNC_ACT_CLR "
-                                           "bad interface: %s\n", cp->ifname);
-                               crit_exit();
-                               goto done;
+                       if ((kif = pfi_kif_get(cp->ifname)) == NULL) {
+                               splx(s);
+                               return;
                        }
-                       RB_FOREACH(st, pf_state_tree_lan_ext,
-                           &kif->pfik_lan_ext) {
-                               if (st->creatorid == creatorid)
-                                       st->timeout = PFTM_PURGE;
+                       for (st = RB_MIN(pf_state_tree_lan_ext,
+                           &kif->pfik_lan_ext); st; st = nexts) {
+                               nexts = RB_NEXT(pf_state_tree_lan_ext,
+                                   &kif->pfik_lan_ext, st);
+                               if (st->creatorid == creatorid) {
+                                       st->sync_flags |= PFSTATE_FROMSYNC;
+                                       pf_unlink_state(st);
+                               }
                        }
                }
-               pf_purge_expired_states();
-               crit_exit();
+               splx(s);
 
                break;
        }
@@ -365,7 +435,7 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
+               s = splsoftnet();
                for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
                    i < count; i++, sp++) {
                        /* check for invalid values */
@@ -375,21 +445,22 @@ pfsync_input(struct mbuf *m, ...)
                            sp->direction > PF_OUT ||
                            (sp->af != AF_INET && sp->af != AF_INET6)) {
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync_insert: PFSYNC_ACT_INS: "
+                                       printf("pfsync_insert: PFSYNC_ACT_INS: "
                                            "invalid value\n");
                                pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
 
-                       if ((error = pfsync_insert_net_state(sp))) {
+                       if ((error = pfsync_insert_net_state(sp,
+                           chksum_flag))) {
                                if (error == ENOMEM) {
-                                       crit_exit();
+                                       splx(s);
                                        goto done;
                                }
                                continue;
                        }
                }
-               crit_exit();
+               splx(s);
                break;
        case PFSYNC_ACT_UPD:
                if ((mp = m_pulldown(m, iplen + sizeof(*ph),
@@ -398,15 +469,17 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
+               s = splsoftnet();
                for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
                    i < count; i++, sp++) {
+                       int flags = PFSYNC_FLAG_STALE;
+
                        /* check for invalid values */
                        if (sp->timeout >= PFTM_MAX ||
                            sp->src.state > PF_TCPS_PROXY_DST ||
                            sp->dst.state > PF_TCPS_PROXY_DST) {
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync_insert: PFSYNC_ACT_UPD: "
+                                       printf("pfsync_insert: PFSYNC_ACT_UPD: "
                                            "invalid value\n");
                                pfsyncstats.pfsyncs_badstate++;
                                continue;
@@ -418,17 +491,79 @@ pfsync_input(struct mbuf *m, ...)
                        st = pf_find_state_byid(&key);
                        if (st == NULL) {
                                /* insert the update */
-                               if (pfsync_insert_net_state(sp))
+                               if (pfsync_insert_net_state(sp, chksum_flag))
                                        pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
+                       sfail = 0;
+                       if (st->proto == IPPROTO_TCP) {
+                               /*
+                                * The state should never go backwards except
+                                * for syn-proxy states.  Neither should the
+                                * sequence window slide backwards.
+                                */
+                               if (st->src.state > sp->src.state &&
+                                   (st->src.state < PF_TCPS_PROXY_SRC ||
+                                   sp->src.state >= PF_TCPS_PROXY_SRC))
+                                       sfail = 1;
+                               else if (SEQ_GT(st->src.seqlo,
+                                   ntohl(sp->src.seqlo)))
+                                       sfail = 3;
+                               else if (st->dst.state > sp->dst.state) {
+                                       /* There might still be useful
+                                        * information about the src state here,
+                                        * so import that part of the update,
+                                        * then "fail" so we send the updated
+                                        * state back to the peer who is missing
+                                        * what we know. */
+                                       pf_state_peer_ntoh(&sp->src, &st->src);
+                                       /* XXX do anything with timeouts? */
+                                       sfail = 7;
+                                       flags = 0;
+                               } else if (st->dst.state >= TCPS_SYN_SENT &&
+                                   SEQ_GT(st->dst.seqlo, ntohl(sp->dst.seqlo)))
+                                       sfail = 4;
+                       } else {
+                               /*
+                                * Non-TCP protocol state machines always go
+                                * forwards
+                                */
+                               if (st->src.state > sp->src.state)
+                                       sfail = 5;
+                               else if (st->dst.state > sp->dst.state)
+                                       sfail = 6;
+                       }
+                       if (sfail) {
+                               if (pf_status.debug >= PF_DEBUG_MISC)
+                                       printf("pfsync: %s stale update "
+                                           "(%d) id: %016llx "
+                                           "creatorid: %08x\n",
+                                           (sfail < 7 ?  "ignoring"
+                                            : "partial"), sfail,
+                                           betoh64(st->id),
+                                           ntohl(st->creatorid));
+                               pfsyncstats.pfsyncs_badstate++;
+
+                               if (!(sp->sync_flags & PFSTATE_STALE)) {
+                                       /* we have a better state, send it */
+                                       if (sc->sc_mbuf != NULL && !stale)
+                                               pfsync_sendout(sc);
+                                       stale++;
+                                       if (!st->sync_flags)
+                                               pfsync_pack_state(
+                                                   PFSYNC_ACT_UPD, st, flags);
+                               }
+                               continue;
+                       }
+                       pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
                        pf_state_peer_ntoh(&sp->src, &st->src);
                        pf_state_peer_ntoh(&sp->dst, &st->dst);
                        st->expire = ntohl(sp->expire) + time_second;
                        st->timeout = sp->timeout;
-
                }
-               crit_exit();
+               if (stale && sc->sc_mbuf != NULL)
+                       pfsync_sendout(sc);
+               splx(s);
                break;
        /*
         * It's not strictly necessary for us to support the "uncompressed"
@@ -441,7 +576,7 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
+               s = splsoftnet();
                for (i = 0, sp = (struct pfsync_state *)(mp->m_data + offp);
                    i < count; i++, sp++) {
                        bcopy(sp->id, &key.id, sizeof(key.id));
@@ -452,16 +587,10 @@ pfsync_input(struct mbuf *m, ...)
                                pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
-                       /*
-                        * XXX
-                        * pf_purge_expired_states() is expensive,
-                        * we really want to purge the state directly.
-                        */
-                       st->timeout = PFTM_PURGE;
                        st->sync_flags |= PFSTATE_FROMSYNC;
+                       pf_unlink_state(st);
                }
-               pf_purge_expired_states();
-               crit_exit();
+               splx(s);
                break;
        case PFSYNC_ACT_UPD_C: {
                int update_requested = 0;
@@ -472,7 +601,7 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
+               s = splsoftnet();
                for (i = 0, up = (struct pfsync_state_upd *)(mp->m_data + offp);
                    i < count; i++, up++) {
                        /* check for invalid values */
@@ -480,7 +609,7 @@ pfsync_input(struct mbuf *m, ...)
                            up->src.state > PF_TCPS_PROXY_DST ||
                            up->dst.state > PF_TCPS_PROXY_DST) {
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync_insert: "
+                                       printf("pfsync_insert: "
                                            "PFSYNC_ACT_UPD_C: "
                                            "invalid value\n");
                                pfsyncstats.pfsyncs_badstate++;
@@ -493,19 +622,74 @@ pfsync_input(struct mbuf *m, ...)
                        st = pf_find_state_byid(&key);
                        if (st == NULL) {
                                /* We don't have this state. Ask for it. */
-                               pfsync_request_update(up, &src);
+                               error = pfsync_request_update(up, &src);
+                               if (error == ENOMEM) {
+                                       splx(s);
+                                       goto done;
+                               }
                                update_requested = 1;
                                pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
+                       sfail = 0;
+                       if (st->proto == IPPROTO_TCP) {
+                               /*
+                                * The state should never go backwards except
+                                * for syn-proxy states.  Neither should the
+                                * sequence window slide backwards.
+                                */
+                               if (st->src.state > up->src.state &&
+                                   (st->src.state < PF_TCPS_PROXY_SRC ||
+                                   up->src.state >= PF_TCPS_PROXY_SRC))
+                                       sfail = 1;
+                               else if (st->dst.state > up->dst.state)
+                                       sfail = 2;
+                               else if (SEQ_GT(st->src.seqlo,
+                                   ntohl(up->src.seqlo)))
+                                       sfail = 3;
+                               else if (st->dst.state >= TCPS_SYN_SENT &&
+                                   SEQ_GT(st->dst.seqlo, ntohl(up->dst.seqlo)))
+                                       sfail = 4;
+                       } else {
+                               /*
+                                * Non-TCP protocol state machines always go
+                                * forwards
+                                */
+                               if (st->src.state > up->src.state)
+                                       sfail = 5;
+                               else if (st->dst.state > up->dst.state)
+                                       sfail = 6;
+                       }
+                       if (sfail) {
+                               if (pf_status.debug >= PF_DEBUG_MISC)
+                                       printf("pfsync: ignoring stale update "
+                                           "(%d) id: %016llx "
+                                           "creatorid: %08x\n", sfail,
+                                           betoh64(st->id),
+                                           ntohl(st->creatorid));
+                               pfsyncstats.pfsyncs_badstate++;
+
+                               /* we have a better state, send it out */
+                               if ((!stale || update_requested) &&
+                                   sc->sc_mbuf != NULL) {
+                                       pfsync_sendout(sc);
+                                       update_requested = 0;
+                               }
+                               stale++;
+                               if (!st->sync_flags)
+                                       pfsync_pack_state(PFSYNC_ACT_UPD, st,
+                                           PFSYNC_FLAG_STALE);
+                               continue;
+                       }
+                       pfsync_alloc_scrub_memory(&up->dst, &st->dst);
                        pf_state_peer_ntoh(&up->src, &st->src);
                        pf_state_peer_ntoh(&up->dst, &st->dst);
                        st->expire = ntohl(up->expire) + time_second;
                        st->timeout = up->timeout;
                }
-               if (update_requested)
+               if ((update_requested || stale) && sc->sc_mbuf)
                        pfsync_sendout(sc);
-               crit_exit();
+               splx(s);
                break;
        }
        case PFSYNC_ACT_DEL_C:
@@ -515,7 +699,7 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
+               s = splsoftnet();
                for (i = 0, dp = (struct pfsync_state_del *)(mp->m_data + offp);
                    i < count; i++, dp++) {
                        bcopy(dp->id, &key.id, sizeof(key.id));
@@ -526,16 +710,10 @@ pfsync_input(struct mbuf *m, ...)
                                pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
-                       /*
-                        * XXX
-                        * pf_purge_expired_states() is expensive,
-                        * we really want to purge the state directly.
-                        */
-                       st->timeout = PFTM_PURGE;
                        st->sync_flags |= PFSTATE_FROMSYNC;
+                       pf_unlink_state(st);
                }
-               pf_purge_expired_states();
-               crit_exit();
+               splx(s);
                break;
        case PFSYNC_ACT_INS_F:
        case PFSYNC_ACT_DEL_F:
@@ -548,8 +726,7 @@ pfsync_input(struct mbuf *m, ...)
                        return;
                }
 
-               crit_enter();
-               /* XXX send existing. pfsync_pack_state should handle this. */
+               s = splsoftnet();
                if (sc->sc_mbuf != NULL)
                        pfsync_sendout(sc);
                for (i = 0,
@@ -559,26 +736,30 @@ pfsync_input(struct mbuf *m, ...)
                        key.creatorid = rup->creatorid;
 
                        if (key.id == 0 && key.creatorid == 0) {
-                               sc->sc_ureq_received = mycpu->gd_time_seconds;
+                               sc->sc_ureq_received = time_uptime;
+                               if (sc->sc_bulk_send_next == NULL)
+                                       sc->sc_bulk_send_next =
+                                           TAILQ_FIRST(&state_list);
+                               sc->sc_bulk_terminator = sc->sc_bulk_send_next;
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync: received "
+                                       printf("pfsync: received "
                                            "bulk update request\n");
                                pfsync_send_bus(sc, PFSYNC_BUS_START);
-                               callout_reset(&sc->sc_bulk_tmo, 1 * hz,
-                                   pfsync_bulk_update,
-                                   LIST_FIRST(&pfsync_list));
+                               timeout_add(&sc->sc_bulk_tmo, 1 * hz);
                        } else {
                                st = pf_find_state_byid(&key);
                                if (st == NULL) {
                                        pfsyncstats.pfsyncs_badstate++;
                                        continue;
                                }
-                               pfsync_pack_state(PFSYNC_ACT_UPD, st, 0);
+                               if (!st->sync_flags)
+                                       pfsync_pack_state(PFSYNC_ACT_UPD,
+                                           st, 0);
                        }
                }
                if (sc->sc_mbuf != NULL)
                        pfsync_sendout(sc);
-               crit_exit();
+               splx(s);
                break;
        case PFSYNC_ACT_BUS:
                /* If we're not waiting for a bulk update, who cares. */
@@ -593,33 +774,48 @@ pfsync_input(struct mbuf *m, ...)
                bus = (struct pfsync_state_bus *)(mp->m_data + offp);
                switch (bus->status) {
                case PFSYNC_BUS_START:
-                       callout_reset(&sc->sc_bulkfail_tmo,
+                       timeout_add(&sc->sc_bulkfail_tmo,
                            pf_pool_limits[PF_LIMIT_STATES].limit /
-                           (PFSYNC_BULKPACKETS * sc->sc_maxcount), 
-                           pfsync_bulkfail, LIST_FIRST(&pfsync_list));
+                           (PFSYNC_BULKPACKETS * sc->sc_maxcount));
                        if (pf_status.debug >= PF_DEBUG_MISC)
-                               kprintf("pfsync: received bulk "
+                               printf("pfsync: received bulk "
                                    "update start\n");
                        break;
                case PFSYNC_BUS_END:
-                       if (mycpu->gd_time_seconds - ntohl(bus->endtime) >=
+                       if (time_uptime - ntohl(bus->endtime) >=
                            sc->sc_ureq_sent) {
                                /* that's it, we're happy */
                                sc->sc_ureq_sent = 0;
                                sc->sc_bulk_tries = 0;
-                               callout_stop(&sc->sc_bulkfail_tmo);
+                               timeout_del(&sc->sc_bulkfail_tmo);
+#if NCARP > 0
+                               if (!pfsync_sync_ok)
+                                       carp_group_demote_adj(&sc->sc_if, -1);
+#endif
                                pfsync_sync_ok = 1;
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync: received valid "
+                                       printf("pfsync: received valid "
                                            "bulk update end\n");
                        } else {
                                if (pf_status.debug >= PF_DEBUG_MISC)
-                                       kprintf("pfsync: received invalid "
+                                       printf("pfsync: received invalid "
                                            "bulk update end: bad timestamp\n");
                        }
                        break;
                }
                break;
+       case PFSYNC_ACT_TDB_UPD:
+               if ((mp = m_pulldown(m, iplen + sizeof(*ph),
+                   count * sizeof(*pt), &offp)) == NULL) {
+                       pfsyncstats.pfsyncs_badlen++;
+                       return;
+               }
+               s = splsoftnet();
+               for (i = 0, pt = (struct pfsync_tdb *)(mp->m_data + offp);
+                   i < count; i++, pt++)
+                       pfsync_update_net_tdb(pt);
+               splx(s);
+               break;
        }
 
 done:
@@ -637,14 +833,15 @@ pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 
 /* ARGSUSED */
 int
-pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
+pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
+       struct proc *p = curproc;
        struct pfsync_softc *sc = ifp->if_softc;
        struct ifreq *ifr = (struct ifreq *)data;
        struct ip_moptions *imo = &sc->sc_imo;
        struct pfsyncreq pfsyncr;
        struct ifnet    *sifp;
-       int error;
+       int s, error;
 
        switch (cmd) {
        case SIOCSIFADDR:
@@ -661,49 +858,59 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
                        return (EINVAL);
                if (ifr->ifr_mtu > MCLBYTES)
                        ifr->ifr_mtu = MCLBYTES;
-               crit_enter();
+               s = splnet();
                if (ifr->ifr_mtu < ifp->if_mtu)
                        pfsync_sendout(sc);
                pfsync_setmtu(sc, ifr->ifr_mtu);
-               crit_exit();
+               splx(s);
                break;
        case SIOCGETPFSYNC:
                bzero(&pfsyncr, sizeof(pfsyncr));
                if (sc->sc_sync_ifp)
-                       strlcpy(pfsyncr.pfsyncr_syncif,
+                       strlcpy(pfsyncr.pfsyncr_syncdev,
                            sc->sc_sync_ifp->if_xname, IFNAMSIZ);
+               pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
                pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
                if ((error = copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))))
                        return (error);
                break;
        case SIOCSETPFSYNC:
-               if ((error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY)) != 0)
+               if ((error = suser(p, p->p_acflag)) != 0)
                        return (error);
                if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
                        return (error);
 
+               if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
+                       sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
+               else
+                       sc->sc_sync_peer.s_addr =
+                           pfsyncr.pfsyncr_syncpeer.s_addr;
+
                if (pfsyncr.pfsyncr_maxupdates > 255)
                        return (EINVAL);
                sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
 
-               if (pfsyncr.pfsyncr_syncif[0] == 0) {
+               if (pfsyncr.pfsyncr_syncdev[0] == 0) {
                        sc->sc_sync_ifp = NULL;
                        if (sc->sc_mbuf_net != NULL) {
                                /* Don't keep stale pfsync packets around. */
-                               crit_enter();
+                               s = splnet();
                                m_freem(sc->sc_mbuf_net);
                                sc->sc_mbuf_net = NULL;
                                sc->sc_statep_net.s = NULL;
-                               crit_exit();
+                               splx(s);
+                       }
+                       if (imo->imo_num_memberships > 0) {
+                               in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
+                               imo->imo_multicast_ifp = NULL;
                        }
                        break;
                }
-               if ((sifp = ifunit(pfsyncr.pfsyncr_syncif)) == NULL)
+
+               if ((sifp = ifunit(pfsyncr.pfsyncr_syncdev)) == NULL)
                        return (EINVAL);
-               else if (sifp == sc->sc_sync_ifp)
-                       break;
 
-               crit_enter();
+               s = splnet();
                if (sifp->if_mtu < sc->sc_if.if_mtu ||
                    (sc->sc_sync_ifp != NULL &&
                    sifp->if_mtu < sc->sc_sync_ifp->if_mtu) ||
@@ -718,32 +925,50 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data, struct ucred *cr)
                        imo->imo_multicast_ifp = NULL;
                }
 
-               if (sc->sc_sync_ifp) {
+               if (sc->sc_sync_ifp &&
+                   sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
                        struct in_addr addr;
 
-                       addr.s_addr = htonl(INADDR_PFSYNC_GROUP);
-                       /* XXX do we only use one group? Also see above */
+                       if (!(sc->sc_sync_ifp->if_flags & IFF_MULTICAST)) {
+                               sc->sc_sync_ifp = NULL;
+                               splx(s);
+                               return (EADDRNOTAVAIL);
+                       }
+
+                       addr.s_addr = INADDR_PFSYNC_GROUP;
+
                        if ((imo->imo_membership[0] =
                            in_addmulti(&addr, sc->sc_sync_ifp)) == NULL) {
-                               crit_exit();
+                               sc->sc_sync_ifp = NULL;
+                               splx(s);
                                return (ENOBUFS);
                        }
                        imo->imo_num_memberships++;
                        imo->imo_multicast_ifp = sc->sc_sync_ifp;
                        imo->imo_multicast_ttl = PFSYNC_DFLTTL;
                        imo->imo_multicast_loop = 0;
+               }
 
+               if (sc->sc_sync_ifp ||
+                   sc->sc_sendaddr.s_addr != INADDR_PFSYNC_GROUP) {
                        /* Request a full state table update. */
-                       sc->sc_ureq_sent = mycpu->gd_time_seconds;
+                       sc->sc_ureq_sent = time_uptime;
+#if NCARP > 0
+                       if (pfsync_sync_ok)
+                               carp_group_demote_adj(&sc->sc_if, 1);
+#endif
                        pfsync_sync_ok = 0;
                        if (pf_status.debug >= PF_DEBUG_MISC)
-                               kprintf("pfsync: requesting bulk update\n");
-                       callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
-                           pfsync_bulkfail, LIST_FIRST(&pfsync_list));
-                       pfsync_request_update(NULL, NULL);
+                               printf("pfsync: requesting bulk update\n");
+                       timeout_add(&sc->sc_bulkfail_tmo, 5 * hz);
+                       error = pfsync_request_update(NULL, NULL);
+                       if (error == ENOMEM) {
+                               splx(s);
+                               return (ENOMEM);
+                       }
                        pfsync_sendout(sc);
                }
-               crit_exit();
+               splx(s);
 
                break;
 
@@ -779,7 +1004,7 @@ pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp)
        struct mbuf *m;
        int len;
 
-       MGETHDR(m, MB_DONTWAIT, MT_DATA);
+       MGETHDR(m, M_DONTWAIT, MT_DATA);
        if (m == NULL) {
                sc->sc_if.if_oerrors++;
                return (NULL);
@@ -806,6 +1031,10 @@ pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp)
                len = sizeof(struct pfsync_header) +
                    sizeof(struct pfsync_state_bus);
                break;
+       case PFSYNC_ACT_TDB_UPD:
+               len = (sc->sc_maxcount * sizeof(struct pfsync_tdb)) +
+                   sizeof(struct pfsync_header);
+               break;
        default:
                len = (sc->sc_maxcount * sizeof(struct pfsync_state)) +
                    sizeof(struct pfsync_header);
@@ -813,7 +1042,7 @@ pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp)
        }
 
        if (len > MHLEN) {
-               MCLGET(m, MB_DONTWAIT);
+               MCLGET(m, M_DONTWAIT);
                if ((m->m_flags & M_EXT) == 0) {
                        m_free(m);
                        sc->sc_if.if_oerrors++;
@@ -830,32 +1059,42 @@ pfsync_get_mbuf(struct pfsync_softc *sc, u_int8_t action, void **sp)
        h->af = 0;
        h->count = 0;
        h->action = action;
+       if (action != PFSYNC_ACT_TDB_UPD)
+               bcopy(&pf_status.pf_chksum, &h->pf_chksum,
+                   PF_MD5_DIGEST_LENGTH);
 
        *sp = (void *)((char *)h + PFSYNC_HDRLEN);
-       callout_reset(&sc->sc_tmo, hz, pfsync_timeout,
-           LIST_FIRST(&pfsync_list));
+       if (action == PFSYNC_ACT_TDB_UPD)
+               timeout_add(&sc->sc_tdb_tmo, hz);
+       else
+               timeout_add(&sc->sc_tmo, hz);
        return (m);
 }
 
 int
-pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
+pfsync_pack_state(u_int8_t action, struct pf_state *st, int flags)
 {
-       struct ifnet *ifp = &(LIST_FIRST(&pfsync_list))->sc_if;
-       struct pfsync_softc *sc = ifp->if_softc;
+       struct ifnet *ifp = NULL;
+       struct pfsync_softc *sc = pfsyncif;
        struct pfsync_header *h, *h_net;
        struct pfsync_state *sp = NULL;
        struct pfsync_state_upd *up = NULL;
        struct pfsync_state_del *dp = NULL;
        struct pf_rule *r;
        u_long secs;
-       int ret = 0;
+       int s, ret = 0;
        u_int8_t i = 255, newaction = 0;
 
+       if (sc == NULL)
+               return (0);
+       ifp = &sc->sc_if;
+
        /*
         * If a packet falls in the forest and there's nobody around to
         * hear, does it make a sound?
         */
-       if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL) {
+       if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL &&
+           sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
                /* Don't leave any stale pfsync packets hanging around. */
                if (sc->sc_mbuf != NULL) {
                        m_freem(sc->sc_mbuf);
@@ -868,11 +1107,11 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
        if (action >= PFSYNC_ACT_MAX)
                return (EINVAL);
 
-       crit_enter();
+       s = splnet();
        if (sc->sc_mbuf == NULL) {
                if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action,
                    (void *)&sc->sc_statep.s)) == NULL) {
-                       crit_exit();
+                       splx(s);
                        return (ENOMEM);
                }
                h = mtod(sc->sc_mbuf, struct pfsync_header *);
@@ -882,7 +1121,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
                        pfsync_sendout(sc);
                        if ((sc->sc_mbuf = pfsync_get_mbuf(sc, action,
                            (void *)&sc->sc_statep.s)) == NULL) {
-                               crit_exit();
+                               splx(s);
                                return (ENOMEM);
                        }
                        h = mtod(sc->sc_mbuf, struct pfsync_header *);
@@ -911,9 +1150,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
 
        secs = time_second;
 
-       st->pfsync_time = mycpu->gd_time_seconds;
-       TAILQ_REMOVE(&state_updates, st, u.s.entry_updates);
-       TAILQ_INSERT_TAIL(&state_updates, st, u.s.entry_updates);
+       st->pfsync_time = time_uptime;
 
        if (sp == NULL) {
                /* not a "duplicate" update */
@@ -935,10 +1172,10 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
                bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
 
                sp->creation = htonl(secs - st->creation);
-               sp->packets[0] = htonl(st->packets[0]);
-               sp->packets[1] = htonl(st->packets[1]);
-               sp->bytes[0] = htonl(st->bytes[0]);
-               sp->bytes[1] = htonl(st->bytes[1]);
+               pf_state_counter_hton(st->packets[0], sp->packets[0]);
+               pf_state_counter_hton(st->packets[1], sp->packets[1]);
+               pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
+               pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
                if ((r = st->rule.ptr) == NULL)
                        sp->rule = htonl(-1);
                else
@@ -954,7 +1191,8 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
                sp->allow_opts = st->allow_opts;
                sp->timeout = st->timeout;
 
-               sp->sync_flags = st->sync_flags & PFSTATE_NOSYNC;
+               if (flags & PFSYNC_FLAG_STALE)
+                       sp->sync_flags |= PFSTATE_STALE;
        }
 
        pf_state_peer_hton(&st->src, &sp->src);
@@ -966,7 +1204,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
                sp->expire = htonl(st->expire - secs);
 
        /* do we need to build "compressed" actions for network transfer? */
-       if (sc->sc_sync_ifp && compress) {
+       if (sc->sc_sync_ifp && flags & PFSYNC_FLAG_COMPRESS) {
                switch (action) {
                case PFSYNC_ACT_UPD:
                        newaction = PFSYNC_ACT_UPD_C;
@@ -984,7 +1222,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
                if (sc->sc_mbuf_net == NULL) {
                        if ((sc->sc_mbuf_net = pfsync_get_mbuf(sc, newaction,
                            (void *)&sc->sc_statep_net.s)) == NULL) {
-                               crit_exit();
+                               splx(s);
                                return (ENOMEM);
                        }
                }
@@ -1028,7 +1266,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
            (sc->sc_maxupdates && (sp->updates >= sc->sc_maxupdates)))
                ret = pfsync_sendout(sc);
 
-       crit_exit();
+       splx(s);
        return (ret);
 }
 
@@ -1036,26 +1274,28 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int compress)
 int
 pfsync_request_update(struct pfsync_state_upd *up, struct in_addr *src)
 {
-       struct ifnet *ifp = &(LIST_FIRST(&pfsync_list))->sc_if;
+       struct ifnet *ifp = NULL;
        struct pfsync_header *h;
-       struct pfsync_softc *sc = ifp->if_softc;
+       struct pfsync_softc *sc = pfsyncif;
        struct pfsync_state_upd_req *rup;
        int ret = 0;
 
+       if (sc == NULL)
+               return (0);
+
+       ifp = &sc->sc_if;
        if (sc->sc_mbuf == NULL) {
                if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ,
-                   (void *)&sc->sc_statep.s)) == NULL) {
+                   (void *)&sc->sc_statep.s)) == NULL)
                        return (ENOMEM);
-               }
                h = mtod(sc->sc_mbuf, struct pfsync_header *);
        } else {
                h = mtod(sc->sc_mbuf, struct pfsync_header *);
                if (h->action != PFSYNC_ACT_UREQ) {
                        pfsync_sendout(sc);
                        if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_UREQ,
-                           (void *)&sc->sc_statep.s)) == NULL) {
+                           (void *)&sc->sc_statep.s)) == NULL)
                                return (ENOMEM);
-                       }
                        h = mtod(sc->sc_mbuf, struct pfsync_header *);
                }
        }
@@ -1080,17 +1320,21 @@ pfsync_request_update(struct pfsync_state_upd *up, struct in_addr *src)
 int
 pfsync_clear_states(u_int32_t creatorid, char *ifname)
 {
-       struct ifnet *ifp = &(LIST_FIRST(&pfsync_list))->sc_if;
-       struct pfsync_softc *sc = ifp->if_softc;
+       struct ifnet *ifp = NULL;
+       struct pfsync_softc *sc = pfsyncif;
        struct pfsync_state_clr *cp;
-       int ret;
+       int s, ret;
 
-       crit_enter();
+       if (sc == NULL)
+               return (0);
+
+       ifp = &sc->sc_if;
+       s = splnet();
        if (sc->sc_mbuf != NULL)
                pfsync_sendout(sc);
        if ((sc->sc_mbuf = pfsync_get_mbuf(sc, PFSYNC_ACT_CLR,
            (void *)&sc->sc_statep.c)) == NULL) {
-               crit_exit();
+               splx(s);
                return (ENOMEM);
        }
        sc->sc_mbuf->m_pkthdr.len = sc->sc_mbuf->m_len += sizeof(*cp);
@@ -1100,7 +1344,7 @@ pfsync_clear_states(u_int32_t creatorid, char *ifname)
                strlcpy(cp->ifname, ifname, IFNAMSIZ);
 
        ret = (pfsync_sendout(sc));
-       crit_exit();
+       splx(s);
        return (ret);
 }
 
@@ -1108,13 +1352,26 @@ void
 pfsync_timeout(void *v)
 {
        struct pfsync_softc *sc = v;
+       int s;
 
-       crit_enter();
+       s = splnet();
        pfsync_sendout(sc);
-       crit_exit();
+       splx(s);
 }
 
 void
+pfsync_tdb_timeout(void *v)
+{
+       struct pfsync_softc *sc = v;
+       int s;
+
+       s = splnet();
+       pfsync_tdb_sendout(sc);
+       splx(s);
+}
+
+/* This must be called in splnet() */
+void
 pfsync_send_bus(struct pfsync_softc *sc, u_int8_t status)
 {
        struct pfsync_state_bus *bus;
@@ -1129,7 +1386,7 @@ pfsync_send_bus(struct pfsync_softc *sc, u_int8_t status)
                bus = sc->sc_statep.b;
                bus->creatorid = pf_status.hostid;
                bus->status = status;
-               bus->endtime = htonl(mycpu->gd_time_seconds - sc->sc_ureq_received);
+               bus->endtime = htonl(time_uptime - sc->sc_ureq_received);
                pfsync_sendout(sc);
        }
 }
@@ -1138,10 +1395,10 @@ void
 pfsync_bulk_update(void *v)
 {
        struct pfsync_softc *sc = v;
-       int i = 0;
+       int s, i = 0;
        struct pf_state *state;
 
-       crit_enter();
+       s = splnet();
        if (sc->sc_mbuf != NULL)
                pfsync_sendout(sc);
 
@@ -1149,65 +1406,89 @@ pfsync_bulk_update(void *v)
         * Grab at most PFSYNC_BULKPACKETS worth of states which have not
         * been sent since the latest request was made.
         */
-       while ((state = TAILQ_FIRST(&state_updates)) != NULL &&
-           ++i < (sc->sc_maxcount * PFSYNC_BULKPACKETS)) {
-               if (state->pfsync_time > sc->sc_ureq_received) {
-                       /* we're done */
-                       pfsync_send_bus(sc, PFSYNC_BUS_END);
-                       sc->sc_ureq_received = 0;
-                       callout_stop(&sc->sc_bulk_tmo);
-                       if (pf_status.debug >= PF_DEBUG_MISC)
-                               kprintf("pfsync: bulk update complete\n");
-                       break;
-               } else {
-                       /* send an update and move to end of list */
-                       if (!state->sync_flags)
+       state = sc->sc_bulk_send_next;
+       if (state)
+               do {
+                       /* send state update if syncable and not already sent */
+                       if (!state->sync_flags
+                           && state->timeout < PFTM_MAX
+                           && state->pfsync_time <= sc->sc_ureq_received) {
                                pfsync_pack_state(PFSYNC_ACT_UPD, state, 0);
-                       state->pfsync_time = mycpu->gd_time_seconds;
-                       TAILQ_REMOVE(&state_updates, state, u.s.entry_updates);
-                       TAILQ_INSERT_TAIL(&state_updates, state,
-                           u.s.entry_updates);
-
-                       /* look again for more in a bit */
-                       callout_reset(&sc->sc_bulk_tmo, 1, pfsync_timeout,
-                           LIST_FIRST(&pfsync_list));
-               }
+                               i++;
+                       }
+
+                       /* figure next state to send */
+                       state = TAILQ_NEXT(state, u.s.entry_list);
+
+                       /* wrap to start of list if we hit the end */
+                       if (!state)
+                               state = TAILQ_FIRST(&state_list);
+               } while (i < sc->sc_maxcount * PFSYNC_BULKPACKETS &&
+                   state != sc->sc_bulk_terminator);
+
+       if (!state || state == sc->sc_bulk_terminator) {
+               /* we're done */
+               pfsync_send_bus(sc, PFSYNC_BUS_END);
+               sc->sc_ureq_received = 0;
+               sc->sc_bulk_send_next = NULL;
+               sc->sc_bulk_terminator = NULL;
+               timeout_del(&sc->sc_bulk_tmo);
+               if (pf_status.debug >= PF_DEBUG_MISC)
+                       printf("pfsync: bulk update complete\n");
+       } else {
+               /* look again for more in a bit */
+               timeout_add(&sc->sc_bulk_tmo, 1);
+               sc->sc_bulk_send_next = state;
        }
        if (sc->sc_mbuf != NULL)
                pfsync_sendout(sc);
-       crit_exit();
+       splx(s);
 }
 
 void
 pfsync_bulkfail(void *v)
 {
        struct pfsync_softc *sc = v;
+       int s, error;
 
        if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
                /* Try again in a bit */
-               callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulkfail,
-                   LIST_FIRST(&pfsync_list));
-               pfsync_request_update(NULL, NULL);
-               pfsync_sendout(sc);
+               timeout_add(&sc->sc_bulkfail_tmo, 5 * hz);
+               s = splnet();
+               error = pfsync_request_update(NULL, NULL);
+               if (error == ENOMEM) {
+                       if (pf_status.debug >= PF_DEBUG_MISC)
+                               printf("pfsync: cannot allocate mbufs for "
+                                   "bulk update\n");
+               } else
+                       pfsync_sendout(sc);
+               splx(s);
        } else {
                /* Pretend like the transfer was ok */
                sc->sc_ureq_sent = 0;
                sc->sc_bulk_tries = 0;
+#if NCARP > 0
+               if (!pfsync_sync_ok)
+                       carp_group_demote_adj(&sc->sc_if, -1);
+#endif
                pfsync_sync_ok = 1;
                if (pf_status.debug >= PF_DEBUG_MISC)
-                       kprintf("pfsync: failed to receive "
+                       printf("pfsync: failed to receive "
                            "bulk update status\n");
-               callout_stop(&sc->sc_bulkfail_tmo);
+               timeout_del(&sc->sc_bulkfail_tmo);
        }
 }
 
+/* This must be called in splnet() */
 int
 pfsync_sendout(struct pfsync_softc *sc)
 {
+#if NBPFILTER > 0
        struct ifnet *ifp = &sc->sc_if;
+#endif
        struct mbuf *m;
 
-       callout_stop(&sc->sc_tmo);
+       timeout_del(&sc->sc_tmo);
 
        if (sc->sc_mbuf == NULL)
                return (0);
@@ -1215,8 +1496,10 @@ pfsync_sendout(struct pfsync_softc *sc)
        sc->sc_mbuf = NULL;
        sc->sc_statep.s = NULL;
 
-       KASSERT(m != NULL, ("pfsync_sendout: null mbuf"));
-       BPF_MTAP(ifp, m);
+#if NBPFILTER > 0
+       if (ifp->if_bpf)
+               bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
+#endif
 
        if (sc->sc_mbuf_net) {
                m_freem(m);
@@ -1225,12 +1508,42 @@ pfsync_sendout(struct pfsync_softc *sc)
                sc->sc_statep_net.s = NULL;
        }
 
-       if (sc->sc_sync_ifp) {
-               struct ip *ip;
-               struct ifaddr *ifa;
-               struct sockaddr sa;
+       return pfsync_sendout_mbuf(sc, m);
+}
+
+int
+pfsync_tdb_sendout(struct pfsync_softc *sc)
+{
+#if NBPFILTER > 0
+       struct ifnet *ifp = &sc->sc_if;
+#endif
+       struct mbuf *m;
+
+       timeout_del(&sc->sc_tdb_tmo);
+
+       if (sc->sc_mbuf_tdb == NULL)
+               return (0);
+       m = sc->sc_mbuf_tdb;
+       sc->sc_mbuf_tdb = NULL;
+       sc->sc_statep_tdb.t = NULL;
+
+#if NBPFILTER > 0
+       if (ifp->if_bpf)
+               bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
+#endif
+
+       return pfsync_sendout_mbuf(sc, m);
+}
 
-               M_PREPEND(m, sizeof(struct ip), MB_DONTWAIT);
+int
+pfsync_sendout_mbuf(struct pfsync_softc *sc, struct mbuf *m)
+{
+       struct sockaddr sa;
+       struct ip *ip;
+
+       if (sc->sc_sync_ifp ||
+           sc->sc_sync_peer.s_addr != INADDR_PFSYNC_GROUP) {
+               M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
                if (m == NULL) {
                        pfsyncstats.pfsyncs_onomem++;
                        return (0);
@@ -1239,28 +1552,20 @@ pfsync_sendout(struct pfsync_softc *sc)
                ip->ip_v = IPVERSION;
                ip->ip_hl = sizeof(*ip) >> 2;
                ip->ip_tos = IPTOS_LOWDELAY;
-               ip->ip_len = m->m_pkthdr.len;
-#ifdef RANDOM_IP_ID
-               ip->ip_id = ip_randomid();
-#else
-               ip->ip_id = ntohs(ip_id++);
-#endif
-               ip->ip_off = IP_DF;
+               ip->ip_len = htons(m->m_pkthdr.len);
+               ip->ip_id = htons(ip_randomid());
+               ip->ip_off = htons(IP_DF);
                ip->ip_ttl = PFSYNC_DFLTTL;
                ip->ip_p = IPPROTO_PFSYNC;
                ip->ip_sum = 0;
 
                bzero(&sa, sizeof(sa));
-               sa.sa_family = AF_INET;
-               ifa = ifaof_ifpforaddr(&sa, sc->sc_sync_ifp);
-               if (ifa == NULL)
-                       return (0);
-               ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
+               ip->ip_src.s_addr = INADDR_ANY;
 
-               if (sc->sc_sendaddr.s_addr == htonl(INADDR_PFSYNC_GROUP))
+               if (sc->sc_sendaddr.s_addr == INADDR_PFSYNC_GROUP)
                        m->m_flags |= M_MCAST;
                ip->ip_dst = sc->sc_sendaddr;
-               sc->sc_sendaddr.s_addr = htonl(INADDR_PFSYNC_GROUP);
+               sc->sc_sendaddr.s_addr = sc->sc_sync_peer.s_addr;
 
                pfsyncstats.pfsyncs_opackets++;
 
@@ -1272,39 +1577,153 @@ pfsync_sendout(struct pfsync_softc *sc)
        return (0);
 }
 
-static int
-pfsync_modevent(module_t mod, int type, void *data)
+/* Update an in-kernel tdb. Silently fail if no tdb is found. */
+void
+pfsync_update_net_tdb(struct pfsync_tdb *pt)
 {
-       int error = 0;
+       struct tdb              *tdb;
+       int                      s;
+
+       /* check for invalid values */
+       if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+           (pt->dst.sa.sa_family != AF_INET &&
+            pt->dst.sa.sa_family != AF_INET6))
+               goto bad;
+
+       s = spltdb();
+       tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
+       if (tdb) {
+               pt->rpl = ntohl(pt->rpl);
+               pt->cur_bytes = betoh64(pt->cur_bytes);
+
+               /* Neither replay nor byte counter should ever decrease. */
+               if (pt->rpl < tdb->tdb_rpl ||
+                   pt->cur_bytes < tdb->tdb_cur_bytes) {
+                       splx(s);
+                       goto bad;
+               }
 
-       switch (type) {
-       case MOD_LOAD:
-               LIST_INIT(&pfsync_list);
-               if_clone_attach(&pfsync_cloner);
-               break;
+               tdb->tdb_rpl = pt->rpl;
+               tdb->tdb_cur_bytes = pt->cur_bytes;
+       }
+       splx(s);
+       return;
+
+ bad:
+       if (pf_status.debug >= PF_DEBUG_MISC)
+               printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+                   "invalid value\n");
+       pfsyncstats.pfsyncs_badstate++;
+       return;
+}
 
-       case MOD_UNLOAD:
-               if_clone_detach(&pfsync_cloner);
-               while (!LIST_EMPTY(&pfsync_list))
-                       pfsync_clone_destroy(
-                               &LIST_FIRST(&pfsync_list)->sc_if);
-               break;
+/* One of our local tdbs has been updated; need to sync rpl with others */
+int
+pfsync_update_tdb(struct tdb *tdb, int output)
+{
+       struct ifnet *ifp = NULL;
+       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_header *h;
+       struct pfsync_tdb *pt = NULL;
+       int s, i, ret = 0;      /* stays 0 unless a flush below sets it */
 
-       default:
-               error = EINVAL;
-               break;
+       if (sc == NULL)
+               return (0);
+
+       ifp = &sc->sc_if;
+       if (ifp->if_bpf == NULL && sc->sc_sync_ifp == NULL &&
+           sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
+               /* Don't leave any stale pfsync packets hanging around. */
+               if (sc->sc_mbuf_tdb != NULL) {
+                       m_freem(sc->sc_mbuf_tdb);
+                       sc->sc_mbuf_tdb = NULL;
+                       sc->sc_statep_tdb.t = NULL;
+               }
+               return (0);
        }
 
-       return error;
-}
+       s = splnet();
+       if (sc->sc_mbuf_tdb == NULL) {
+               if ((sc->sc_mbuf_tdb = pfsync_get_mbuf(sc, PFSYNC_ACT_TDB_UPD,
+                   (void *)&sc->sc_statep_tdb.t)) == NULL) {
+                       splx(s);
+                       return (ENOMEM);
+               }
+               h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+       } else {
+               h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+               if (h->action != PFSYNC_ACT_TDB_UPD) {
+                       /*
+                        * XXX will never happen as long as there's
+                        * only one "TDB action".
+                        */
+                       pfsync_tdb_sendout(sc);
+                       sc->sc_mbuf_tdb = pfsync_get_mbuf(sc,
+                           PFSYNC_ACT_TDB_UPD, (void *)&sc->sc_statep_tdb.t);
+                       if (sc->sc_mbuf_tdb == NULL) {
+                               splx(s);
+                               return (ENOMEM);
+                       }
+                       h = mtod(sc->sc_mbuf_tdb, struct pfsync_header *);
+               } else if (sc->sc_maxupdates) {
+                       /*
+                        * If it's an update, look in the packet to see if
+                        * we already have an update for the state.
+                        */
+                       struct pfsync_tdb *u =
+                           (void *)((char *)h + PFSYNC_HDRLEN);
+
+                       for (i = 0; !pt && i < h->count; i++) {
+                               if (tdb->tdb_spi == u->spi &&
+                                   tdb->tdb_sproto == u->sproto &&
+                                   !bcmp(&tdb->tdb_dst, &u->dst,
+                                   SA_LEN(&u->dst.sa))) {
+                                       pt = u;
+                                       pt->updates++;
+                               }
+                               u++;
+                       }
+               }
+       }
 
-static moduledata_t pfsync_mod = {
-       "pfsync",
-       pfsync_modevent,
-       0
-};
+       if (pt == NULL) {
+               /* not a "duplicate" update */
+               pt = sc->sc_statep_tdb.t++;
+               sc->sc_mbuf_tdb->m_pkthdr.len =
+                   sc->sc_mbuf_tdb->m_len += sizeof(struct pfsync_tdb);
+               h->count++;
+               bzero(pt, sizeof(*pt));
 
-#define PFSYNC_MODVER 1
+               pt->spi = tdb->tdb_spi;
+               memcpy(&pt->dst, &tdb->tdb_dst, sizeof pt->dst);
+               pt->sproto = tdb->tdb_sproto;
+       }
 
-DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
-MODULE_VERSION(pfsync, PFSYNC_MODVER);
+       /*
+        * When a failover happens, the master's rpl is probably above
+        * what we see here (we may be up to a second late), so
+        * increase it a bit for outbound tdbs to manage most such
+        * situations.
+        *
+        * For now, just add an offset that is likely to be larger
+        * than the number of packets we can see in one second. The RFC
+        * just says the next packet must have a higher seq value.
+        *
+        * XXX What is a good algorithm for this? We could use
+        * a rate-determined increase, but to know it, we would have
+        * to extend struct tdb.
+        * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
+        * will soon be replaced anyway. For now, just don't handle
+        * this edge case.
+        */
+#define RPL_INCR 16384
+       pt->rpl = htonl(tdb->tdb_rpl + (output ? RPL_INCR : 0));
+       pt->cur_bytes = htobe64(tdb->tdb_cur_bytes);
+
+       if (h->count == sc->sc_maxcount ||
+           (sc->sc_maxupdates && (pt->updates >= sc->sc_maxupdates)))
+               ret = pfsync_tdb_sendout(sc);
+
+       splx(s);
+       return (ret);
+}
index c9b86a6..5ed465e 100644 (file)
@@ -1,10 +1,6 @@
-/*     $FreeBSD: src/sys/contrib/pf/net/if_pfsync.h,v 1.4 2004/06/16 23:24:00 mlaier Exp $     */
-/*     $OpenBSD: if_pfsync.h,v 1.13 2004/03/22 04:54:17 mcbride Exp $  */
-/*     $DragonFly: src/sys/net/pf/if_pfsync.h,v 1.2 2004/09/20 01:43:13 dillon Exp $ */
+/*     $OpenBSD: if_pfsync.h,v 1.30 2006/10/31 14:49:01 henning Exp $  */
 
 /*
- * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
- *
  * Copyright (c) 2001 Michael Shalayeff
  * All rights reserved.
  *
 #ifndef _NET_IF_PFSYNC_H_
 #define _NET_IF_PFSYNC_H_
 
-#include <sys/ioccom.h>
-
-/*
- * pfvar.h is required to get struct pf_addr.  Also kdump and other utilities
- * blindly include header files to try to get all the ioctl constants and
- * buildworld will fail without this.  We need a better way XXX
- */
-#ifndef _NET_PFVAR_H_
-#include "pfvar.h"
-#endif
-
 
 #define PFSYNC_ID_LEN  sizeof(u_int64_t)
 
 struct pfsync_state_scrub {
        u_int16_t       pfss_flags;
        u_int8_t        pfss_ttl;       /* stashed TTL          */
+#define PFSYNC_SCRUB_FLAG_VALID        0x01
        u_int8_t        scrub_flag;
        u_int32_t       pfss_ts_mod;    /* timestamp modulation */
 } __packed;
@@ -69,8 +55,7 @@ struct pfsync_state_peer {
        u_int16_t       mss;            /* Maximum segment size option  */
        u_int8_t        state;          /* active state level           */
        u_int8_t        wscale;         /* window scaling factor        */
-       u_int8_t        scrub_flag;
-       u_int8_t        pad[5];
+       u_int8_t        pad[6];
 } __packed;
 
 struct pfsync_state {
@@ -87,8 +72,8 @@ struct pfsync_state {
        u_int32_t        nat_rule;
        u_int32_t        creation;
        u_int32_t        expire;
-       u_int32_t        packets[2];
-       u_int32_t        bytes[2];
+       u_int32_t        packets[2][2];
+       u_int32_t        bytes[2][2];
        u_int32_t        creatorid;
        sa_family_t      af;
        u_int8_t         proto;
@@ -100,6 +85,19 @@ struct pfsync_state {
        u_int8_t         updates;
 } __packed;
 
+#define PFSYNC_FLAG_COMPRESS   0x01
+#define PFSYNC_FLAG_STALE      0x02
+
+struct pfsync_tdb {
+       u_int32_t       spi;
+       union sockaddr_union dst;
+       u_int32_t       rpl;
+       u_int64_t       cur_bytes;
+       u_int8_t        sproto;
+       u_int8_t        updates;
+       u_int8_t        pad[2];
+} __packed;
+
 struct pfsync_state_upd {
        u_int32_t               id[2];
        struct pfsync_state_peer        src;
@@ -155,6 +153,10 @@ union sc_statep {
        struct pfsync_state_upd_req     *r;
 };
 
+union sc_tdb_statep {
+       struct pfsync_tdb       *t;
+};
+
 extern int     pfsync_sync_ok;
 
 struct pfsync_softc {
@@ -162,27 +164,34 @@ struct pfsync_softc {
        struct ifnet            *sc_sync_ifp;
 
        struct ip_moptions       sc_imo;
-       struct callout           sc_tmo;
-       struct callout           sc_bulk_tmo;
-       struct callout           sc_bulkfail_tmo;
+       struct timeout           sc_tmo;
+       struct timeout           sc_tdb_tmo;
+       struct timeout           sc_bulk_tmo;
+       struct timeout           sc_bulkfail_tmo;
+       struct in_addr           sc_sync_peer;
        struct in_addr           sc_sendaddr;
-       struct mbuf             *sc_mbuf;       /* current cummulative mbuf */
-       struct mbuf             *sc_mbuf_net;   /* current cummulative mbuf */
+       struct mbuf             *sc_mbuf;       /* current cumulative mbuf */
+       struct mbuf             *sc_mbuf_net;   /* current cumulative mbuf */
+       struct mbuf             *sc_mbuf_tdb;   /* ditto for TDB updates */
        union sc_statep          sc_statep;
        union sc_statep          sc_statep_net;
+       union sc_tdb_statep      sc_statep_tdb;
        u_int32_t                sc_ureq_received;
        u_int32_t                sc_ureq_sent;
+       struct pf_state         *sc_bulk_send_next;
+       struct pf_state         *sc_bulk_terminator;
        int                      sc_bulk_tries;
        int                      sc_maxcount;   /* number of states in mtu */
        int                      sc_maxupdates; /* number of updates/state */
-       LIST_ENTRY(pfsync_softc) sc_next;
 };
+
+extern struct pfsync_softc     *pfsyncif;
 #endif
 
 
 struct pfsync_header {
        u_int8_t version;
-#define        PFSYNC_VERSION  2
+#define        PFSYNC_VERSION  3
        u_int8_t af;
        u_int8_t action;
 #define        PFSYNC_ACT_CLR          0       /* clear all states */
@@ -195,48 +204,51 @@ struct pfsync_header {
 #define        PFSYNC_ACT_DEL_F        7       /* delete fragments */
 #define        PFSYNC_ACT_UREQ         8       /* request "uncompressed" state */
 #define PFSYNC_ACT_BUS         9       /* Bulk Update Status */
-#define        PFSYNC_ACT_MAX          10
+#define PFSYNC_ACT_TDB_UPD     10      /* TDB replay counter update */
+#define        PFSYNC_ACT_MAX          11
        u_int8_t count;
+       u_int8_t pf_chksum[PF_MD5_DIGEST_LENGTH];
 } __packed;
 
 #define PFSYNC_BULKPACKETS     1       /* # of packets per timeout */
-#define PFSYNC_MAX_BULKTRIES   12      
+#define PFSYNC_MAX_BULKTRIES   12
 #define PFSYNC_HDRLEN  sizeof(struct pfsync_header)
 #define        PFSYNC_ACTIONS \
        "CLR ST", "INS ST", "UPD ST", "DEL ST", \
        "UPD ST COMP", "DEL ST COMP", "INS FR", "DEL FR", \
-       "UPD REQ", "BLK UPD STAT"
+       "UPD REQ", "BLK UPD STAT", "TDB UPD"
 
 #define PFSYNC_DFLTTL          255
 
 struct pfsyncstats {
-       u_long  pfsyncs_ipackets;       /* total input packets, IPv4 */
-       u_long  pfsyncs_ipackets6;      /* total input packets, IPv6 */
-       u_long  pfsyncs_badif;          /* not the right interface */
-       u_long  pfsyncs_badttl;         /* TTL is not PFSYNC_DFLTTL */
-       u_long  pfsyncs_hdrops;         /* packets shorter than header */
-       u_long  pfsyncs_badver;         /* bad (incl unsupp) version */
-       u_long  pfsyncs_badact;         /* bad action */
-       u_long  pfsyncs_badlen;         /* data length does not match */
-       u_long  pfsyncs_badauth;        /* bad authentication */
-       u_long  pfsyncs_badstate;       /* insert/lookup failed */
-
-       u_long  pfsyncs_opackets;       /* total output packets, IPv4 */
-       u_long  pfsyncs_opackets6;      /* total output packets, IPv6 */
-       u_long  pfsyncs_onomem;         /* no memory for an mbuf for a send */
-       u_long  pfsyncs_oerrors;        /* ip output error */
+       u_int64_t       pfsyncs_ipackets;       /* total input packets, IPv4 */
+       u_int64_t       pfsyncs_ipackets6;      /* total input packets, IPv6 */
+       u_int64_t       pfsyncs_badif;          /* not the right interface */
+       u_int64_t       pfsyncs_badttl;         /* TTL is not PFSYNC_DFLTTL */
+       u_int64_t       pfsyncs_hdrops;         /* packets shorter than hdr */
+       u_int64_t       pfsyncs_badver;         /* bad (incl unsupp) version */
+       u_int64_t       pfsyncs_badact;         /* bad action */
+       u_int64_t       pfsyncs_badlen;         /* data length does not match */
+       u_int64_t       pfsyncs_badauth;        /* bad authentication */
+       u_int64_t       pfsyncs_stale;          /* stale state */
+       u_int64_t       pfsyncs_badval;         /* bad values */
+       u_int64_t       pfsyncs_badstate;       /* insert/lookup failed */
+
+       u_int64_t       pfsyncs_opackets;       /* total output packets, IPv4 */
+       u_int64_t       pfsyncs_opackets6;      /* total output packets, IPv6 */
+       u_int64_t       pfsyncs_onomem;         /* no memory for an mbuf */
+       u_int64_t       pfsyncs_oerrors;        /* ip output error */
 };
 
 /*
  * Configuration structure for SIOCSETPFSYNC SIOCGETPFSYNC
  */
 struct pfsyncreq {
-       char    pfsyncr_syncif[IFNAMSIZ];
-       int     pfsyncr_maxupdates;
-       int     pfsyncr_authlevel;
+       char             pfsyncr_syncdev[IFNAMSIZ];
+       struct in_addr   pfsyncr_syncpeer;
+       int              pfsyncr_maxupdates;
+       int              pfsyncr_authlevel;
 };
-#define SIOCSETPFSYNC  _IOW('i', 247, struct ifreq)
-#define SIOCGETPFSYNC  _IOWR('i', 248, struct ifreq)
 
 
 #define pf_state_peer_hton(s,d) do {           \
@@ -247,6 +259,13 @@ struct pfsyncreq {
        (d)->mss = htons((s)->mss);             \
        (d)->state = (s)->state;                \
        (d)->wscale = (s)->wscale;              \
+       if ((s)->scrub) {                                               \
+               (d)->scrub.pfss_flags =                                 \
+                   htons((s)->scrub->pfss_flags & PFSS_TIMESTAMP);     \
+               (d)->scrub.pfss_ttl = (s)->scrub->pfss_ttl;             \
+               (d)->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);\
+               (d)->scrub.scrub_flag = PFSYNC_SCRUB_FLAG_VALID;        \
+       }                                                               \
 } while (0)
 
 #define pf_state_peer_ntoh(s,d) do {           \
@@ -257,6 +276,13 @@ struct pfsyncreq {
        (d)->mss = ntohs((s)->mss);             \
        (d)->state = (s)->state;                \
        (d)->wscale = (s)->wscale;              \
+       if ((s)->scrub.scrub_flag == PFSYNC_SCRUB_FLAG_VALID &&         \
+           (d)->scrub != NULL) {                                       \
+               (d)->scrub->pfss_flags =                                \
+                   ntohs((s)->scrub.pfss_flags) & PFSS_TIMESTAMP;      \
+               (d)->scrub->pfss_ttl = (s)->scrub.pfss_ttl;             \
+               (d)->scrub->pfss_ts_mod = ntohl((s)->scrub.pfss_ts_mod);\
+       }                                                               \
 } while (0)
 
 #define pf_state_host_hton(s,d) do {                           \
@@ -269,6 +295,17 @@ struct pfsyncreq {
        (d)->port = (s)->port;                                  \
 } while (0)
 
+#define pf_state_counter_hton(s,d) do {                                \
+       (d)[0] = htonl(((s)>>32)&0xffffffff);                   \
+       (d)[1] = htonl((s)&0xffffffff);                         \
+} while (0)
+/* hton: split s into two 32-bit network-order words; ntoh: rejoin them. */
+#define pf_state_counter_ntoh(s,d) do {                                \
+       (d) = ntohl((s)[0]);                                    \
+       (d) = (d)<<32;                                          \
+       (d) += ntohl((s)[1]);                                   \
+} while (0)
+
 #ifdef _KERNEL
 void pfsync_input(struct mbuf *, ...);
 int pfsync_clear_states(u_int32_t, char *);
@@ -278,19 +315,22 @@ int pfsync_pack_state(u_int8_t, struct pf_state *, int);
            (st->proto == IPPROTO_PFSYNC))                      \
                st->sync_flags |= PFSTATE_NOSYNC;               \
        else if (!st->sync_flags)                               \
-               pfsync_pack_state(PFSYNC_ACT_INS, (st), 1);     \
+               pfsync_pack_state(PFSYNC_ACT_INS, (st),         \
+                   PFSYNC_FLAG_COMPRESS);                      \
        st->sync_flags &= ~PFSTATE_FROMSYNC;                    \
 } while (0)
 #define pfsync_update_state(st) do {                           \
        if (!st->sync_flags)                                    \
-               pfsync_pack_state(PFSYNC_ACT_UPD, (st), 1);     \
+               pfsync_pack_state(PFSYNC_ACT_UPD, (st),         \
+                   PFSYNC_FLAG_COMPRESS);                      \
        st->sync_flags &= ~PFSTATE_FROMSYNC;                    \
 } while (0)
 #define pfsync_delete_state(st) do {                           \
        if (!st->sync_flags)                                    \
-               pfsync_pack_state(PFSYNC_ACT_DEL, (st), 1);     \
-       st->sync_flags &= ~PFSTATE_FROMSYNC;                    \
+               pfsync_pack_state(PFSYNC_ACT_DEL, (st),         \
+                   PFSYNC_FLAG_COMPRESS);                      \
 } while (0)
+int pfsync_update_tdb(struct tdb *, int);
 #endif
 
 #endif /* _NET_IF_PFSYNC_H_ */
index 4afc291..d600b47 100644 (file)
@@ -2,6 +2,7 @@
 /*     $OpenBSD: pf.c,v 1.433.2.2 2004/07/17 03:22:34 brad Exp $ */
 /* add $OpenBSD: pf.c,v 1.448 2004/05/11 07:34:11 dhartmei Exp $ */
 /*     $DragonFly: src/sys/net/pf/pf.c,v 1.20 2008/06/05 18:06:32 swildner Exp $ */
+/*     $OpenBSD: pf.c,v 1.527 2007/02/22 15:23:23 pyr Exp $ */
 
 /*
  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
@@ -56,6 +57,8 @@
 #include <sys/sysctl.h>
 #include <sys/endian.h>
 #include <vm/vm_zone.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
 
 #include <machine/inttypes.h>
 
@@ -79,6 +82,7 @@
 #include <netinet/tcp_var.h>
 #include <netinet/udp_var.h>
 #include <netinet/icmp_var.h>
+#include <netinet/if_ether.h>
 
 #include <net/pf/pfvar.h>
 #include <net/pf/if_pflog.h>
 #include <net/netmsg2.h>
 
 extern int ip_optcopy(struct ip *, struct ip *);
+extern int debug_pfugidhack;
 
 #define DPFPRINTF(n, x)        if (pf_status.debug >= (n)) kprintf x
 
@@ -110,8 +115,6 @@ extern int ip_optcopy(struct ip *, struct ip *);
  * Global variables
  */
 
-struct pf_anchorqueue   pf_anchors;
-struct pf_ruleset       pf_main_ruleset;
 struct pf_altqqueue     pf_altqs[2];
 struct pf_palist        pf_pabuf;
 struct pf_altqqueue    *pf_altqs_active;
@@ -123,20 +126,28 @@ u_int32_t          ticket_altqs_inactive;
 int                     altqs_inactive_open;
 u_int32_t               ticket_pabuf;
 
-struct callout          pf_expire_to;                  /* expire timeout */
+struct pf_anchor_stackframe {
+       struct pf_ruleset                       *rs;
+       struct pf_rule                          *r;
+       struct pf_anchor_node                   *parent;
+       struct pf_anchor                        *child;
+} pf_anchor_stack[64];
 
 vm_zone_t               pf_src_tree_pl, pf_rule_pl;
 vm_zone_t               pf_state_pl, pf_altq_pl, pf_pooladdr_pl;
 
 void                    pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
-void                    pf_print_state(struct pf_state *);
-void                    pf_print_flags(u_int8_t);
 
-u_int16_t               pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t,
-                           u_int8_t);
+void                    pf_init_threshold(struct pf_threshold *, u_int32_t,
+                           u_int32_t);
+void                    pf_add_threshold(struct pf_threshold *);
+int                     pf_check_threshold(struct pf_threshold *);
+
 void                    pf_change_ap(struct pf_addr *, u_int16_t *,
                            u_int16_t *, u_int16_t *, struct pf_addr *,
                            u_int16_t, u_int8_t, sa_family_t);
+int                     pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
+                           struct tcphdr *, struct pf_state_peer *);
 #ifdef INET6
 void                    pf_change_a6(struct pf_addr *, u_int16_t *,
                            struct pf_addr *, u_int8_t);
@@ -148,7 +159,8 @@ void                         pf_change_icmp(struct pf_addr *, u_int16_t *,
 void                    pf_send_tcp(const struct pf_rule *, sa_family_t,
                            const struct pf_addr *, const struct pf_addr *,
                            u_int16_t, u_int16_t, u_int32_t, u_int32_t,
-                           u_int8_t, u_int16_t, u_int16_t, u_int8_t);
+                           u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
+                           u_int16_t, struct ether_header *, struct ifnet *);
 void                    pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
                            sa_family_t, struct pf_rule *);
 struct pf_rule         *pf_match_translation(struct pf_pdesc *, struct mbuf *,
@@ -163,19 +175,19 @@ struct pf_rule            *pf_get_translation(struct pf_pdesc *, struct mbuf *,
 int                     pf_test_tcp(struct pf_rule **, struct pf_state **,
                            int, struct pfi_kif *, struct mbuf *, int,
                            void *, struct pf_pdesc *, struct pf_rule **,
-                           struct pf_ruleset **);
+                           struct pf_ruleset **, struct ifqueue *, struct inpcb *);
 int                     pf_test_udp(struct pf_rule **, struct pf_state **,
                            int, struct pfi_kif *, struct mbuf *, int,
                            void *, struct pf_pdesc *, struct pf_rule **,
-                           struct pf_ruleset **);
+                           struct pf_ruleset **, struct ifqueue *, struct inpcb *);
 int                     pf_test_icmp(struct pf_rule **, struct pf_state **,
                            int, struct pfi_kif *, struct mbuf *, int,
                            void *, struct pf_pdesc *, struct pf_rule **,
-                           struct pf_ruleset **);
+                           struct pf_ruleset **, struct ifqueue *);
 int                     pf_test_other(struct pf_rule **, struct pf_state **,
                            int, struct pfi_kif *, struct mbuf *, int, void *,
                            struct pf_pdesc *, struct pf_rule **,
-                           struct pf_ruleset **);
+                           struct pf_ruleset **, struct ifqueue *);
 int                     pf_test_fragment(struct pf_rule **, int,
                            struct pfi_kif *, struct mbuf *, void *,
                            struct pf_pdesc *, struct pf_rule **,
@@ -188,11 +200,14 @@ int                        pf_test_state_udp(struct pf_state **, int,
                            void *, struct pf_pdesc *);
 int                     pf_test_state_icmp(struct pf_state **, int,
                            struct pfi_kif *, struct mbuf *, int,
-                           void *, struct pf_pdesc *);
+                           void *, struct pf_pdesc *, u_short *);
 int                     pf_test_state_other(struct pf_state **, int,
                            struct pfi_kif *, struct pf_pdesc *);
-static int              pf_match_tag(struct mbuf *, struct pf_rule *,
-                                     struct pf_rule *, int *);
+int                     pf_match_tag(struct mbuf *, struct pf_rule *,
+                            struct pf_mtag *, int *);
+int                     pf_step_out_of_anchor(int *, struct pf_ruleset **,
+                            int, struct pf_rule **, struct pf_rule **,
+                            int *);
 void                    pf_hash(struct pf_addr *, struct pf_addr *,
                            struct pf_poolhashkey *, sa_family_t);
 int                     pf_map_addr(u_int8_t, struct pf_rule *,
@@ -203,11 +218,11 @@ int                        pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *,
                            struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t,
                            struct pf_src_node **);
 void                    pf_route(struct mbuf **, struct pf_rule *, int,
-                           struct ifnet *, struct pf_state *);
+                           struct ifnet *, struct pf_state *,
+                           struct pf_pdesc *);
 void                    pf_route6(struct mbuf **, struct pf_rule *, int,
-                           struct ifnet *, struct pf_state *);
-int                     pf_socket_lookup(uid_t *, gid_t *,
-                           int, struct pf_pdesc *);
+                           struct ifnet *, struct pf_state *,
+                           struct pf_pdesc *);
 u_int8_t                pf_get_wscale(struct mbuf *, int, u_int16_t,
                            sa_family_t);
 u_int16_t               pf_get_mss(struct mbuf *, int, u_int16_t,
@@ -221,19 +236,29 @@ int                        pf_check_proto_cksum(struct mbuf *, int, int,
 int                     pf_addr_wrap_neq(struct pf_addr_wrap *,
                            struct pf_addr_wrap *);
 struct pf_state                *pf_find_state_recurse(struct pfi_kif *,
-                           struct pf_state *, u_int8_t);
-
-struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX];
+                           struct pf_state_cmp *, u_int8_t);
+int                     pf_src_connlimit(struct pf_state **);
+int                     pf_check_congestion(struct ifqueue *);
+
+extern int pf_end_threads;
+
+struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
+       { &pf_state_pl, PFSTATE_HIWAT },
+       { &pf_src_tree_pl, PFSNODE_HIWAT },
+       { &pf_frent_pl, PFFRAG_FRENT_HIWAT },
+       { &pfr_ktable_pl, PFR_KTABLE_HIWAT },
+       { &pfr_kentry_pl, PFR_KENTRY_HIWAT }
+};
 
 #define STATE_LOOKUP()                                                 \
        do {                                                            \
                if (direction == PF_IN)                                 \
-                       *state = pf_find_state_recurse(         \
+                       *state = pf_find_state_recurse(                 \
                            kif, &key, PF_EXT_GWY);                     \
                else                                                    \
                        *state = pf_find_state_recurse(         \
                            kif, &key, PF_LAN_EXT);                     \
-               if (*state == NULL)                                     \
+               if (*state == NULL || (*state)->timeout == PFTM_PURGE)  \
                        return (PF_DROP);                               \
                if (direction == PF_OUT &&                              \
                    (((*state)->rule.ptr->rt == PF_ROUTETO &&           \
@@ -253,22 +278,39 @@ struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX];
        (s)->lan.addr.addr32[3] != (s)->gwy.addr.addr32[3])) || \
        (s)->lan.port != (s)->gwy.port
 
-#define BOUND_IFACE(r, k) (((r)->rule_flag & PFRULE_IFBOUND) ? (k) :   \
-       ((r)->rule_flag & PFRULE_GRBOUND) ? (k)->pfik_parent :         \
-       (k)->pfik_parent->pfik_parent)
+/* Yields (k) when rule (r) has PFRULE_IFBOUND set, otherwise pfi_all. */
+#define BOUND_IFACE(r, k) (((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all)
+
+/* Increment ->states on the state's rule and, if set, its anchor/NAT rule. */
+#define STATE_INC_COUNTERS(s) do {                     \
+               (s)->rule.ptr->states++;                \
+               if ((s)->anchor.ptr != NULL)            \
+                       (s)->anchor.ptr->states++;      \
+               if ((s)->nat_rule.ptr != NULL)          \
+                       (s)->nat_rule.ptr->states++;    \
+       } while (0)
+
+/* Decrement ->states on the state's NAT rule/anchor (if set) and its rule. */
+#define STATE_DEC_COUNTERS(s) do {                     \
+               if ((s)->nat_rule.ptr != NULL)          \
+                       (s)->nat_rule.ptr->states--;    \
+               if ((s)->anchor.ptr != NULL)            \
+                       (s)->anchor.ptr->states--;      \
+               (s)->rule.ptr->states--;                \
+       } while (0)
 
-static int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
-static int pf_state_compare_lan_ext(struct pf_state *,
+static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
+static __inline int pf_state_compare_lan_ext(struct pf_state *,
        struct pf_state *);
-static int pf_state_compare_ext_gwy(struct pf_state *,
+static __inline int pf_state_compare_ext_gwy(struct pf_state *,
        struct pf_state *);
-static int pf_state_compare_id(struct pf_state *,
+static __inline int pf_state_compare_id(struct pf_state *,
        struct pf_state *);
 
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_updates;
+struct pf_state_queue state_list;
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree_lan_ext, pf_state,
@@ -278,7 +320,7 @@ RB_GENERATE(pf_state_tree_ext_gwy, pf_state,
 RB_GENERATE(pf_state_tree_id, pf_state,
     u.s.entry_id, pf_state_compare_id);
 
-static int
+static __inline int
 pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
 {
        int     diff;
@@ -335,7 +377,7 @@ pf_state_hash(struct pf_state *s)
        return(hv);
 }
 
-static int
+static __inline int
 pf_state_compare_lan_ext(struct pf_state *a, struct pf_state *b)
 {
        int     diff;
@@ -403,7 +445,7 @@ pf_state_compare_lan_ext(struct pf_state *a, struct pf_state *b)
        return (0);
 }
 
-static int
+static __inline int
 pf_state_compare_ext_gwy(struct pf_state *a, struct pf_state *b)
 {
        int     diff;
@@ -471,7 +513,7 @@ pf_state_compare_ext_gwy(struct pf_state *a, struct pf_state *b)
        return (0);
 }
 
-static int
+static __inline int
 pf_state_compare_id(struct pf_state *a, struct pf_state *b)
 {
        if (a->id > b->id)
@@ -504,17 +546,17 @@ pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
                break;
        }
 }
-#endif
+#endif /* INET6 */
 
 struct pf_state *
-pf_find_state_byid(struct pf_state *key)
+pf_find_state_byid(struct pf_state_cmp *key)
 {
        pf_status.fcounters[FCNT_STATE_SEARCH]++;
-       return (RB_FIND(pf_state_tree_id, &tree_id, key));
+       return (RB_FIND(pf_state_tree_id, &tree_id, (struct pf_state *)key));
 }
 
 struct pf_state *
-pf_find_state_recurse(struct pfi_kif *kif, struct pf_state *key, u_int8_t tree)
+pf_find_state_recurse(struct pfi_kif *kif, struct pf_state_cmp *key, u_int8_t tree)
 {
        struct pf_state *s;
 
@@ -522,20 +564,20 @@ pf_find_state_recurse(struct pfi_kif *kif, struct pf_state *key, u_int8_t tree)
 
        switch (tree) {
        case PF_LAN_EXT:
-               for (; kif != NULL; kif = kif->pfik_parent) {
-                       s = RB_FIND(pf_state_tree_lan_ext,
-                           &kif->pfik_lan_ext, key);
-                       if (s != NULL)
-                               return (s);
-               }
+               if ((s = RB_FIND(pf_state_tree_lan_ext, &kif->pfik_lan_ext,
+                   (struct pf_state *)key)) != NULL)
+                       return (s);
+               if ((s = RB_FIND(pf_state_tree_lan_ext, &pfi_all->pfik_lan_ext,
+                   (struct pf_state *)key)) != NULL)
+                       return (s);
                return (NULL);
        case PF_EXT_GWY:
-               for (; kif != NULL; kif = kif->pfik_parent) {
-                       s = RB_FIND(pf_state_tree_ext_gwy,
-                           &kif->pfik_ext_gwy, key);
-                       if (s != NULL)
-                               return (s);
-               }
+               if ((s = RB_FIND(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy,
+                   (struct pf_state *)key)) != NULL)
+                       return (s);
+               if ((s = RB_FIND(pf_state_tree_ext_gwy, &pfi_all->pfik_ext_gwy,
+                   (struct pf_state *)key)) != NULL)
+                       return (s);
                return (NULL);
        default:
                panic("pf_find_state_recurse");
@@ -543,7 +585,7 @@ pf_find_state_recurse(struct pfi_kif *kif, struct pf_state *key, u_int8_t tree)
 }
 
 struct pf_state *
-pf_find_state_all(struct pf_state *key, u_int8_t tree, int *more)
+pf_find_state_all(struct pf_state_cmp *key, u_int8_t tree, int *more)
 {
        struct pf_state *s, *ss = NULL;
        struct pfi_kif  *kif;
@@ -554,7 +596,7 @@ pf_find_state_all(struct pf_state *key, u_int8_t tree, int *more)
        case PF_LAN_EXT:
                TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) {
                        s = RB_FIND(pf_state_tree_lan_ext,
-                           &kif->pfik_lan_ext, key);
+                           &kif->pfik_lan_ext, (struct pf_state *)key);
                        if (s == NULL)
                                continue;
                        if (more == NULL)
@@ -566,7 +608,7 @@ pf_find_state_all(struct pf_state *key, u_int8_t tree, int *more)
        case PF_EXT_GWY:
                TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) {
                        s = RB_FIND(pf_state_tree_ext_gwy,
-                           &kif->pfik_ext_gwy, key);
+                           &kif->pfik_ext_gwy, (struct pf_state *)key);
                        if (s == NULL)
                                continue;
                        if (more == NULL)
@@ -580,6 +622,132 @@ pf_find_state_all(struct pf_state *key, u_int8_t tree, int *more)
        }
 }
 
+void
+pf_init_threshold(struct pf_threshold *threshold,
+    u_int32_t limit, u_int32_t seconds)
+{
+       threshold->limit = limit * PF_THRESHOLD_MULT;
+       threshold->seconds = seconds;
+       threshold->count = 0;
+       threshold->last = time_second;
+}
+
+void
+pf_add_threshold(struct pf_threshold *threshold)
+{
+       u_int32_t t = time_second, diff = t - threshold->last;
+
+       if (diff >= threshold->seconds)
+               threshold->count = 0;
+       else
+               threshold->count -= threshold->count * diff /
+                   threshold->seconds;
+       threshold->count += PF_THRESHOLD_MULT;
+       threshold->last = t;
+}
+
+int
+pf_check_threshold(struct pf_threshold *threshold)
+{
+       return (threshold->count > threshold->limit);
+}
+
+int
+pf_src_connlimit(struct pf_state **state)
+{
+       struct pf_state *s;
+       int bad = 0;
+
+       (*state)->src_node->conn++;
+       (*state)->src.tcp_est = 1;
+       pf_add_threshold(&(*state)->src_node->conn_rate);
+
+       if ((*state)->rule.ptr->max_src_conn &&
+           (*state)->rule.ptr->max_src_conn <
+           (*state)->src_node->conn) {
+               pf_status.lcounters[LCNT_SRCCONN]++;
+               bad++;
+       }
+
+       if ((*state)->rule.ptr->max_src_conn_rate.limit &&
+           pf_check_threshold(&(*state)->src_node->conn_rate)) {
+               pf_status.lcounters[LCNT_SRCCONNRATE]++;
+               bad++;
+       }
+
+       if (!bad)
+               return (0);
+
+       if ((*state)->rule.ptr->overload_tbl) {
+               struct pfr_addr p;
+               u_int32_t       killed = 0;
+
+               pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
+               if (pf_status.debug >= PF_DEBUG_MISC) {
+                       kprintf("pf_src_connlimit: blocking address ");
+                       pf_print_host(&(*state)->src_node->addr, 0,
+                           (*state)->af);
+               }
+
+               bzero(&p, sizeof(p));
+               p.pfra_af = (*state)->af;
+               switch ((*state)->af) {
+#ifdef INET
+               case AF_INET:
+                       p.pfra_net = 32;
+                       p.pfra_ip4addr = (*state)->src_node->addr.v4;
+                       break;
+#endif /* INET */
+#ifdef INET6
+               case AF_INET6:
+                       p.pfra_net = 128;
+                       p.pfra_ip6addr = (*state)->src_node->addr.v6;
+                       break;
+#endif /* INET6 */
+               }
+
+               pfr_insert_kentry((*state)->rule.ptr->overload_tbl,
+                   &p, time_second);
+
+               /* kill existing states if that's required. */
+               if ((*state)->rule.ptr->flush) {
+                       pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
+
+                       RB_FOREACH(s, pf_state_tree_id, &tree_id) {
+                               /*
+                                * Kill states from this source.  (Only those
+                                * from the same rule if PF_FLUSH_GLOBAL is not
+                                * set)
+                                */
+                               if (s->af == (*state)->af &&
+                                   (((*state)->direction == PF_OUT &&
+                                   PF_AEQ(&(*state)->src_node->addr,
+                                   &s->lan.addr, s->af)) ||
+                                   ((*state)->direction == PF_IN &&
+                                   PF_AEQ(&(*state)->src_node->addr,
+                                   &s->ext.addr, s->af))) &&
+                                   ((*state)->rule.ptr->flush &
+                                   PF_FLUSH_GLOBAL ||
+                                   (*state)->rule.ptr == s->rule.ptr)) {
+                                       s->timeout = PFTM_PURGE;
+                                       s->src.state = s->dst.state =
+                                           TCPS_CLOSED;
+                                       killed++;
+                               }
+                       }
+                       if (pf_status.debug >= PF_DEBUG_MISC)
+                               kprintf(", %u states killed", killed);
+               }
+               if (pf_status.debug >= PF_DEBUG_MISC)
+                       kprintf("\n");
+       }
+
+       /* kill this state */
+       (*state)->timeout = PFTM_PURGE;
+       (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+       return (1);
+}
+
 int
 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
     struct pf_addr *src, sa_family_t af)
@@ -601,9 +769,16 @@ pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
                if (!rule->max_src_nodes ||
                    rule->src_nodes < rule->max_src_nodes)
                        (*sn) = pool_get(&pf_src_tree_pl, PR_NOWAIT);
+               else
+                       pf_status.lcounters[LCNT_SRCNODES]++;
                if ((*sn) == NULL)
                        return (-1);
                bzero(*sn, sizeof(struct pf_src_node));
+
+               pf_init_threshold(&(*sn)->conn_rate,
+                   rule->max_src_conn_rate.limit,
+                   rule->max_src_conn_rate.seconds);
+
                (*sn)->af = af;
                if (rule->rule_flag & PFRULE_RULESRCTRACK ||
                    rule->rpool.opts & PF_POOL_STICKYADDR)
@@ -629,8 +804,10 @@ pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
                pf_status.src_nodes++;
        } else {
                if (rule->max_src_states &&
-                   (*sn)->states >= rule->max_src_states)
+                   (*sn)->states >= rule->max_src_states) {
+                       pf_status.lcounters[LCNT_SRCSTATES]++;
                        return (-1);
+               }
        }
        return (0);
 }
@@ -696,11 +873,10 @@ pf_insert_state(struct pfi_kif *kif, struct pf_state *state)
                RB_REMOVE(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state);
                return (-1);
        }
-       TAILQ_INSERT_HEAD(&state_updates, state, u.s.entry_updates);
-
+       TAILQ_INSERT_TAIL(&state_list, state, u.s.entry_list);
        pf_status.fcounters[FCNT_STATE_INSERT]++;
        pf_status.states++;
-       pfi_attach_state(kif);
+       pfi_kif_ref(kif, PFI_KIF_REF_STATE);
 #if NPFSYNC
        pfsync_insert_state(state);
 #endif
@@ -708,18 +884,47 @@ pf_insert_state(struct pfi_kif *kif, struct pf_state *state)
 }
 
 void
-pf_purge_timeout(void *arg)
+pf_purge_thread(void *v)
 {
-       struct callout  *to = arg;
+       int nloops = 0;
+       int locked = 0;
+
+       for (;;) {
+               tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
+
+               lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
+
+               if (pf_end_threads) {
+                       pf_purge_expired_states(pf_status.states, 1);
+                       pf_purge_expired_fragments();
+                       pf_purge_expired_src_nodes(1);
+                       pf_end_threads++;
+
+                       lockmgr(&pf_consistency_lock, LK_RELEASE);
+                       wakeup(pf_purge_thread);
+                       kthread_exit();
+               }
+               crit_enter();
+
+               /* process a fraction of the state table every second */
+               if(!pf_purge_expired_states(1 + (pf_status.states
+                   / pf_default_rule.timeout[PFTM_INTERVAL]), 0)) {
 
-       crit_enter();
-       pf_purge_expired_states();
-       pf_purge_expired_fragments();
-       pf_purge_expired_src_nodes();
-       crit_exit();
+                       pf_purge_expired_states(1 + (pf_status.states
+                           / pf_default_rule.timeout[PFTM_INTERVAL]), 1);
+               }
 
-       callout_reset(to, pf_default_rule.timeout[PFTM_INTERVAL] * hz,
-           pf_purge_timeout, to);
+               /* purge other expired types every PFTM_INTERVAL seconds */
+               if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
+                       pf_purge_expired_fragments();
+                       if (!pf_purge_expired_src_nodes(locked)) {
+                               pf_purge_expired_src_nodes(1);
+                       }
+                       nloops = 0;
+               }
+               crit_exit();
+               lockmgr(&pf_consistency_lock, LK_RELEASE);
+       }
 }
 
 u_int32_t
@@ -735,8 +940,9 @@ pf_state_expires(const struct pf_state *state)
                return (time_second);
        if (state->timeout == PFTM_UNTIL_PACKET)
                return (0);
-       KASSERT((state->timeout < PFTM_MAX), 
-           ("pf_state_expires: timeout > PFTM_MAX"));
+       KKASSERT(state->timeout != PFTM_UNLINKED);
+       KASSERT((state->timeout < PFTM_MAX),
+               ("pf_state_expires: timeout > PFTM_MAX"));
        timeout = state->rule.ptr->timeout[state->timeout];
        if (!timeout)
                timeout = pf_default_rule.timeout[state->timeout];
@@ -759,15 +965,22 @@ pf_state_expires(const struct pf_state *state)
        return (state->expire + timeout);
 }
 
-void
-pf_purge_expired_src_nodes(void)
+int
+pf_purge_expired_src_nodes(int waslocked)
 {
         struct pf_src_node             *cur, *next;
+        int                             locked = waslocked;
 
         for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
                 next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
 
                 if (cur->states <= 0 && cur->expire <= time_second) {
+                        if (! locked) {
+                                lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
+                                next = RB_NEXT(pf_src_tree,
+                                    &tree_src_tracking, cur);
+                                locked = 1;
+                        }
                         if (cur->rule.ptr != NULL) {
                                 cur->rule.ptr->src_nodes--;
                                 if (cur->rule.ptr->states <= 0 &&
@@ -780,6 +993,10 @@ pf_purge_expired_src_nodes(void)
                         pool_put(&pf_src_tree_pl, cur);
                 }
         }
+
+        if (locked && !waslocked)
+               lockmgr(&pf_consistency_lock, LK_RELEASE);
+       return(1);
 }
 
 void
@@ -788,6 +1005,10 @@ pf_src_tree_remove_state(struct pf_state *s)
        u_int32_t timeout;
 
        if (s->src_node != NULL) {
+               if (s->proto == IPPROTO_TCP) {
+                       if (s->src.tcp_est)
+                               --s->src_node->conn;
+               }
                if (--s->src_node->states <= 0) {
                        timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
                        if (!timeout)
@@ -808,53 +1029,104 @@ pf_src_tree_remove_state(struct pf_state *s)
        s->src_node = s->nat_src_node = NULL;
 }
 
-static int
-pf_purge_expired_states_callback(struct pf_state *cur, void *data __unused)
+/* callers should be at crit_enter() */
+void
+pf_unlink_state(struct pf_state *cur)
 {
-       if (pf_state_expires(cur) <= time_second) {
-               RB_REMOVE(pf_state_tree_ext_gwy,
-                   &cur->u.s.kif->pfik_ext_gwy, cur);
-               RB_REMOVE(pf_state_tree_lan_ext,
-                   &cur->u.s.kif->pfik_lan_ext, cur);
-               RB_REMOVE(pf_state_tree_id, &tree_id, cur);
-               if (cur->src.state == PF_TCPS_PROXY_DST) {
-                       pf_send_tcp(cur->rule.ptr, cur->af,
-                           &cur->ext.addr, &cur->lan.addr,
-                           cur->ext.port, cur->lan.port,
-                           cur->src.seqhi, cur->src.seqlo + 1, 0,
-                           TH_RST|TH_ACK, 0, 0);
-               }
+       if (cur->src.state == PF_TCPS_PROXY_DST) {
+               pf_send_tcp(cur->rule.ptr, cur->af,
+                   &cur->ext.addr, &cur->lan.addr,
+                   cur->ext.port, cur->lan.port,
+                   cur->src.seqhi, cur->src.seqlo + 1,
+                   TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
+       }
+       RB_REMOVE(pf_state_tree_ext_gwy,
+           &cur->u.s.kif->pfik_ext_gwy, cur);
+       RB_REMOVE(pf_state_tree_lan_ext,
+           &cur->u.s.kif->pfik_lan_ext, cur);
+       RB_REMOVE(pf_state_tree_id, &tree_id, cur);
 #if NPFSYNC
+       if (cur->creatorid == pf_status.hostid)
                pfsync_delete_state(cur);
 #endif
-               pf_src_tree_remove_state(cur);
-               if (--cur->rule.ptr->states <= 0 &&
-                   cur->rule.ptr->src_nodes <= 0)
-                       pf_rm_rule(NULL, cur->rule.ptr);
-               if (cur->nat_rule.ptr != NULL)
-                       if (--cur->nat_rule.ptr->states <= 0 &&
-                               cur->nat_rule.ptr->src_nodes <= 0)
-                               pf_rm_rule(NULL, cur->nat_rule.ptr);
-               if (cur->anchor.ptr != NULL)
-                       if (--cur->anchor.ptr->states <= 0)
-                               pf_rm_rule(NULL, cur->anchor.ptr);
-               pf_normalize_tcp_cleanup(cur);
-               pfi_detach_state(cur->u.s.kif);
-               TAILQ_REMOVE(&state_updates, cur, u.s.entry_updates);
-               pool_put(&pf_state_pl, cur);
-               pf_status.fcounters[FCNT_STATE_REMOVALS]++;
-               pf_status.states--;
-       }
-       return(0);
+       cur->timeout = PFTM_UNLINKED;
+       pf_src_tree_remove_state(cur);
 }
 
+/* callers should be at crit_enter() and hold the
+ * write_lock on pf_consistency_lock */
 void
-pf_purge_expired_states(void)
+pf_free_state(struct pf_state *cur)
 {
-       RB_SCAN(pf_state_tree_id, &tree_id, NULL,
-               pf_purge_expired_states_callback, NULL);
+#if NPFSYNC
+       if (pfsyncif != NULL &&
+           (pfsyncif->sc_bulk_send_next == cur ||
+           pfsyncif->sc_bulk_terminator == cur))
+               return;
+#endif
+       KKASSERT(cur->timeout == PFTM_UNLINKED);
+       if (--cur->rule.ptr->states <= 0 &&
+           cur->rule.ptr->src_nodes <= 0)
+               pf_rm_rule(NULL, cur->rule.ptr);
+       if (cur->nat_rule.ptr != NULL)
+               if (--cur->nat_rule.ptr->states <= 0 &&
+                       cur->nat_rule.ptr->src_nodes <= 0)
+                       pf_rm_rule(NULL, cur->nat_rule.ptr);
+       if (cur->anchor.ptr != NULL)
+               if (--cur->anchor.ptr->states <= 0)
+                       pf_rm_rule(NULL, cur->anchor.ptr);
+       pf_normalize_tcp_cleanup(cur);
+       pfi_kif_unref(cur->u.s.kif, PFI_KIF_REF_STATE);
+       TAILQ_REMOVE(&state_list, cur, u.s.entry_list);
+       if (cur->tag)
+               pf_tag_unref(cur->tag);
+       pool_put(&pf_state_pl, cur);
+       pf_status.fcounters[FCNT_STATE_REMOVALS]++;
+       pf_status.states--;
 }
 
+/*
+ * Walk up to maxcheck states, resuming where the last call stopped: unlink
+ * expired states and free unlinked ones.  pf_consistency_lock is taken when
+ * needed and dropped again only if we took it (!waslocked).  Returns 1.
+ */
+int
+pf_purge_expired_states(u_int32_t maxcheck, int waslocked)
+{
+       static struct pf_state  *cur = NULL;
+       struct pf_state         *next;
+       int                      locked = waslocked;
+
+       while (maxcheck--) {
+               /* wrap to start of list when we hit the end */
+               if (cur == NULL) {
+                       cur = TAILQ_FIRST(&state_list);
+                       if (cur == NULL)
+                               break;  /* list empty */
+               }
+
+               /* get next state, as cur may get deleted */
+               next = TAILQ_NEXT(cur, u.s.entry_list);
+
+               /* unlink expired states; unlinked states are freed below */
+               if (cur->timeout != PFTM_UNLINKED &&
+                   pf_state_expires(cur) <= time_second)
+                       pf_unlink_state(cur);
+
+               if (cur->timeout == PFTM_UNLINKED) {
+                       if (! locked) {
+                               lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
+                               locked = 1;
+                       }
+                       pf_free_state(cur);
+               }
+               cur = next;
+       }
+
+       if (locked && !waslocked)
+               lockmgr(&pf_consistency_lock, LK_RELEASE);
+       return (1);
+}
 
 int
 pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw)
@@ -1042,14 +1314,14 @@ pf_calc_skip_steps(struct pf_rulequeue *rules)
                        PF_SET_SKIP_STEPS(PF_SKIP_AF);
                if (cur->proto != prev->proto)
                        PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
-               if (cur->src.not != prev->src.not ||
+               if (cur->src.neg != prev->src.neg ||
                    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
                        PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
                if (cur->src.port[0] != prev->src.port[0] ||
                    cur->src.port[1] != prev->src.port[1] ||
                    cur->src.port_op != prev->src.port_op)
                        PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
-               if (cur->dst.not != prev->dst.not ||
+               if (cur->dst.neg != prev->dst.neg ||
                    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
                        PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
                if (cur->dst.port[0] != prev->dst.port[0] ||
@@ -1079,30 +1351,18 @@ pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
        case PF_ADDR_DYNIFTL:
                return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
        case PF_ADDR_NOROUTE:
+       case PF_ADDR_URPFFAILED:
                return (0);
        case PF_ADDR_TABLE:
                return (aw1->p.tbl != aw2->p.tbl);
+       case PF_ADDR_RTLABEL:
+               return (aw1->v.rtlabel != aw2->v.rtlabel);
        default:
                kprintf("invalid address type: %d\n", aw1->type);
                return (1);
        }
 }
 
-void
-pf_update_anchor_rules(void)
-{
-       struct pf_rule  *rule;
-       int              i;
-
-       for (i = 0; i < PF_RULESET_MAX; ++i)
-               TAILQ_FOREACH(rule, pf_main_ruleset.rules[i].active.ptr,
-                   entries)
-                       if (rule->anchorname[0])
-                               rule->anchor = pf_find_anchor(rule->anchorname);
-                       else
-                               rule->anchor = NULL;
-}
-
 u_int16_t
 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
 {
@@ -1283,11 +1543,70 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
        }
 }
 
+
+/*
+ * Need to modulate the sequence numbers in the TCP SACK option
+ * (credits to Krzysztof Pfaff for report and patch)
+ *
+ * Copies the TCP option bytes that follow the fixed header (located at
+ * off + sizeof(*th) in mbuf m) into a local buffer, walks them, and for
+ * every SACK option subtracts dst->seqdiff from the start and end of
+ * each block.  th->th_sum is adjusted through pf_change_a() for every
+ * rewritten value.  If any SACK option was rewritten, the whole option
+ * area is written back into the mbuf with m_copyback().
+ *
+ * Returns 1 when the options were modified and copied back, 0 otherwise
+ * (including when there are fewer than TCPOLEN_SACKLEN option bytes or
+ * pf_pull_hdr() fails).
+ */
+int
+pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
+    struct tcphdr *th, struct pf_state_peer *dst)
+{
+       /* hlen: option bytes left to parse; thoptlen keeps the full length */
+       int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
+       u_int8_t opts[TCP_MAXOLEN], *opt = opts;
+       int copyback = 0, i, olen;
+       struct raw_sackblock sack;
+
+#define TCPOLEN_SACKLEN        (TCPOLEN_SACK + 2)
+       if (hlen < TCPOLEN_SACKLEN ||
+           !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
+               return 0;
+
+       /* stop once the remainder cannot hold a one-block SACK option */
+       while (hlen >= TCPOLEN_SACKLEN) {
+               /* read up front; ignored for the one-byte EOL/NOP options */
+               olen = opt[1];
+               switch (*opt) {
+               case TCPOPT_EOL:        /* FALLTHROUGH */
+               case TCPOPT_NOP:
+                       /* EOL is stepped over one byte at a time, like NOP */
+                       opt++;
+                       hlen--;
+                       break;
+               case TCPOPT_SACK:
+                       /* never trust a length that runs past the buffer */
+                       if (olen > hlen)
+                               olen = hlen;
+                       if (olen >= TCPOLEN_SACKLEN) {
+                               /* blocks start after the kind/length bytes */
+                               for (i = 2; i + TCPOLEN_SACK <= olen;
+                                   i += TCPOLEN_SACK) {
+                                       /* go through a local copy of the block */
+                                       memcpy(&sack, &opt[i], sizeof(sack));
+                                       pf_change_a(&sack.rblk_start, &th->th_sum,
+                                           htonl(ntohl(sack.rblk_start) -
+                                           dst->seqdiff), 0);
+                                       pf_change_a(&sack.rblk_end, &th->th_sum,
+                                           htonl(ntohl(sack.rblk_end) -
+                                           dst->seqdiff), 0);
+                                       memcpy(&opt[i], &sack, sizeof(sack));
+                               }
+                               copyback = 1;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       /* clamp so the cursor always advances */
+                       if (olen < 2)
+                               olen = 2;
+                       hlen -= olen;
+                       opt += olen;
+               }
+       }
+
+       if (copyback)
+               m_copyback(m, off + sizeof(*th), thoptlen, opts);
+       return (copyback);
+}
+
 void
 pf_send_tcp(const struct pf_rule *r, sa_family_t af,
     const struct pf_addr *saddr, const struct pf_addr *daddr,
     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
-    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl)
+    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
+    u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp)
 {
        struct mbuf     *m;
        int              len = 0, tlen;
@@ -1298,7 +1617,8 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af,
        struct ip6_hdr  *h6 = NULL;
 #endif /* INET6 */
        struct tcphdr   *th = NULL;
-       char *opt;
+       char            *opt;
+       struct pf_mtag  *pf_mtag;
 
        /* maximum segment size tcp option */
        tlen = sizeof(struct tcphdr);
@@ -1322,15 +1642,26 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af,
        m = m_gethdr(MB_DONTWAIT, MT_HEADER);
        if (m == NULL)
                return;
-       m->m_pkthdr.fw_flags = PF_MBUF_GENERATED;
+       if ((pf_mtag = pf_get_mtag(m)) == NULL) {
+               m_freem(m);
+               return;
+       }
+       if (tag)
+               pf_mtag->flags |= PF_TAG_GENERATED;
+
+       pf_mtag->tag = rtag;
+
+       if (r != NULL && r->rtableid >= 0)
+               pf_mtag->rtableid = r->rtableid;
+
 #ifdef ALTQ
        if (r != NULL && r->qid) {
-               m->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
-               m->m_pkthdr.altq_qid = r->qid;
-               m->m_pkthdr.ecn_af = af;
-               m->m_pkthdr.header = mtod(m, struct ip *);
+               pf_mtag->qid = r->qid;
+               /* add hints for ecn */
+               pf_mtag->af = af;
+               pf_mtag->hdr = mtod(m, struct ip *);
        }
-#endif
+#endif /* ALTQ */
        m->m_data += max_linkhdr;
        m->m_pkthdr.len = m->m_len = len;
        m->m_pkthdr.rcvif = NULL;
@@ -1395,7 +1726,28 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af,
                h->ip_off = path_mtu_discovery ? IP_DF : 0;
                h->ip_ttl = ttl ? ttl : ip_defttl;
                h->ip_sum = 0;
-               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               if (eh == NULL) {
+                       ip_output(m, NULL, NULL, 0, NULL, NULL);
+               } else {
+                       struct route             ro;
+                       struct rtentry           rt;
+                       struct ether_header     *e = (void *)ro.ro_dst.sa_data;
+
+                       if (ifp == NULL) {
+                               m_freem(m);
+                               return;
+                       }
+                       rt.rt_ifp = ifp;
+                       ro.ro_rt = &rt;
+                       ro.ro_dst.sa_len = sizeof(ro.ro_dst);
+                       ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
+                       bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
+                       bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
+                       e->ether_type = eh->ether_type;
+                       /* XXX_IMPORT: later */
+                       ip_output(m, (void *)NULL, &ro, 0,
+                           (void *)NULL, (void *)NULL);
+               }
                break;
 #endif /* INET */
 #ifdef INET6
@@ -1417,21 +1769,26 @@ void
 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
     struct pf_rule *r)
 {
+       struct pf_mtag  *pf_mtag;
        struct mbuf     *m0;
 
-       m0 = m_copypacket(m, MB_DONTWAIT);
-       if (m0 == NULL)
+       m0 = m_copy(m, 0, M_COPYALL);
+
+       if ((pf_mtag = pf_get_mtag(m0)) == NULL)
                return;
-       m0->m_pkthdr.fw_flags |= PF_MBUF_GENERATED;
+       pf_mtag->flags |= PF_TAG_GENERATED;
+
+       if (r->rtableid >= 0)
+               pf_mtag->rtableid = r->rtableid;
 
 #ifdef ALTQ
        if (r->qid) {
-               m->m_pkthdr.fw_flags |= ALTQ_MBUF_TAGGED;
-               m->m_pkthdr.altq_qid = r->qid;
-               m->m_pkthdr.ecn_af = af;
-               m->m_pkthdr.header = mtod(m0, struct ip *);
+               pf_mtag->qid = r->qid;
+               /* add hints for ecn */
+               pf_mtag->af = af;
+               pf_mtag->hdr = mtod(m0, struct ip *);
        }
-#endif
+#endif /* ALTQ */
 
        switch (af) {
 #ifdef INET
@@ -1544,63 +1901,135 @@ pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
        return (pf_match(op, a1, a2, g));
 }
 
-static int
-pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_rule *nat_rule,
-            int *tag)
+/*
+ * Look up the pf tag already attached to mbuf m.
+ *
+ * Searches m for an m_tag of type PF_MBUF_TAGGED via m_tag_find().
+ * Returns a pointer to the struct pf_mtag stored directly behind the
+ * m_tag header, or NULL when no such tag is attached.  Never allocates.
+ */
+struct pf_mtag *
+pf_find_mtag(struct mbuf *m)
 {
-       if (*tag == -1) {       /* find mbuf tag */
-               if (nat_rule != NULL && nat_rule->tag)
-                       *tag = nat_rule->tag;
-               else if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
-                       *tag = m->m_pkthdr.pf_tag;
-               else
-                       *tag = 0;
+       struct m_tag    *mtag;
+
+       if ((mtag = m_tag_find(m, PF_MBUF_TAGGED, NULL)) == NULL)
+               return (NULL);
+
+       /* the pf_mtag payload follows the m_tag header */
+       return ((struct pf_mtag *)(mtag + 1));
+}
+
+/*
+ * Return the pf tag attached to mbuf m, creating it if necessary.
+ *
+ * If m carries no PF_MBUF_TAGGED m_tag, one is allocated with
+ * m_tag_get(..., M_NOWAIT), its struct pf_mtag payload is zeroed and
+ * the tag is prepended to m.  Returns a pointer to the payload, or
+ * NULL when the allocation fails (m is left without a pf tag then).
+ */
+struct pf_mtag *
+pf_get_mtag(struct mbuf *m)
+{
+       struct m_tag    *mtag;
+
+       if ((mtag = m_tag_find(m, PF_MBUF_TAGGED, NULL)) == NULL) {
+               mtag = m_tag_get(PF_MBUF_TAGGED, sizeof(struct pf_mtag),
+                   M_NOWAIT);
+               if (mtag == NULL)
+                       return (NULL);
+               /* payload sits right behind the m_tag header */
+               bzero(mtag + 1, sizeof(struct pf_mtag));
+               m_tag_prepend(m, mtag);
         }
 
+       return ((struct pf_mtag *)(mtag + 1));
+}
+
+/*
+ * Test rule r's tag condition against the packet's current tag.
+ *
+ * *tag caches the tag across rule evaluations: while it is still -1 it
+ * is loaded from pf_mtag->tag.  Returns non-zero when r->match_tag
+ * equals *tag, with the sense inverted if r->match_tag_not is set.
+ *
+ * pf_mtag is dereferenced without a check whenever *tag == -1.
+ * NOTE(review): callers presumably guarantee it is non-NULL - confirm.
+ * The mbuf argument m is not used here.
+ */
+int
+pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag,
+    int *tag)
+{
+       if (*tag == -1)
+               *tag = pf_mtag->tag;
+
         return ((!r->match_tag_not && r->match_tag == *tag) ||
             (r->match_tag_not && r->match_tag != *tag));
 }
 
-void
-pf_tag_packet(struct mbuf *m, int tag)
+/*
+ * Record a tag and/or routing table id in the packet's pf tag.
+ *
+ * tag is stored only when > 0 and rtableid only when >= 0; if neither
+ * applies nothing is touched.  When pf_mtag is NULL the tag is obtained
+ * (and allocated if needed) through pf_get_mtag(m).
+ *
+ * Returns 0 on success or when there was nothing to do, 1 when the pf
+ * tag could not be obtained.
+ */
+int
+pf_tag_packet(struct mbuf *m, struct pf_mtag *pf_mtag, int tag, int rtableid)
 {
-       if (tag <= 0)
-               return;
+       if (tag <= 0 && rtableid < 0)
+               return (0);
+
+       if (pf_mtag == NULL)
+               if ((pf_mtag = pf_get_mtag(m)) == NULL)
+                       return (1);
+       if (tag > 0)
+               pf_mtag->tag = tag;
+       if (rtableid >= 0)
+               pf_mtag->rtableid = rtableid;
 
-       m->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
-       m->m_pkthdr.pf_tag = tag;
+       return (0);
 }
 
-#define PF_STEP_INTO_ANCHOR(r, a, s, n)                                        \
-       do {                                                            \
-               if ((r) == NULL || (r)->anchor == NULL ||               \
-                   (s) != NULL || (a) != NULL)                         \
-                       panic("PF_STEP_INTO_ANCHOR");                   \
-               (a) = (r);                                              \
-               (s) = TAILQ_FIRST(&(r)->anchor->rulesets);              \
-               (r) = NULL;                                             \
-               while ((s) != NULL && ((r) =                            \
-                   TAILQ_FIRST((s)->rules[n].active.ptr)) == NULL)     \
-                       (s) = TAILQ_NEXT((s), entries);                 \
-               if ((r) == NULL) {                                      \
-                       (r) = TAILQ_NEXT((a), entries);                 \
-                       (a) = NULL;                                     \
-               }                                                       \
-       } while (0)
+/*
+ * Descend from the anchor rule *r into the ruleset(s) of its anchor.
+ *
+ * Clears the anchor's match flag (and *match, if given), pushes a frame
+ * holding the current ruleset and rule onto pf_anchor_stack, then
+ * points *rs at the anchor's ruleset - or, for a wildcard anchor, at
+ * the ruleset of its first child - and *r at that ruleset's first
+ * active rule of type n.  At depth 0 the anchor rule is also stored in
+ * *a when a is non-NULL.
+ *
+ * If the stack is full the anchor is skipped: a message is printed and
+ * *r simply advances to the next rule.  A wildcard anchor without
+ * children leaves *r NULL with the frame still pushed.
+ */
+static void
+pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
+    struct pf_rule **r, struct pf_rule **a, int *match)
+{
+       struct pf_anchor_stackframe     *frame;
+       struct pf_rule                  *anchor_rule = *r;
+
+       anchor_rule->anchor->match = 0;
+       if (match != NULL)
+               *match = 0;
+
+       if (*depth >= sizeof(pf_anchor_stack) /
+           sizeof(pf_anchor_stack[0])) {
+               kprintf("pf_step_into_anchor: stack overflow\n");
+               *r = TAILQ_NEXT(anchor_rule, entries);
+               return;
+       }
+
+       /* remember the outermost anchor rule for the caller */
+       if (*depth == 0 && a != NULL)
+               *a = anchor_rule;
+
+       /* push a frame recording where to resume afterwards */
+       frame = &pf_anchor_stack[*depth];
+       (*depth)++;
+       frame->rs = *rs;
+       frame->r = anchor_rule;
+
+       if (!anchor_rule->anchor_wildcard) {
+               frame->parent = NULL;
+               frame->child = NULL;
+               *rs = &anchor_rule->anchor->ruleset;
+       } else {
+               frame->parent = &anchor_rule->anchor->children;
+               frame->child = RB_MIN(pf_anchor_node, frame->parent);
+               if (frame->child == NULL) {
+                       /* no children: caller steps out again */
+                       *r = NULL;
+                       return;
+               }
+               *rs = &frame->child->ruleset;
+       }
+
+       *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+}
 
-#define PF_STEP_OUT_OF_ANCHOR(r, a, s, n)                              \
-       do {                                                            \
-               if ((r) != NULL || (a) == NULL || (s) == NULL)          \
-                       panic("PF_STEP_OUT_OF_ANCHOR");                 \
-               (s) = TAILQ_NEXT((s), entries);                         \
-               while ((s) != NULL && ((r) =                            \
-                   TAILQ_FIRST((s)->rules[n].active.ptr)) == NULL)     \
-                       (s) = TAILQ_NEXT((s), entries);                 \
-               if ((r) == NULL) {                                      \
-                       (r) = TAILQ_NEXT((a), entries);                 \
-                       (a) = NULL;                                     \
-               }                                                       \
-       } while (0)
+/*
+ * Resume rule evaluation after the rules of an anchor are exhausted.
+ *
+ * Works on the top frame of pf_anchor_stack.  For a wildcard frame
+ * (parent and child both set) it first moves on to the next child
+ * anchor and continues with that child's first active rule of type n,
+ * skipping children that have no such rule.  Once there is no further
+ * child - or for a plain anchor - the frame is popped: *rs is restored,
+ * *a is cleared when the stack becomes empty (if a is non-NULL), and
+ * *r is set to the rule following the anchor rule.  This repeats for
+ * enclosing frames while *r is NULL and frames remain.
+ *
+ * match is optional and may be NULL; when given, a non-zero *match is
+ * folded into the anchor's match flag and reset for the next child.
+ *
+ * Returns the quick flag of an anchor rule that was popped with a
+ * recorded match (in the anchor or in *match), otherwise 0.
+ */
+int
+pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
+    struct pf_rule **r, struct pf_rule **a, int *match)
+{
+       struct pf_anchor_stackframe     *f;
+       int quick = 0;
+
+       do {
+               if (*depth <= 0)
+                       break;
+               f = pf_anchor_stack + *depth - 1;
+               if (f->parent != NULL && f->child != NULL) {
+                       if (f->child->match ||
+                           (match != NULL && *match)) {
+                               f->r->anchor->match = 1;
+                               /*
+                                * This branch is also reached through
+                                * f->child->match alone, so match may
+                                * still be NULL here.
+                                */
+                               if (match != NULL)
+                                       *match = 0;
+                       }
+                       f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
+                       if (f->child != NULL) {
+                               *rs = &f->child->ruleset;
+                               *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+                               /* empty child: loop again for the next one */
+                               if (*r == NULL)
+                                       continue;
+                               else
+                                       break;
+                       }
+               }
+               /* pop the frame and continue after the anchor rule */
+               (*depth)--;
+               if (*depth == 0 && a != NULL)
+                       *a = NULL;
+               *rs = f->rs;
+               if (f->r->anchor->match || (match != NULL && *match))
+                       quick = f->r->quick;
+               *r = TAILQ_NEXT(f->r, entries);
+       } while (*r == NULL);
+
+       return (quick);
+}
 
 #ifdef INET6
 void
@@ -1754,20 +2183,27 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
        if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
                return (1);
        if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
-               if (af == AF_INET) {
+               switch (af) {
+#ifdef INET
+               case AF_INET:
                        if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
                            (rpool->opts & PF_POOL_TYPEMASK) !=
                            PF_POOL_ROUNDROBIN)
                                return (1);
                         raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
                         rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
-               } else {
+                       break;
+#endif /* INET */
+#ifdef INET6
+               case AF_INET6:
                        if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
                            (rpool->opts & PF_POOL_TYPEMASK) !=
                            PF_POOL_ROUNDROBIN)
                                return (1);
                        raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
                        rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
+                       break;
+#endif /* INET6 */
                }
        } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
                if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
@@ -1789,25 +2225,29 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
                        switch (af) {
 #ifdef INET
                        case AF_INET:
-                               rpool->counter.addr32[0] = karc4random();
+                               rpool->counter.addr32[0] = htonl(karc4random());
                                break;
 #endif /* INET */
 #ifdef INET6
                        case AF_INET6:
                                if (rmask->addr32[3] != 0xffffffff)
-                                       rpool->counter.addr32[3] = karc4random();
+                                       rpool->counter.addr32[3] =
+                                           htonl(karc4random());
                                else
                                        break;
                                if (rmask->addr32[2] != 0xffffffff)
-                                       rpool->counter.addr32[2] = karc4random();
+                                       rpool->counter.addr32[2] =
+                                           htonl(karc4random());
                                else
                                        break;
                                if (rmask->addr32[1] != 0xffffffff)
-                                       rpool->counter.addr32[1] = karc4random();
+                                       rpool->counter.addr32[1] =
+                                           htonl(karc4random());
                                else
                                        break;
                                if (rmask->addr32[0] != 0xffffffff)
-                                       rpool->counter.addr32[0] = karc4random();
+                                       rpool->counter.addr32[0] =
+                                           htonl(karc4random());
                                break;
 #endif /* INET6 */
                        }
@@ -1868,6 +2308,8 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
 
        get_addr:
                PF_ACPY(naddr, &rpool->counter, af);
+               if (init_addr != NULL && PF_AZERO(init_addr, af))
+                       PF_ACPY(init_addr, naddr, af);
                PF_AINC(&rpool->counter, af);
                break;
        }
@@ -1890,7 +2332,7 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
     struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
     struct pf_src_node **sn)
 {
-       struct pf_state         key;
+       struct pf_state_cmp     key;
        struct pf_addr          init_addr;
        u_int16_t               cut;
 
@@ -1898,6 +2340,11 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
        if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
                return (1);
 
+       if (proto == IPPROTO_ICMP) {
+               low = 1;
+               high = 65535;
+       }
+
        do {
                key.af = af;
                key.proto = proto;
@@ -1909,8 +2356,9 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
                 * port search; start random, step;
                 * similar 2 portloop in in_pcbbind
                 */
-               if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
-                       key.gwy.port = 0;
+               if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
+                   proto == IPPROTO_ICMP)) {
+                       key.gwy.port = dport;
                        if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL)
                                return (0);
                } else if (low == 0 && high == 0) {
@@ -1932,7 +2380,7 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
                                high = tmp;
                        }
                        /* low < high */
-                       cut = karc4random() % (1 + high - low) + low;
+                       cut = htonl(karc4random()) % (1 + high - low) + low;
                        /* low <= cut <= high */
                        for (tmp = cut; tmp <= high; ++(tmp)) {
                                key.gwy.port = htons(tmp);
@@ -1974,8 +2422,11 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
     int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
     struct pf_addr *daddr, u_int16_t dport, int rs_num)
 {
-       struct pf_rule          *r, *rm = NULL, *anchorrule = NULL;
+       struct pf_rule          *r, *rm = NULL;
        struct pf_ruleset       *ruleset = NULL;
+       int                      tag = -1;
+       int                      rtableid = -1;
+       int                      asd = 0;
 
        r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
        while (r && rm == NULL) {
@@ -1992,8 +2443,7 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
                }
 
                r->evaluations++;
-               if (r->kif != NULL &&
-                   (r->kif != kif && r->kif != kif->pfik_parent) == !r->ifnot)
+               if (pfi_kif_match(r->kif, kif) == r->ifnot)
                        r = r->skip[PF_SKIP_IFP].ptr;
                else if (r->direction && r->direction != direction)
                        r = r->skip[PF_SKIP_DIR].ptr;
@@ -2001,7 +2451,8 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
                        r = r->skip[PF_SKIP_AF].ptr;
                else if (r->proto && r->proto != pd->proto)
                        r = r->skip[PF_SKIP_PROTO].ptr;
-               else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, src->not))
+               else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
+                   src->neg, kif))
                        r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
                            PF_SKIP_DST_ADDR].ptr;
                else if (src->port_op && !pf_match_port(src->port_op,
@@ -2009,28 +2460,38 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
                        r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
                            PF_SKIP_DST_PORT].ptr;
                else if (dst != NULL &&
-                   PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->not))
+                   PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
                        r = r->skip[PF_SKIP_DST_ADDR].ptr;
-               else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, 0))
+               else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
+                   0, NULL))
                        r = TAILQ_NEXT(r, entries);
                else if (dst != NULL && dst->port_op &&
                    !pf_match_port(dst->port_op, dst->port[0],
                    dst->port[1], dport))
                        r = r->skip[PF_SKIP_DST_PORT].ptr;
+               else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
+                       r = TAILQ_NEXT(r, entries);
                else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
                    IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
                    off, pd->hdr.tcp), r->os_fingerprint)))
                        r = TAILQ_NEXT(r, entries);
-               else if (r->anchorname[0] && r->anchor == NULL)
-                       r = TAILQ_NEXT(r, entries);
-               else if (r->anchor == NULL)
+               else {
+                       if (r->tag)
+                               tag = r->tag;
+                       if (r->rtableid >= 0)
+                               rtableid = r->rtableid;
+                       if (r->anchor == NULL) {
                                rm = r;
-               else
-                       PF_STEP_INTO_ANCHOR(r, anchorrule, ruleset, rs_num);
-               if (r == NULL && anchorrule != NULL)
-                       PF_STEP_OUT_OF_ANCHOR(r, anchorrule, ruleset,
-                           rs_num);
+                       } else
+                               pf_step_into_anchor(&asd, &ruleset, rs_num,
+                                   &r, NULL, NULL);
+               }
+               if (r == NULL)
+                       pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
+                           NULL, NULL);
        }
+       if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid))
+               return (NULL);
        if (rm != NULL && (rm->action == PF_NONAT ||
            rm->action == PF_NORDR || rm->action == PF_NOBINAT))
                return (NULL);
@@ -2082,7 +2543,9 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                        switch (direction) {
                        case PF_OUT:
                                if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
-                                       if (pd->af == AF_INET) {
+                                       switch (pd->af) {
+#ifdef INET
+                                       case AF_INET:
                                                if (r->rpool.cur->addr.p.dyn->
                                                    pfid_acnt4 < 1)
                                                        return (NULL);
@@ -2092,7 +2555,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                                                    &r->rpool.cur->addr.p.dyn->
                                                    pfid_mask4,
                                                    saddr, AF_INET);
-                                       } else {
+                                               break;
+#endif /* INET */
+#ifdef INET6
+                                       case AF_INET6:
                                                if (r->rpool.cur->addr.p.dyn->
                                                    pfid_acnt6 < 1)
                                                        return (NULL);
@@ -2102,6 +2568,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                                                    &r->rpool.cur->addr.p.dyn->
                                                    pfid_mask6,
                                                    saddr, AF_INET6);
+                                               break;
+#endif /* INET6 */
                                        }
                                } else
                                        PF_POOLMASK(naddr,
@@ -2110,8 +2578,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                                            saddr, pd->af);
                                break;
                        case PF_IN:
-                               if (r->src.addr.type == PF_ADDR_DYNIFTL){
-                                       if (pd->af == AF_INET) {
+                               if (r->src.addr.type == PF_ADDR_DYNIFTL) {
+                                       switch (pd->af) {
+#ifdef INET
+                                       case AF_INET:
                                                if (r->src.addr.p.dyn->
                                                    pfid_acnt4 < 1)
                                                        return (NULL);
@@ -2121,7 +2591,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                                                    &r->src.addr.p.dyn->
                                                    pfid_mask4,
                                                    daddr, AF_INET);
-                                       } else {
+                                               break;
+#endif /* INET */
+#ifdef INET6
+                                       case AF_INET6:
                                                if (r->src.addr.p.dyn->
                                                    pfid_acnt6 < 1)
                                                        return (NULL);
@@ -2131,6 +2604,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                                                    &r->src.addr.p.dyn->
                                                    pfid_mask6,
                                                    daddr, AF_INET6);
+                                               break;
+#endif /* INET6 */
                                        }
                                } else
                                        PF_POOLMASK(naddr,
@@ -2141,9 +2616,13 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
                        }
                        break;
                case PF_RDR: {
-                       if (pf_map_addr(r->af, r, saddr, naddr, NULL, sn))
+                       if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
                                return (NULL);
-
+                       if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
+                           PF_POOL_BITMASK)
+                               PF_POOLMASK(naddr, naddr,
+                                   &r->rpool.cur->addr.v.a.mask, daddr,
+                                   pd->af);
                        if (r->rpool.proxy_port[1]) {
                                u_int32_t       tmp_nport;
 
@@ -2201,7 +2680,7 @@ in_pcblookup_hash_handler(struct netmsg *msg0)
 #endif /* SMP */
 
 int
-pf_socket_lookup(uid_t *uid, gid_t *gid, int direction, struct pf_pdesc *pd)
+pf_socket_lookup(int direction, struct pf_pdesc *pd, struct inpcb *inp_arg)
 {
        struct pf_addr          *saddr, *daddr;
        u_int16_t                sport, dport;
@@ -2212,8 +2691,11 @@ pf_socket_lookup(uid_t *uid, gid_t *gid, int direction, struct pf_pdesc *pd)
 #endif
        int                      pi_cpu = 0;
 
-       *uid = UID_MAX;
-       *gid = GID_MAX;
+       if (pd == NULL)
+               return (-1);
+       pd->lookup.uid = UID_MAX;
+       pd->lookup.gid = GID_MAX;
+       pd->lookup.pid = NO_PID;
        if (direction == PF_IN) {
                saddr = pd->src;
                daddr = pd->dst;
@@ -2288,7 +2770,7 @@ pf_socket_lookup(uid_t *uid, gid_t *gid, int direction, struct pf_pdesc *pd)
                            &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
 
                        if (inp == NULL)
-                               return (0);
+                               return (-1);
                        break;
                }
                /* FALLTHROUGH if SMP and on other CPU */
@@ -2309,10 +2791,10 @@ pf_socket_lookup(uid_t *uid, gid_t *gid, int direction, struct pf_pdesc *pd)
                break;
 
        default:
-               return (0);
+               return (-1);
        }
-       *uid = inp->inp_socket->so_cred->cr_uid;
-       *gid = inp->inp_socket->so_cred->cr_groups[0];
+       pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
+       pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
        return (1);
 }
 
@@ -2380,6 +2862,7 @@ pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
                        break;
                case TCPOPT_MAXSEG:
                        bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
+                       NTOHS(mss);
                        /* FALLTHROUGH */
                default:
                        optlen = opt[1];
@@ -2474,22 +2957,38 @@ pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
 int
 pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
     struct pfi_kif *kif, struct mbuf *m, int off, void *h,
-    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm)
+    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
+    struct ifqueue *ifq, struct inpcb *inp)
 {
        struct pf_rule          *nr = NULL;
        struct pf_addr          *saddr = pd->src, *daddr = pd->dst;
        struct tcphdr           *th = pd->hdr.tcp;
        u_int16_t                bport, nport = 0;
        sa_family_t              af = pd->af;
-       int                      lookup = -1;
-       uid_t                    uid;
-       gid_t                    gid;
        struct pf_rule          *r, *a = NULL;
        struct pf_ruleset       *ruleset = NULL;
        struct pf_src_node      *nsn = NULL;
        u_short                  reason;
        int                      rewrite = 0;
-       int                      tag = -1;
+       int                      tag = -1, rtableid = -1;
+       u_int16_t                mss = tcp_mssdflt;
+       int                      asd = 0;
+       int                      match = 0;
+
+       if (pf_check_congestion(ifq)) {
+               REASON_SET(&reason, PFRES_CONGEST);
+               return (PF_DROP);
+       }
+
+       if (inp != NULL)
+               pd->lookup.done = pf_socket_lookup(direction, pd, inp);
+       else if (debug_pfugidhack) {
+               crit_exit();
+               DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
+               pd->lookup.done = pf_socket_lookup(direction, pd, inp);
+               crit_enter();
+       }
+       
 
        r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 
@@ -2525,8 +3024,7 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        while (r != NULL) {
                r->evaluations++;
-               if (r->kif != NULL &&
-                   (r->kif != kif && r->kif != kif->pfik_parent) == !r->ifnot)
+               if (pfi_kif_match(r->kif, kif) == r->ifnot)
                        r = r->skip[PF_SKIP_IFP].ptr;
                else if (r->direction && r->direction != direction)
                        r = r->skip[PF_SKIP_DIR].ptr;
@@ -2534,37 +3032,37 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                        r = r->skip[PF_SKIP_AF].ptr;
                else if (r->proto && r->proto != IPPROTO_TCP)
                        r = r->skip[PF_SKIP_PROTO].ptr;
-               else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.not))
+               else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
+                   r->src.neg, kif))
                        r = r->skip[PF_SKIP_SRC_ADDR].ptr;
                else if (r->src.port_op && !pf_match_port(r->src.port_op,
                    r->src.port[0], r->src.port[1], th->th_sport))
                        r = r->skip[PF_SKIP_SRC_PORT].ptr;
-               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.not))
+               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
+                   r->dst.neg, NULL))
                        r = r->skip[PF_SKIP_DST_ADDR].ptr;
                else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
                    r->dst.port[0], r->dst.port[1], th->th_dport))
                        r = r->skip[PF_SKIP_DST_PORT].ptr;
-               else if (r->tos && !(r->tos & pd->tos))
+               else if (r->tos && !(r->tos == pd->tos))
                        r = TAILQ_NEXT(r, entries);
                else if (r->rule_flag & PFRULE_FRAGMENT)
                        r = TAILQ_NEXT(r, entries);
                else if ((r->flagset & th->th_flags) != r->flags)
                        r = TAILQ_NEXT(r, entries);
-               else if (r->uid.op && (lookup != -1 || (lookup =
-                   pf_socket_lookup(&uid, &gid, direction, pd), 1)) &&
+               else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
+                   pf_socket_lookup(direction, pd, inp), 1)) &&
                    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
-                   uid))
+                   pd->lookup.uid))
                        r = TAILQ_NEXT(r, entries);
-               else if (r->gid.op && (lookup != -1 || (lookup =
-                   pf_socket_lookup(&uid, &gid, direction, pd), 1)) &&
+               else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
+                   pf_socket_lookup(direction, pd, inp), 1)) &&
                    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
-                   gid))
+                   pd->lookup.gid))
                        r = TAILQ_NEXT(r, entries);
                else if (r->prob && r->prob <= karc4random())
                        r = TAILQ_NEXT(r, entries);
-               else if (r->match_tag && !pf_match_tag(m, r, nr, &tag))
-                       r = TAILQ_NEXT(r, entries);
-               else if (r->anchorname[0] && r->anchor == NULL)
+               else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
                        r = TAILQ_NEXT(r, entries);
                else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
                    pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))
@@ -2572,7 +3070,10 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                else {
                        if (r->tag)
                                tag = r->tag;
+                       if (r->rtableid >= 0)
+                               rtableid = r->rtableid;
                        if (r->anchor == NULL) {
+                               match = 1;
                                *rm = r;
                                *am = a;
                                *rsm = ruleset;
@@ -2580,12 +3081,12 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                                        break;
                                r = TAILQ_NEXT(r, entries);
                        } else
-                               PF_STEP_INTO_ANCHOR(r, a, ruleset,
-                                   PF_RULESET_FILTER);
+                               pf_step_into_anchor(&asd, &ruleset,
+                                   PF_RULESET_FILTER, &r, &a, &match);
                }
-               if (r == NULL && a != NULL)
-                       PF_STEP_OUT_OF_ANCHOR(r, a, ruleset,
-                           PF_RULESET_FILTER);
+               if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
+                   PF_RULESET_FILTER, &r, &a, &match))
+                       break;
        }
        r = *rm;
        a = *am;
@@ -2593,10 +3094,11 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        REASON_SET(&reason, PFRES_MATCH);
 
-       if (r->log) {
+       if (r->log || (nr != NULL && nr->natpass && nr->log)) {
                if (rewrite)
                        m_copyback(m, off, sizeof(*th), (caddr_t)th);
-               PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset);
+               PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
+                   a, ruleset, pd);
        }
 
        if ((r->action == PF_DROP) &&
@@ -2627,7 +3129,7 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                        pf_send_tcp(r, af, pd->dst,
                            pd->src, th->th_dport, th->th_sport,
                            ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
-                           r->return_ttl);
+                           r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
                } else if ((af == AF_INET) && r->return_icmp)
                        pf_send_icmp(m, r->return_icmp >> 8,
                            r->return_icmp & 255, af, r);
@@ -2636,10 +3138,14 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                            r->return_icmp6 & 255, af, r);
        }
 
-       if (r->action == PF_DROP)
+       if (r->action == PF_DROP) {
                return (PF_DROP);
+       }
 
-       pf_tag_packet(m, tag);
+       if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
+               REASON_SET(&reason, PFRES_MEMORY);
+               return (PF_DROP);
+       }
 
        if (r->keep_state || nr != NULL ||
            (pd->flags & PFDESC_TCP_NORM)) {
@@ -2651,21 +3157,29 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
                len = pd->tot_len - off - (th->th_off << 2);
 
                /* check maximums */
-               if (r->max_states && (r->states >= r->max_states))
+               if (r->max_states && (r->states >= r->max_states)) {
+                       pf_status.lcounters[LCNT_STATES]++;
+                       REASON_SET(&reason, PFRES_MAXSTATES);
                        goto cleanup;
-               /* src node for flter rule */
+               }
+               /* src node for filter rule */
                if ((r->rule_flag & PFRULE_SRCTRACK ||
                    r->rpool.opts & PF_POOL_STICKYADDR) &&
-                   pf_insert_src_node(&sn, r, saddr, af) != 0)
+                   pf_insert_src_node(&sn, r, saddr, af) != 0) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                /* src node for translation rule */
                if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
                    ((direction == PF_OUT &&
                    pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
-                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0)))
+                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                s = pool_get(&pf_state_pl, PR_NOWAIT);
                if (s == NULL) {
+                       REASON_SET(&reason, PFRES_MEMORY);
 cleanup:
                        if (sn != NULL && sn->states == 0 && sn->expire == 0) {
                                RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
@@ -2680,20 +3194,17 @@ cleanup:
                                pf_status.src_nodes--;
                                pool_put(&pf_src_tree_pl, nsn);
                        }
-                       REASON_SET(&reason, PFRES_MEMORY);
                        return (PF_DROP);
                }
                bzero(s, sizeof(*s));
-               r->states++;
-               if (a != NULL)
-                       a->states++;
                s->rule.ptr = r;
                s->nat_rule.ptr = nr;
-               if (s->nat_rule.ptr != NULL)
-                       s->nat_rule.ptr->states++;
                s->anchor.ptr = a;
+               STATE_INC_COUNTERS(s);
                s->allow_opts = r->allow_opts;
-               s->log = r->log & 2;
+               s->log = r->log & PF_LOG_ALL;
+               if (nr != NULL)
+                       s->log |= nr->log & PF_LOG_ALL;
                s->proto = IPPROTO_TCP;
                s->direction = direction;
                s->af = af;
@@ -2731,30 +3242,14 @@ cleanup:
                if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
                    r->keep_state == PF_STATE_MODULATE) {
                        /* Generate sequence number modulator */
-                       while ((s->src.seqdiff = karc4random()) == 0)
+                       while ((s->src.seqdiff =
+                           pf_new_isn(s) - s->src.seqlo) == 0)
                                ;
                        pf_change_a(&th->th_seq, &th->th_sum,
                            htonl(s->src.seqlo + s->src.seqdiff), 0);
                        rewrite = 1;
                } else
                        s->src.seqdiff = 0;
-
-               /*
-                * WARNING!  NetBSD patched this to not scale max_win up
-                * on the initial SYN, but they failed to correct the code
-                * in pf_test_state_tcp() that 'undid' the scaling, and they
-                * failed to remove the scale factor on successful window
-                * scale negotiation (and doing so would be difficult in the
-                * face of retransmission, without adding more flags to the
-                * state structure).
-                * 
-                * After discussions with Daniel Hartmeier and Max Laier
-                * I've decided not to apply the NetBSD patch.
-                * 
-                * The worst that happens is that the undo code on window
-                * scale negotiation failures will produce a larger
-                * max_win then actual.
-                */
                if (th->th_flags & TH_SYN) {
                        s->src.seqhi++;
                        s->src.wscale = pf_get_wscale(m, off, th->th_off, af);
@@ -2791,29 +3286,37 @@ cleanup:
                    off, pd, th, &s->src, &s->dst)) {
                        REASON_SET(&reason, PFRES_MEMORY);
                        pf_src_tree_remove_state(s);
+                       STATE_DEC_COUNTERS(s);
                        pool_put(&pf_state_pl, s);
                        return (PF_DROP);
                }
                if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
-                   pf_normalize_tcp_stateful(m, off, pd, &reason, th, &s->src,
-                   &s->dst, &rewrite)) {
+                   pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
+                   &s->src, &s->dst, &rewrite)) {
+                       /* This really shouldn't happen!!! */
+                       DPFPRINTF(PF_DEBUG_URGENT,
+                           ("pf_normalize_tcp_stateful failed on first pkt"));
                        pf_normalize_tcp_cleanup(s);
                        pf_src_tree_remove_state(s);
+                       STATE_DEC_COUNTERS(s);
                        pool_put(&pf_state_pl, s);
                        return (PF_DROP);
                }
                if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
                        pf_normalize_tcp_cleanup(s);
-                       REASON_SET(&reason, PFRES_MEMORY);
+                       REASON_SET(&reason, PFRES_STATEINS);
                        pf_src_tree_remove_state(s);
+                       STATE_DEC_COUNTERS(s);
                        pool_put(&pf_state_pl, s);
                        return (PF_DROP);
                } else
                        *sm = s;
+               if (tag > 0) {
+                       pf_tag_ref(tag);
+                       s->tag = tag;
+               }
                if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
                    r->keep_state == PF_STATE_SYNPROXY) {
-                       u_int16_t mss;
-
                        s->src.state = PF_TCPS_PROXY_SRC;
                        if (nr != NULL) {
                                if (direction == PF_OUT) {
@@ -2826,7 +3329,7 @@ cleanup:
                                            bport, 0, af);
                                }
                        }
-                       s->src.seqhi = karc4random();
+                       s->src.seqhi = htonl(karc4random());
                        /* Find mss option */
                        mss = pf_get_mss(m, off, th->th_off, af);
                        mss = pf_calc_mss(saddr, af, mss);
@@ -2834,7 +3337,8 @@ cleanup:
                        s->src.mss = mss;
                        pf_send_tcp(r, af, daddr, saddr, th->th_dport,
                            th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
-                           TH_SYN|TH_ACK, 0, s->src.mss, 0);
+                           TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
+                       REASON_SET(&reason, PFRES_SYNPROXY);
                        return (PF_SYNPROXY_DROP);
                }
        }
@@ -2849,24 +3353,29 @@ cleanup:
 int
 pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
     struct pfi_kif *kif, struct mbuf *m, int off, void *h,
-    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm)
+    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
+    struct ifqueue *ifq, struct inpcb *inp)
 {
        struct pf_rule          *nr = NULL;
        struct pf_addr          *saddr = pd->src, *daddr = pd->dst;
        struct udphdr           *uh = pd->hdr.udp;
        u_int16_t                bport, nport = 0;
        sa_family_t              af = pd->af;
-       int                      lookup = -1;
-       uid_t                    uid;
-       gid_t                    gid;
        struct pf_rule          *r, *a = NULL;
        struct pf_ruleset       *ruleset = NULL;
        struct pf_src_node      *nsn = NULL;
        u_short                  reason;
        int                      rewrite = 0;
-       int                      tag = -1;
-
-       r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
+       int                      tag = -1, rtableid = -1;
+       int                      asd = 0;
+       int                      match = 0;
+
+       if (pf_check_congestion(ifq)) {
+               REASON_SET(&reason, PFRES_CONGEST);
+               return (PF_DROP);
+       }
+
+       r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 
        if (direction == PF_OUT) {
                bport = nport = uh->uh_sport;
@@ -2900,8 +3409,7 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        while (r != NULL) {
                r->evaluations++;
-               if (r->kif != NULL &&
-                   (r->kif != kif && r->kif != kif->pfik_parent) == !r->ifnot)
+               if (pfi_kif_match(r->kif, kif) == r->ifnot)
                        r = r->skip[PF_SKIP_IFP].ptr;
                else if (r->direction && r->direction != direction)
                        r = r->skip[PF_SKIP_DIR].ptr;
@@ -2909,42 +3417,45 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
                        r = r->skip[PF_SKIP_AF].ptr;
                else if (r->proto && r->proto != IPPROTO_UDP)
                        r = r->skip[PF_SKIP_PROTO].ptr;
-               else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.not))
+               else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
+                   r->src.neg, kif))
                        r = r->skip[PF_SKIP_SRC_ADDR].ptr;
                else if (r->src.port_op && !pf_match_port(r->src.port_op,
                    r->src.port[0], r->src.port[1], uh->uh_sport))
                        r = r->skip[PF_SKIP_SRC_PORT].ptr;
-               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.not))
+               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
+                   r->dst.neg, NULL))
                        r = r->skip[PF_SKIP_DST_ADDR].ptr;
                else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
                    r->dst.port[0], r->dst.port[1], uh->uh_dport))
                        r = r->skip[PF_SKIP_DST_PORT].ptr;
-               else if (r->tos && !(r->tos & pd->tos))
+               else if (r->tos && !(r->tos == pd->tos))
                        r = TAILQ_NEXT(r, entries);
                else if (r->rule_flag & PFRULE_FRAGMENT)
                        r = TAILQ_NEXT(r, entries);
-               else if (r->uid.op && (lookup != -1 || (lookup =
-                   pf_socket_lookup(&uid, &gid, direction, pd), 1)) &&
+               else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
+                   pf_socket_lookup(direction, pd, inp), 1)) &&
                    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
-                   uid))
+                   pd->lookup.uid))
                        r = TAILQ_NEXT(r, entries);
-               else if (r->gid.op && (lookup != -1 || (lookup =
-                   pf_socket_lookup(&uid, &gid, direction, pd), 1)) &&
+               else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
+                   pf_socket_lookup(direction, pd, inp), 1)) &&
                    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
-                   gid))
+                   pd->lookup.gid))
                        r = TAILQ_NEXT(r, entries);
                else if (r->prob && r->prob <= karc4random())
                        r = TAILQ_NEXT(r, entries);
-               else if (r->match_tag && !pf_match_tag(m, r, nr, &tag))
-                       r = TAILQ_NEXT(r, entries);
-               else if (r->anchorname[0] && r->anchor == NULL)
+               else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
                        r = TAILQ_NEXT(r, entries);
                else if (r->os_fingerprint != PF_OSFP_ANY)
                        r = TAILQ_NEXT(r, entries);
                else {
                        if (r->tag)
                                tag = r->tag;
+                       if (r->rtableid >= 0)
+                               rtableid = r->rtableid;
                        if (r->anchor == NULL) {
+                               match = 1;
                                *rm = r;
                                *am = a;
                                *rsm = ruleset;
@@ -2952,12 +3463,12 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
                                        break;
                                r = TAILQ_NEXT(r, entries);
                        } else
-                               PF_STEP_INTO_ANCHOR(r, a, ruleset,
-                                   PF_RULESET_FILTER);
+                               pf_step_into_anchor(&asd, &ruleset,
+                                   PF_RULESET_FILTER, &r, &a, &match);
                }
-               if (r == NULL && a != NULL)
-                       PF_STEP_OUT_OF_ANCHOR(r, a, ruleset,
-                           PF_RULESET_FILTER);
+               if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
+                   PF_RULESET_FILTER, &r, &a, &match))
+                       break;
        }
        r = *rm;
        a = *am;
@@ -2965,10 +3476,11 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        REASON_SET(&reason, PFRES_MATCH);
 
-       if (r->log) {
+       if (r->log || (nr != NULL && nr->natpass && nr->log)) {
                if (rewrite)
                        m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
-               PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset);
+               PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
+                   a, ruleset, pd);
        }
 
        if ((r->action == PF_DROP) &&
@@ -2997,7 +3509,10 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
        if (r->action == PF_DROP)
                return (PF_DROP);
 
-       pf_tag_packet(m, tag);
+       if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
+               REASON_SET(&reason, PFRES_MEMORY);
+               return (PF_DROP);
+       }
 
        if (r->keep_state || nr != NULL) {
                /* create new state */
@@ -3005,21 +3520,29 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
                struct pf_src_node *sn = NULL;
 
                /* check maximums */
-               if (r->max_states && (r->states >= r->max_states))
+               if (r->max_states && (r->states >= r->max_states)) {
+                       pf_status.lcounters[LCNT_STATES]++;
+                       REASON_SET(&reason, PFRES_MAXSTATES);
                        goto cleanup;
-               /* src node for flter rule */
+               }
+               /* src node for filter rule */
                if ((r->rule_flag & PFRULE_SRCTRACK ||
                    r->rpool.opts & PF_POOL_STICKYADDR) &&
-                   pf_insert_src_node(&sn, r, saddr, af) != 0)
+                   pf_insert_src_node(&sn, r, saddr, af) != 0) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                /* src node for translation rule */
                if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
                    ((direction == PF_OUT &&
                    pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
-                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0)))
+                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                s = pool_get(&pf_state_pl, PR_NOWAIT);
                if (s == NULL) {
+                       REASON_SET(&reason, PFRES_MEMORY);
 cleanup:
                        if (sn != NULL && sn->states == 0 && sn->expire == 0) {
                                RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
@@ -3034,20 +3557,17 @@ cleanup:
                                pf_status.src_nodes--;
                                pool_put(&pf_src_tree_pl, nsn);
                        }
-                       REASON_SET(&reason, PFRES_MEMORY);
                        return (PF_DROP);
                }
                bzero(s, sizeof(*s));
-               r->states++;
-               if (a != NULL)
-                       a->states++;
                s->rule.ptr = r;
                s->nat_rule.ptr = nr;
-               if (s->nat_rule.ptr != NULL)
-                       s->nat_rule.ptr->states++;
                s->anchor.ptr = a;
+               STATE_INC_COUNTERS(s);
                s->allow_opts = r->allow_opts;
-               s->log = r->log & 2;
+               s->log = r->log & PF_LOG_ALL;
+               if (nr != NULL)
+                       s->log |= nr->log & PF_LOG_ALL;
                s->proto = IPPROTO_UDP;
                s->direction = direction;
                s->af = af;
@@ -3093,12 +3613,17 @@ cleanup:
                        s->nat_src_node->states++;
                }
                if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
-                       REASON_SET(&reason, PFRES_MEMORY);
+                       REASON_SET(&reason, PFRES_STATEINS);
                        pf_src_tree_remove_state(s);
+                       STATE_DEC_COUNTERS(s);
                        pool_put(&pf_state_pl, s);
                        return (PF_DROP);
                } else
                        *sm = s;
+               if (tag > 0) {
+                       pf_tag_ref(tag);
+                       s->tag = tag;
+               }
        }
 
        /* copy back packet headers if we performed NAT operations */
@@ -3111,7 +3636,8 @@ cleanup:
 int
 pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
     struct pfi_kif *kif, struct mbuf *m, int off, void *h,
-    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm)
+    struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
+    struct ifqueue *ifq)
 {
        struct pf_rule          *nr = NULL;
        struct pf_addr          *saddr = pd->src, *daddr = pd->dst;
@@ -3119,14 +3645,21 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
        struct pf_ruleset       *ruleset = NULL;
        struct pf_src_node      *nsn = NULL;
        u_short                  reason;
-       u_int16_t                icmpid = 0;
+       u_int16_t                icmpid = 0, bport, nport = 0;
        sa_family_t              af = pd->af;
        u_int8_t                 icmptype = 0, icmpcode = 0;
        int                      state_icmp = 0;
-       int                      tag = -1;
+       int                      tag = -1, rtableid = -1;
 #ifdef INET6
        int                      rewrite = 0;
 #endif /* INET6 */
+       int                      asd = 0;
+       int                      match = 0;
+
+       if (pf_check_congestion(ifq)) {
+               REASON_SET(&reason, PFRES_CONGEST);
+               return (PF_DROP);
+       }
 
        switch (pd->proto) {
 #ifdef INET
@@ -3161,15 +3694,21 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
        r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 
        if (direction == PF_OUT) {
+               bport = nport = icmpid;
                /* check outgoing packet for BINAT/NAT */
                if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn,
-                   saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) {
+                   saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) !=
+                   NULL) {
                        PF_ACPY(&pd->baddr, saddr, af);
                        switch (af) {
 #ifdef INET
                        case AF_INET:
                                pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
                                    pd->naddr.v4.s_addr, 0);
+                               pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
+                                   pd->hdr.icmp->icmp_cksum, icmpid, nport, 0);
+                               pd->hdr.icmp->icmp_id = nport;
+                               m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
                                break;
 #endif /* INET */
 #ifdef INET6
@@ -3185,9 +3724,11 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
                        pd->nat_rule = nr;
                }
        } else {
+               bport = nport = icmpid;
                /* check incoming packet for BINAT/RDR */
                if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn,
-                   saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) {
+                   saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) !=
+                   NULL) {
                        PF_ACPY(&pd->baddr, daddr, af);
                        switch (af) {
 #ifdef INET
@@ -3212,8 +3753,7 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        while (r != NULL) {
                r->evaluations++;
-               if (r->kif != NULL &&
-                   (r->kif != kif && r->kif != kif->pfik_parent) == !r->ifnot)
+               if (pfi_kif_match(r->kif, kif) == r->ifnot)
                        r = r->skip[PF_SKIP_IFP].ptr;
                else if (r->direction && r->direction != direction)
                        r = r->skip[PF_SKIP_DIR].ptr;
@@ -3221,30 +3761,33 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
                        r = r->skip[PF_SKIP_AF].ptr;
                else if (r->proto && r->proto != pd->proto)
                        r = r->skip[PF_SKIP_PROTO].ptr;
-               else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.not))
+               else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
+                   r->src.neg, kif))
                        r = r->skip[PF_SKIP_SRC_ADDR].ptr;
-               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.not))
+               else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
+                   r->dst.neg, NULL))
                        r = r->skip[PF_SKIP_DST_ADDR].ptr;
                else if (r->type && r->type != icmptype + 1)
                        r = TAILQ_NEXT(r, entries);
                else if (r->code && r->code != icmpcode + 1)
                        r = TAILQ_NEXT(r, entries);
-               else if (r->tos && !(r->tos & pd->tos))
+               else if (r->tos && !(r->tos == pd->tos))
                        r = TAILQ_NEXT(r, entries);
                else if (r->rule_flag & PFRULE_FRAGMENT)
                        r = TAILQ_NEXT(r, entries);
                else if (r->prob && r->prob <= karc4random())
                        r = TAILQ_NEXT(r, entries);
-               else if (r->match_tag && !pf_match_tag(m, r, nr, &tag))
-                       r = TAILQ_NEXT(r, entries);
-               else if (r->anchorname[0] && r->anchor == NULL)
+               else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
                        r = TAILQ_NEXT(r, entries);
                else if (r->os_fingerprint != PF_OSFP_ANY)
                        r = TAILQ_NEXT(r, entries);
                else {
                        if (r->tag)
                                tag = r->tag;
+                       if (r->rtableid >= 0)
+                               rtableid = r->rtableid;
                        if (r->anchor == NULL) {
+                               match = 1;
                                *rm = r;
                                *am = a;
                                *rsm = ruleset;
@@ -3252,12 +3795,12 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
                                        break;
                                r = TAILQ_NEXT(r, entries);
                        } else
-                               PF_STEP_INTO_ANCHOR(r, a, ruleset,
-                                   PF_RULESET_FILTER);
+                               pf_step_into_anchor(&asd, &ruleset,
+                                   PF_RULESET_FILTER, &r, &a, &match);
                }
-               if (r == NULL && a != NULL)
-                       PF_STEP_OUT_OF_ANCHOR(r, a, ruleset,
-                           PF_RULESET_FILTER);
+               if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
+                   PF_RULESET_FILTER, &r, &a, &match))
+                       break;
        }
        r = *rm;
        a = *am;
@@ -3265,19 +3808,23 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
 
        REASON_SET(&reason, PFRES_MATCH);
 
-       if (r->log) {
+       if (r->log || (nr != NULL && nr->natpass && nr->log)) {
 #ifdef INET6
                if (rewrite)
                        m_copyback(m, off, sizeof(struct icmp6_hdr),
                            (caddr_t)pd->hdr.icmp6);
 #endif /* INET6 */
-               PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset);
+               PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
+                   a, ruleset, pd);
        }
 
        if (r->action != PF_PASS)
                return (PF_DROP);
 
-       pf_tag_packet(m, tag);
+       if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
+               REASON_SET(&reason, PFRES_MEMORY);
+               return (PF_DROP);
+       }
 
        if (!state_icmp && (r->keep_state || nr != NULL)) {
                /* create new state */
@@ -3285,21 +3832,29 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
                struct pf_src_node *sn = NULL;
 
                /* check maximums */
-               if (r->max_states && (r->states >= r->max_states))
+               if (r->max_states && (r->states >= r->max_states)) {
+                       pf_status.lcounters[LCNT_STATES]++;
+                       REASON_SET(&reason, PFRES_MAXSTATES);
                        goto cleanup;
-               /* src node for flter rule */
+               }
+               /* src node for filter rule */
                if ((r->rule_flag & PFRULE_SRCTRACK ||
                    r->rpool.opts & PF_POOL_STICKYADDR) &&
-                   pf_insert_src_node(&sn, r, saddr, af) != 0)
+                   pf_insert_src_node(&sn, r, saddr, af) != 0) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                /* src node for translation rule */
                if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
                    ((direction == PF_OUT &&
                    pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
-                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0)))
+                   (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
+                       REASON_SET(&reason, PFRES_SRCLIMIT);
                        goto cleanup;
+               }
                s = pool_get(&pf_state_pl, PR_NOWAIT);
                if (s == NULL) {
+                       REASON_SET(&reason, PFRES_MEMORY);
 cleanup:
                        if (sn != NULL && sn->states == 0 && sn->expire == 0) {
                                RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
@@ -3314,43 +3869,44 @@ cleanup:
                                pf_status.src_nodes--;
                                pool_put(&pf_src_tree_pl, nsn);
                        }
-                       REASON_SET(&reason, PFRES_MEMORY);
                        return (PF_DROP);
                }
                bzero(s, sizeof(*s));
-               r->states++;
-               if (a != NULL)
-                       a->states++;
                s->rule.ptr = r;
                s->nat_rule.ptr = nr;
-               if (s->nat_rule.ptr != NULL)
-                       s->nat_rule.ptr->states++;
                s->anchor.ptr = a;
+               STATE_INC_COUNTERS(s);
                s->allow_opts = r->allow_opts;
-               s->log = r->log & 2;
+               s->log = r->log & PF_LOG_ALL;
+               if (nr != NULL)
+                       s->log |= nr->log & PF_LOG_ALL;
                s->proto = pd->proto;
                s->direction = direction;
                s->af = af;
                if (direction == PF_OUT) {
                        PF_ACPY(&s->gwy.addr, saddr, af);
-                       s->gwy.port = icmpid;
+                       s->gwy.port = nport;
                        PF_ACPY(&s->ext.addr, daddr, af);
-                       s->ext.port = icmpid;
-                       if (nr != NULL)
+                       s->ext.port = 0;
+                       if (nr != NULL) {
                                PF_ACPY(&s->lan.addr, &pd->baddr, af);
-                       else
+                               s->lan.port = bport;
+                       } else {