gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2004 The DragonFly Project. All rights reserved.
	3	*
	4	* Copyright (c) 2001 Daniel Hartmeier
	5	* Copyright (c) 2002 - 2008 Henning Brauer
	6	* All rights reserved.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	*
	12	* - Redistributions of source code must retain the above copyright
	13	* notice, this list of conditions and the following disclaimer.
	14	* - Redistributions in binary form must reproduce the above
	15	* copyright notice, this list of conditions and the following
	16	* disclaimer in the documentation and/or other materials provided
	17	* with the distribution.
	18	*
	19	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	20	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	21	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	22	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	23	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	24	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	25	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	26	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	27	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	28	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	29	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	30	* POSSIBILITY OF SUCH DAMAGE.
	31	*
	32	* Effort sponsored in part by the Defense Advanced Research Projects
	33	* Agency (DARPA) and Air Force Research Laboratory, Air Force
	34	* Materiel Command, USAF, under agreement number F30602-01-2-0537.
	35	*
	36	*/
	37
	38	#include "opt_inet.h"
	39	#include "opt_inet6.h"
	40
	41	#include <sys/param.h>
	42	#include <sys/systm.h>
	43	#include <sys/malloc.h>
	44	#include <sys/mbuf.h>
	45	#include <sys/filio.h>
	46	#include <sys/socket.h>
	47	#include <sys/socketvar.h>
	48	#include <sys/kernel.h>
	49	#include <sys/time.h>
	50	#include <sys/sysctl.h>
	51	#include <sys/endian.h>
	52	#include <sys/proc.h>
	53	#include <sys/kthread.h>
	54	#include <sys/spinlock.h>
	55
	56	#include <sys/md5.h>
	57
	58	#include <net/if.h>
	59	#include <net/if_types.h>
	60	#include <net/bpf.h>
	61	#include <net/netisr2.h>
	62	#include <net/route.h>
	63
	64	#include <netinet/in.h>
	65	#include <netinet/in_var.h>
	66	#include <netinet/in_systm.h>
	67	#include <netinet/ip.h>
	68	#include <netinet/ip_var.h>
	69	#include <netinet/tcp.h>
	70	#include <netinet/tcp_seq.h>
	71	#include <netinet/udp.h>
	72	#include <netinet/ip_icmp.h>
	73	#include <netinet/in_pcb.h>
	74	#include <netinet/tcp_timer.h>
	75	#include <netinet/tcp_var.h>
	76	#include <netinet/udp_var.h>
	77	#include <netinet/icmp_var.h>
	78	#include <netinet/if_ether.h>
	79
	80	#include <net/pf/pfvar.h>
	81	#include <net/pf/if_pflog.h>
	82
	83	#include <net/pf/if_pfsync.h>
	84
	85	#ifdef INET6
	86	#include <netinet/ip6.h>
	87	#include <netinet/icmp6.h>
	88	#include <netinet6/nd6.h>
	89	#include <netinet6/ip6_var.h>
	90	#include <netinet6/in6_pcb.h>
	91	#endif /* INET6 */
	92
	93	#include <sys/in_cksum.h>
	94	#include <sys/ucred.h>
	95	#include <machine/limits.h>
	96	#include <sys/msgport2.h>
	97	#include <sys/spinlock2.h>
	98	#include <net/netmsg2.h>
	99	#include <net/toeplitz2.h>
	100
	101	extern int ip_optcopy(struct ip , struct ip );
	102	extern int debug_pfugidhack;
	103
	104	/*
	105	* pf_token - shared lock for cpu-localized operations,
	106	* exclusive lock otherwise.
	107	*
	108	* pf_gtoken- exclusive lock used for initialization.
	109	*/
	110	struct lwkt_token pf_token = LWKT_TOKEN_INITIALIZER(pf_token);
	111	struct lwkt_token pf_gtoken = LWKT_TOKEN_INITIALIZER(pf_gtoken);
	112
	113	#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) kprintf x
	114
	115	#define FAIL(code) { error = (code); goto done; }
	116
	117	/*
	118	* Global variables
	119	*/
	120
	121	/* mask radix tree */
	122	struct radix_node_head *pf_maskhead;
	123
	124	/* state tables */
	125	struct pf_state_tree pf_statetbl; / incls one global table */
	126	struct pf_state **purge_cur;
	127	struct pf_altqqueue pf_altqs[2];
	128	struct pf_palist pf_pabuf;
	129	struct pf_altqqueue *pf_altqs_active;
	130	struct pf_altqqueue *pf_altqs_inactive;
	131	struct pf_status pf_status;
	132
	133	u_int32_t ticket_altqs_active;
	134	u_int32_t ticket_altqs_inactive;
	135	int altqs_inactive_open;
	136	u_int32_t ticket_pabuf;
	137
	138	MD5_CTX pf_tcp_secret_ctx;
	139	u_char pf_tcp_secret[16];
	140	int pf_tcp_secret_init;
	141	int pf_tcp_iss_off;
	142
	143	struct pf_anchor_stackframe {
	144	struct pf_ruleset *rs;
	145	struct pf_rule *r;
	146	struct pf_anchor_node *parent;
	147	struct pf_anchor *child;
	148	} pf_anchor_stack[64];
	149
	150	struct malloc_type pf_src_tree_pl, pf_rule_pl, *pf_pooladdr_pl;
	151	struct malloc_type pf_state_pl, pf_state_key_pl, *pf_state_item_pl;
	152	struct malloc_type *pf_altq_pl;
	153
	154	void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
	155
	156	void pf_init_threshold(struct pf_threshold *, u_int32_t,
	157	u_int32_t);
	158	void pf_add_threshold(struct pf_threshold *);
	159	int pf_check_threshold(struct pf_threshold *);
	160
	161	void pf_change_ap(struct pf_addr , u_int16_t ,
	162	u_int16_t , u_int16_t , struct pf_addr *,
	163	u_int16_t, u_int8_t, sa_family_t);
	164	int pf_modulate_sack(struct mbuf , int, struct pf_pdesc ,
	165	struct tcphdr , struct pf_state_peer );
	166	#ifdef INET6
	167	void pf_change_a6(struct pf_addr , u_int16_t ,
	168	struct pf_addr *, u_int8_t);
	169	#endif /* INET6 */
	170	void pf_change_icmp(struct pf_addr , u_int16_t ,
	171	struct pf_addr , struct pf_addr , u_int16_t,
	172	u_int16_t , u_int16_t , u_int16_t *,
	173	u_int16_t *, u_int8_t, sa_family_t);
	174	void pf_send_tcp(const struct pf_rule *, sa_family_t,
	175	const struct pf_addr , const struct pf_addr ,
	176	u_int16_t, u_int16_t, u_int32_t, u_int32_t,
	177	u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
	178	u_int16_t, struct ether_header , struct ifnet );
	179	void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
	180	sa_family_t, struct pf_rule *);
	181	struct pf_rule pf_match_translation(struct pf_pdesc , struct mbuf *,
	182	int, int, struct pfi_kif *,
	183	struct pf_addr , u_int16_t, struct pf_addr ,
	184	u_int16_t, int);
	185	struct pf_rule pf_get_translation(struct pf_pdesc , struct mbuf *,
	186	int, int, struct pfi_kif , struct pf_src_node *,
	187	struct pf_state_key , struct pf_state_key ,
	188	struct pf_state_key , struct pf_state_key ,
	189	struct pf_addr , struct pf_addr ,
	190	u_int16_t, u_int16_t);
	191	void pf_detach_state(struct pf_state *);
	192	int pf_state_key_setup(struct pf_pdesc , struct pf_rule ,
	193	struct pf_state_key , struct pf_state_key ,
	194	struct pf_state_key , struct pf_state_key ,
	195	struct pf_addr , struct pf_addr ,
	196	u_int16_t, u_int16_t);
	197	void pf_state_key_detach(struct pf_state *, int);
	198	u_int32_t pf_tcp_iss(struct pf_pdesc *);
	199	int pf_test_rule(struct pf_rule , struct pf_state ,
	200	int, struct pfi_kif , struct mbuf , int,
	201	void , struct pf_pdesc , struct pf_rule **,
	202	struct pf_ruleset *, struct ifqueue , struct inpcb *);
	203	static __inline int pf_create_state(struct pf_rule , struct pf_rule ,
	204	struct pf_rule , struct pf_pdesc ,
	205	struct pf_src_node , struct pf_state_key ,
	206	struct pf_state_key , struct pf_state_key ,
	207	struct pf_state_key , struct mbuf , int,
	208	u_int16_t, u_int16_t, int , struct pfi_kif ,
	209	struct pf_state **, int, u_int16_t, u_int16_t,
	210	int);
	211	int pf_test_fragment(struct pf_rule **, int,
	212	struct pfi_kif , struct mbuf , void *,
	213	struct pf_pdesc , struct pf_rule *,
	214	struct pf_ruleset **);
	215	int pf_tcp_track_full(struct pf_state_peer *,
	216	struct pf_state_peer , struct pf_state *,
	217	struct pfi_kif , struct mbuf , int,
	218	struct pf_pdesc , u_short , int *);
	219	int pf_tcp_track_sloppy(struct pf_state_peer *,
	220	struct pf_state_peer , struct pf_state *,
	221	struct pf_pdesc , u_short );
	222	int pf_test_state_tcp(struct pf_state **, int,
	223	struct pfi_kif , struct mbuf , int,
	224	void , struct pf_pdesc , u_short *);
	225	int pf_test_state_udp(struct pf_state **, int,
	226	struct pfi_kif , struct mbuf , int,
	227	void , struct pf_pdesc );
	228	int pf_test_state_icmp(struct pf_state **, int,
	229	struct pfi_kif , struct mbuf , int,
	230	void , struct pf_pdesc , u_short *);
	231	int pf_test_state_other(struct pf_state **, int,
	232	struct pfi_kif , struct mbuf , struct pf_pdesc *);
	233	void pf_step_into_anchor(int , struct pf_ruleset *, int,
	234	struct pf_rule , struct pf_rule , int *);
	235	int pf_step_out_of_anchor(int , struct pf_ruleset *,
	236	int, struct pf_rule , struct pf_rule ,
	237	int *);
	238	void pf_hash(struct pf_addr , struct pf_addr ,
	239	struct pf_poolhashkey *, sa_family_t);
	240	int pf_map_addr(u_int8_t, struct pf_rule *,
	241	struct pf_addr , struct pf_addr ,
	242	struct pf_addr , struct pf_src_node *);
	243	int pf_get_sport(struct pf_pdesc *,
	244	sa_family_t, u_int8_t, struct pf_rule *,
	245	struct pf_addr , struct pf_addr ,
	246	u_int16_t, u_int16_t,
	247	struct pf_addr , u_int16_t ,
	248	u_int16_t, u_int16_t,
	249	struct pf_src_node **);
	250	void pf_route(struct mbuf *, struct pf_rule , int,
	251	struct ifnet , struct pf_state ,
	252	struct pf_pdesc *);
	253	void pf_route6(struct mbuf *, struct pf_rule , int,
	254	struct ifnet , struct pf_state ,
	255	struct pf_pdesc *);
	256	u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t,
	257	sa_family_t);
	258	u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t,
	259	sa_family_t);
	260	u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
	261	u_int16_t);
	262	void pf_set_rt_ifp(struct pf_state *,
	263	struct pf_addr *);
	264	int pf_check_proto_cksum(struct mbuf *, int, int,
	265	u_int8_t, sa_family_t);
	266	struct pf_divert pf_get_divert(struct mbuf );
	267	void pf_print_state_parts(struct pf_state *,
	268	struct pf_state_key , struct pf_state_key );
	269	int pf_addr_wrap_neq(struct pf_addr_wrap *,
	270	struct pf_addr_wrap *);
	271	struct pf_state pf_find_state(struct pfi_kif ,
	272	struct pf_state_key_cmp , u_int, struct mbuf );
	273	int pf_src_connlimit(struct pf_state *);
	274	int pf_check_congestion(struct ifqueue *);
	275
	276	extern int pf_end_threads;
	277
	278	struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
	279	{ &pf_state_pl, PFSTATE_HIWAT },
	280	{ &pf_src_tree_pl, PFSNODE_HIWAT },
	281	{ &pf_frent_pl, PFFRAG_FRENT_HIWAT },
	282	{ &pfr_ktable_pl, PFR_KTABLE_HIWAT },
	283	{ &pfr_kentry_pl, PFR_KENTRY_HIWAT }
	284	};
	285
	286	/*
	287	* If route-to and direction is out we match with no further processing
	288	* (rt_kif must be assigned and not equal to the out interface)
	289	* If reply-to and direction is in we match with no further processing
	290	* (rt_kif must be assigned and not equal to the in interface)
	291	*/
	292	#define STATE_LOOKUP(i, k, d, s, m) \
	293	do { \
	294	s = pf_find_state(i, k, d, m); \
	295	if (s == NULL \|\| (s)->timeout == PFTM_PURGE) \
	296	return (PF_DROP); \
	297	if (d == PF_OUT && \
	298	(((s)->rule.ptr->rt == PF_ROUTETO && \
	299	(s)->rule.ptr->direction == PF_OUT) \|\| \
	300	((s)->rule.ptr->rt == PF_REPLYTO && \
	301	(s)->rule.ptr->direction == PF_IN)) && \
	302	(s)->rt_kif != NULL && \
	303	(s)->rt_kif != i) \
	304	return (PF_PASS); \
	305	} while (0)
	306
	307	#define BOUND_IFACE(r, k) \
	308	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all
	309
	310	#define STATE_INC_COUNTERS(s) \
	311	do { \
	312	atomic_add_int(&s->rule.ptr->states_cur, 1); \
	313	s->rule.ptr->states_tot++; \
	314	if (s->anchor.ptr != NULL) { \
	315	atomic_add_int(&s->anchor.ptr->states_cur, 1); \
	316	s->anchor.ptr->states_tot++; \
	317	} \
	318	if (s->nat_rule.ptr != NULL) { \
	319	atomic_add_int(&s->nat_rule.ptr->states_cur, 1); \
	320	s->nat_rule.ptr->states_tot++; \
	321	} \
	322	} while (0)
	323
	324	#define STATE_DEC_COUNTERS(s) \
	325	do { \
	326	if (s->nat_rule.ptr != NULL) \
	327	atomic_add_int(&s->nat_rule.ptr->states_cur, -1); \
	328	if (s->anchor.ptr != NULL) \
	329	atomic_add_int(&s->anchor.ptr->states_cur, -1); \
	330	atomic_add_int(&s->rule.ptr->states_cur, -1); \
	331	} while (0)
	332
	333	static MALLOC_DEFINE(M_PFSTATEPL, "pfstatepl", "pf state pool list");
	334	static MALLOC_DEFINE(M_PFSRCTREEPL, "pfsrctpl", "pf source tree pool list");
	335	static MALLOC_DEFINE(M_PFSTATEKEYPL, "pfstatekeypl", "pf state key pool list");
	336	static MALLOC_DEFINE(M_PFSTATEITEMPL, "pfstateitempl", "pf state item pool list");
	337
	338	static __inline int pf_src_compare(struct pf_src_node , struct pf_src_node );
	339	static __inline int pf_state_compare_key(struct pf_state_key *,
	340	struct pf_state_key *);
	341	static __inline int pf_state_compare_rkey(struct pf_state_key *,
	342	struct pf_state_key *);
	343	static __inline int pf_state_compare_id(struct pf_state *,
	344	struct pf_state *);
	345
	346	struct pf_src_tree *tree_src_tracking;
	347	struct pf_state_tree_id *tree_id;
	348	struct pf_state_queue *state_list;
	349	struct pf_counters *pf_counters;
	350
	351	RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
	352	RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
	353	RB_GENERATE(pf_state_rtree, pf_state_key, entry, pf_state_compare_rkey);
	354	RB_GENERATE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);
	355
	356	static __inline int
	357	pf_src_compare(struct pf_src_node a, struct pf_src_node b)
	358	{
	359	int diff;
	360
	361	if (a->rule.ptr > b->rule.ptr)
	362	return (1);
	363	if (a->rule.ptr < b->rule.ptr)
	364	return (-1);
	365	if ((diff = a->af - b->af) != 0)
	366	return (diff);
	367	switch (a->af) {
	368	#ifdef INET
	369	case AF_INET:
	370	if (a->addr.addr32[0] > b->addr.addr32[0])
	371	return (1);
	372	if (a->addr.addr32[0] < b->addr.addr32[0])
	373	return (-1);
	374	break;
	375	#endif /* INET */
	376	#ifdef INET6
	377	case AF_INET6:
	378	if (a->addr.addr32[3] > b->addr.addr32[3])
	379	return (1);
	380	if (a->addr.addr32[3] < b->addr.addr32[3])
	381	return (-1);
	382	if (a->addr.addr32[2] > b->addr.addr32[2])
	383	return (1);
	384	if (a->addr.addr32[2] < b->addr.addr32[2])
	385	return (-1);
	386	if (a->addr.addr32[1] > b->addr.addr32[1])
	387	return (1);
	388	if (a->addr.addr32[1] < b->addr.addr32[1])
	389	return (-1);
	390	if (a->addr.addr32[0] > b->addr.addr32[0])
	391	return (1);
	392	if (a->addr.addr32[0] < b->addr.addr32[0])
	393	return (-1);
	394	break;
	395	#endif /* INET6 */
	396	}
	397	return (0);
	398	}
	399
	400	u_int32_t
	401	pf_state_hash(struct pf_state_key *sk)
	402	{
	403	u_int32_t hv = (u_int32_t)(((intptr_t)sk >> 6) ^ ((intptr_t)sk >> 15));
	404	if (hv == 0) /* disallow 0 */
	405	hv = 1;
	406	return(hv);
	407	}
	408
	409	#ifdef INET6
	410	void
	411	pf_addrcpy(struct pf_addr dst, struct pf_addr src, sa_family_t af)
	412	{
	413	switch (af) {
	414	#ifdef INET
	415	case AF_INET:
	416	dst->addr32[0] = src->addr32[0];
	417	break;
	418	#endif /* INET */
	419	case AF_INET6:
	420	dst->addr32[0] = src->addr32[0];
	421	dst->addr32[1] = src->addr32[1];
	422	dst->addr32[2] = src->addr32[2];
	423	dst->addr32[3] = src->addr32[3];
	424	break;
	425	}
	426	}
	427	#endif /* INET6 */
	428
	429	void
	430	pf_init_threshold(struct pf_threshold *threshold,
	431	u_int32_t limit, u_int32_t seconds)
	432	{
	433	threshold->limit = limit * PF_THRESHOLD_MULT;
	434	threshold->seconds = seconds;
	435	threshold->count = 0;
	436	threshold->last = time_second;
	437	}
	438
	439	void
	440	pf_add_threshold(struct pf_threshold *threshold)
	441	{
	442	u_int32_t t = time_second, diff = t - threshold->last;
	443
	444	if (diff >= threshold->seconds)
	445	threshold->count = 0;
	446	else
	447	threshold->count -= threshold->count * diff /
	448	threshold->seconds;
	449	threshold->count += PF_THRESHOLD_MULT;
	450	threshold->last = t;
	451	}
	452
	453	int
	454	pf_check_threshold(struct pf_threshold *threshold)
	455	{
	456	return (threshold->count > threshold->limit);
	457	}
	458
	459	int
	460	pf_src_connlimit(struct pf_state *state)
	461	{
	462	int bad = 0;
	463	int cpu = mycpu->gd_cpuid;
	464
	465	atomic_add_int(&state->src_node->conn, 1);
	466	state->src.tcp_est = 1;
	467	pf_add_threshold(&state->src_node->conn_rate);
	468
	469	if (state->rule.ptr->max_src_conn &&
	470	state->rule.ptr->max_src_conn <
	471	state->src_node->conn) {
	472	PF_INC_LCOUNTER(LCNT_SRCCONN);
	473	bad++;
	474	}
	475
	476	if (state->rule.ptr->max_src_conn_rate.limit &&
	477	pf_check_threshold(&state->src_node->conn_rate)) {
	478	PF_INC_LCOUNTER(LCNT_SRCCONNRATE);
	479	bad++;
	480	}
	481
	482	if (!bad)
	483	return 0;
	484
	485	if (state->rule.ptr->overload_tbl) {
	486	struct pfr_addr p;
	487	u_int32_t killed = 0;
	488
	489	PF_INC_LCOUNTER(LCNT_OVERLOAD_TABLE);
	490	if (pf_status.debug >= PF_DEBUG_MISC) {
	491	kprintf("pf_src_connlimit: blocking address ");
	492	pf_print_host(&state->src_node->addr, 0,
	493	state->key[PF_SK_WIRE]->af);
	494	}
	495
	496	bzero(&p, sizeof(p));
	497	p.pfra_af = state->key[PF_SK_WIRE]->af;
	498	switch (state->key[PF_SK_WIRE]->af) {
	499	#ifdef INET
	500	case AF_INET:
	501	p.pfra_net = 32;
	502	p.pfra_ip4addr = state->src_node->addr.v4;
	503	break;
	504	#endif /* INET */
	505	#ifdef INET6
	506	case AF_INET6:
	507	p.pfra_net = 128;
	508	p.pfra_ip6addr = state->src_node->addr.v6;
	509	break;
	510	#endif /* INET6 */
	511	}
	512
	513	pfr_insert_kentry(state->rule.ptr->overload_tbl,
	514	&p, time_second);
	515
	516	/* kill existing states if that's required. */
	517	if (state->rule.ptr->flush) {
	518	struct pf_state_key *sk;
	519	struct pf_state *st;
	520
	521	PF_INC_LCOUNTER(LCNT_OVERLOAD_FLUSH);
	522	RB_FOREACH(st, pf_state_tree_id, &tree_id[cpu]) {
	523	sk = st->key[PF_SK_WIRE];
	524	/*
	525	* Kill states from this source. (Only those
	526	* from the same rule if PF_FLUSH_GLOBAL is not
	527	* set). (Only on current cpu).
	528	*/
	529	if (sk->af ==
	530	state->key[PF_SK_WIRE]->af &&
	531	((state->direction == PF_OUT &&
	532	PF_AEQ(&state->src_node->addr,
	533	&sk->addr[0], sk->af)) \|\|
	534	(state->direction == PF_IN &&
	535	PF_AEQ(&state->src_node->addr,
	536	&sk->addr[1], sk->af))) &&
	537	(state->rule.ptr->flush &
	538	PF_FLUSH_GLOBAL \|\|
	539	state->rule.ptr == st->rule.ptr)) {
	540	st->timeout = PFTM_PURGE;
	541	st->src.state = st->dst.state =
	542	TCPS_CLOSED;
	543	killed++;
	544	}
	545	}
	546	if (pf_status.debug >= PF_DEBUG_MISC)
	547	kprintf(", %u states killed", killed);
	548	}
	549	if (pf_status.debug >= PF_DEBUG_MISC)
	550	kprintf("\n");
	551	}
	552
	553	/* kill this state */
	554	state->timeout = PFTM_PURGE;
	555	state->src.state = state->dst.state = TCPS_CLOSED;
	556
	557	return 1;
	558	}
	559
	560	int
	561	pf_insert_src_node(struct pf_src_node *sn, struct pf_rule rule,
	562	struct pf_addr *src, sa_family_t af)
	563	{
	564	struct pf_src_node k;
	565	int cpu = mycpu->gd_cpuid;
	566
	567	bzero(&k, sizeof(k)); /* avoid gcc warnings */
	568	if (*sn == NULL) {
	569	k.af = af;
	570	PF_ACPY(&k.addr, src, af);
	571	if (rule->rule_flag & PFRULE_RULESRCTRACK \|\|
	572	rule->rpool.opts & PF_POOL_STICKYADDR)
	573	k.rule.ptr = rule;
	574	else
	575	k.rule.ptr = NULL;
	576	PF_INC_SCOUNTER(SCNT_SRC_NODE_SEARCH);
	577	*sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
	578	}
	579	if (*sn == NULL) {
	580	if (!rule->max_src_nodes \|\|
	581	rule->src_nodes < rule->max_src_nodes)
	582	(*sn) = kmalloc(sizeof(struct pf_src_node),
	583	M_PFSRCTREEPL, M_NOWAIT\|M_ZERO);
	584	else
	585	PF_INC_LCOUNTER(LCNT_SRCNODES);
	586	if ((*sn) == NULL)
	587	return (-1);
	588
	589	pf_init_threshold(&(*sn)->conn_rate,
	590	rule->max_src_conn_rate.limit,
	591	rule->max_src_conn_rate.seconds);
	592
	593	(*sn)->af = af;
	594	if (rule->rule_flag & PFRULE_RULESRCTRACK \|\|
	595	rule->rpool.opts & PF_POOL_STICKYADDR)
	596	(*sn)->rule.ptr = rule;
	597	else
	598	(*sn)->rule.ptr = NULL;
	599	PF_ACPY(&(*sn)->addr, src, af);
	600	if (RB_INSERT(pf_src_tree,
	601	&tree_src_tracking[cpu], *sn) != NULL) {
	602	if (pf_status.debug >= PF_DEBUG_MISC) {
	603	kprintf("pf: src_tree insert failed: ");
	604	pf_print_host(&(*sn)->addr, 0, af);
	605	kprintf("\n");
	606	}
	607	kfree(*sn, M_PFSRCTREEPL);
	608	return (-1);
	609	}
	610
	611	/*
	612	* Atomic op required to increment src_nodes in the rule
	613	* because we hold a shared token here (decrements will use
	614	* an exclusive token).
	615	*/
	616	(*sn)->creation = time_second;
	617	(*sn)->ruletype = rule->action;
	618	if ((*sn)->rule.ptr != NULL)
	619	atomic_add_int(&(*sn)->rule.ptr->src_nodes, 1);
	620	PF_INC_SCOUNTER(SCNT_SRC_NODE_INSERT);
	621	atomic_add_int(&pf_status.src_nodes, 1);
	622	} else {
	623	if (rule->max_src_states &&
	624	(*sn)->states >= rule->max_src_states) {
	625	PF_INC_LCOUNTER(LCNT_SRCSTATES);
	626	return (-1);
	627	}
	628	}
	629	return (0);
	630	}
	631
	632	/*
	633	* state table (indexed by the pf_state_key structure), normal RBTREE
	634	* comparison.
	635	*/
	636	static __inline int
	637	pf_state_compare_key(struct pf_state_key a, struct pf_state_key b)
	638	{
	639	int diff;
	640
	641	if ((diff = a->proto - b->proto) != 0)
	642	return (diff);
	643	if ((diff = a->af - b->af) != 0)
	644	return (diff);
	645	switch (a->af) {
	646	#ifdef INET
	647	case AF_INET:
	648	if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
	649	return (1);
	650	if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
	651	return (-1);
	652	if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
	653	return (1);
	654	if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
	655	return (-1);
	656	break;
	657	#endif /* INET */
	658	#ifdef INET6
	659	case AF_INET6:
	660	if (a->addr[0].addr32[3] > b->addr[0].addr32[3])
	661	return (1);
	662	if (a->addr[0].addr32[3] < b->addr[0].addr32[3])
	663	return (-1);
	664	if (a->addr[1].addr32[3] > b->addr[1].addr32[3])
	665	return (1);
	666	if (a->addr[1].addr32[3] < b->addr[1].addr32[3])
	667	return (-1);
	668	if (a->addr[0].addr32[2] > b->addr[0].addr32[2])
	669	return (1);
	670	if (a->addr[0].addr32[2] < b->addr[0].addr32[2])
	671	return (-1);
	672	if (a->addr[1].addr32[2] > b->addr[1].addr32[2])
	673	return (1);
	674	if (a->addr[1].addr32[2] < b->addr[1].addr32[2])
	675	return (-1);
	676	if (a->addr[0].addr32[1] > b->addr[0].addr32[1])
	677	return (1);
	678	if (a->addr[0].addr32[1] < b->addr[0].addr32[1])
	679	return (-1);
	680	if (a->addr[1].addr32[1] > b->addr[1].addr32[1])
	681	return (1);
	682	if (a->addr[1].addr32[1] < b->addr[1].addr32[1])
	683	return (-1);
	684	if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
	685	return (1);
	686	if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
	687	return (-1);
	688	if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
	689	return (1);
	690	if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
	691	return (-1);
	692	break;
	693	#endif /* INET6 */
	694	}
	695
	696	if ((diff = a->port[0] - b->port[0]) != 0)
	697	return (diff);
	698	if ((diff = a->port[1] - b->port[1]) != 0)
	699	return (diff);
	700
	701	return (0);
	702	}
	703
	704	/*
	705	* Used for RB_FIND only, compare in the reverse direction. The
	706	* element to be reversed is always (a), since we obviously can't
	707	* reverse the state tree depicted by (b).
	708	*/
	709	static __inline int
	710	pf_state_compare_rkey(struct pf_state_key a, struct pf_state_key b)
	711	{
	712	int diff;
	713
	714	if ((diff = a->proto - b->proto) != 0)
	715	return (diff);
	716	if ((diff = a->af - b->af) != 0)
	717	return (diff);
	718	switch (a->af) {
	719	#ifdef INET
	720	case AF_INET:
	721	if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
	722	return (1);
	723	if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
	724	return (-1);
	725	if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
	726	return (1);
	727	if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
	728	return (-1);
	729	break;
	730	#endif /* INET */
	731	#ifdef INET6
	732	case AF_INET6:
	733	if (a->addr[1].addr32[3] > b->addr[0].addr32[3])
	734	return (1);
	735	if (a->addr[1].addr32[3] < b->addr[0].addr32[3])
	736	return (-1);
	737	if (a->addr[0].addr32[3] > b->addr[1].addr32[3])
	738	return (1);
	739	if (a->addr[0].addr32[3] < b->addr[1].addr32[3])
	740	return (-1);
	741	if (a->addr[1].addr32[2] > b->addr[0].addr32[2])
	742	return (1);
	743	if (a->addr[1].addr32[2] < b->addr[0].addr32[2])
	744	return (-1);
	745	if (a->addr[0].addr32[2] > b->addr[1].addr32[2])
	746	return (1);
	747	if (a->addr[0].addr32[2] < b->addr[1].addr32[2])
	748	return (-1);
	749	if (a->addr[1].addr32[1] > b->addr[0].addr32[1])
	750	return (1);
	751	if (a->addr[1].addr32[1] < b->addr[0].addr32[1])
	752	return (-1);
	753	if (a->addr[0].addr32[1] > b->addr[1].addr32[1])
	754	return (1);
	755	if (a->addr[0].addr32[1] < b->addr[1].addr32[1])
	756	return (-1);
	757	if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
	758	return (1);
	759	if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
	760	return (-1);
	761	if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
	762	return (1);
	763	if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
	764	return (-1);
	765	break;
	766	#endif /* INET6 */
	767	}
	768
	769	if ((diff = a->port[1] - b->port[0]) != 0)
	770	return (diff);
	771	if ((diff = a->port[0] - b->port[1]) != 0)
	772	return (diff);
	773
	774	return (0);
	775	}
	776
	777	static __inline int
	778	pf_state_compare_id(struct pf_state a, struct pf_state b)
	779	{
	780	if (a->id > b->id)
	781	return (1);
	782	if (a->id < b->id)
	783	return (-1);
	784	if (a->creatorid > b->creatorid)
	785	return (1);
	786	if (a->creatorid < b->creatorid)
	787	return (-1);
	788
	789	return (0);
	790	}
	791
	792	int
	793	pf_state_key_attach(struct pf_state_key sk, struct pf_state s, int idx)
	794	{
	795	struct pf_state_item *si;
	796	struct pf_state_key *cur;
	797	int cpu;
	798	int error;
	799
	800	/*
	801	* PFSTATE_STACK_GLOBAL is set when the state might not hash to the
	802	* current cpu. The keys are managed on the global statetbl tree
	803	* for this case. Only translations (RDR, NAT) can cause this.
	804	*
	805	* When this flag is not set we must still check the global statetbl
	806	* for a collision, and if we find one we set the HALF_DUPLEX flag
	807	* in the state.
	808	*/
	809	if (s->state_flags & PFSTATE_STACK_GLOBAL) {
	810	cpu = ncpus;
	811	lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
	812	} else {
	813	cpu = mycpu->gd_cpuid;
	814	lockmgr(&pf_global_statetbl_lock, LK_SHARED);
	815	}
	816	KKASSERT(s->key[idx] == NULL); /* XXX handle this? */
	817
	818	if (pf_status.debug >= PF_DEBUG_MISC) {
	819	kprintf("state_key attach cpu %d (%08x:%d) %s (%08x:%d)\n",
	820	cpu,
	821	ntohl(sk->addr[0].addr32[0]), ntohs(sk->port[0]),
	822	(idx == PF_SK_WIRE ? "->" : "<-"),
	823	ntohl(sk->addr[1].addr32[0]), ntohs(sk->port[1]));
	824	}
	825
	826	/*
	827	* Check whether (e.g.) a PASS rule being put on a per-cpu tree
	828	* collides with a translation rule on the global tree. This is
	829	* NOT an error. We WANT to establish state for this case so the
	830	* packet path is short-cutted and doesn't need to scan the ruleset
	831	* on every packet. But the established state will only see one
	832	* side of a two-way packet conversation. To prevent this from
	833	* causing problems (e.g. generating a RST), we force PFSTATE_SLOPPY
	834	* to be set on the established state.
	835	*
	836	* A collision against RDR state can only occur with a PASS IN in the
	837	* opposite direction or a PASS OUT in the forwards direction. This
	838	* is because RDRs are processed on the input side.
	839	*
	840	* A collision against NAT state can only occur with a PASS IN in the
	841	* forwards direction or a PASS OUT in the opposite direction. This
	842	* is because NATs are processed on the output side.
	843	*
	844	* In both situations we need to do a reverse addr/port test because
	845	* the PASS IN or PASS OUT only establishes if it doesn't match the
	846	* established RDR state in the forwards direction. The direction
	847	* flag has to be ignored (it will be one way for a PASS IN and the
	848	* other way for a PASS OUT).
	849	*
	850	* pf_global_statetbl_lock will be locked shared when testing and
	851	* not entering into the global state table.
	852	*/
	853	if (cpu != ncpus &&
	854	(cur = RB_FIND(pf_state_rtree,
	855	(struct pf_state_rtree *)&pf_statetbl[ncpus],
	856	sk)) != NULL) {
	857	TAILQ_FOREACH(si, &cur->states, entry) {
	858	/*
	859	* NOTE: We must ignore direction mismatches.
	860	*/
	861	if (si->s->kif == s->kif) {
	862	s->state_flags \|= PFSTATE_HALF_DUPLEX \|
	863	PFSTATE_SLOPPY;
	864	if (pf_status.debug >= PF_DEBUG_MISC) {
	865	kprintf(
	866	"pf: %s key attach collision "
	867	"on %s: ",
	868	(idx == PF_SK_WIRE) ?
	869	"wire" : "stack",
	870	s->kif->pfik_name);
	871	pf_print_state_parts(s,
	872	(idx == PF_SK_WIRE) ? sk : NULL,
	873	(idx == PF_SK_STACK) ? sk : NULL);
	874	kprintf("\n");
	875	}
	876	break;
	877	}
	878	}
	879	}
	880
	881	/*
	882	* Enter into either the per-cpu or the global state table.
	883	*
	884	* pf_global_statetbl_lock will be locked exclusively when entering
	885	* into the global state table.
	886	*/
	887	if ((cur = RB_INSERT(pf_state_tree, &pf_statetbl[cpu], sk)) != NULL) {
	888	/* key exists. check for same kif, if none, add to key */
	889	TAILQ_FOREACH(si, &cur->states, entry) {
	890	if (si->s->kif == s->kif &&
	891	si->s->direction == s->direction) {
	892	if (pf_status.debug >= PF_DEBUG_MISC) {
	893	kprintf(
	894	"pf: %s key attach failed on %s: ",
	895	(idx == PF_SK_WIRE) ?
	896	"wire" : "stack",
	897	s->kif->pfik_name);
	898	pf_print_state_parts(s,
	899	(idx == PF_SK_WIRE) ? sk : NULL,
	900	(idx == PF_SK_STACK) ? sk : NULL);
	901	kprintf("\n");
	902	}
	903	kfree(sk, M_PFSTATEKEYPL);
	904	error = -1;
	905	goto failed; /* collision! */
	906	}
	907	}
	908	kfree(sk, M_PFSTATEKEYPL);
	909
	910	s->key[idx] = cur;
	911	} else {
	912	s->key[idx] = sk;
	913	}
	914
	915	if ((si = kmalloc(sizeof(struct pf_state_item),
	916	M_PFSTATEITEMPL, M_NOWAIT)) == NULL) {
	917	pf_state_key_detach(s, idx);
	918	error = -1;
	919	goto failed; /* collision! */
	920	}
	921	si->s = s;
	922
	923	/* list is sorted, if-bound states before floating */
	924	if (s->kif == pfi_all)
	925	TAILQ_INSERT_TAIL(&s->key[idx]->states, si, entry);
	926	else
	927	TAILQ_INSERT_HEAD(&s->key[idx]->states, si, entry);
	928
	929	error = 0;
	930	failed:
	931	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	932	return error;
	933	}
	934
	935	/*
	936	* NOTE: Can only be called indirectly via the purge thread with pf_token
	937	* exclusively locked.
	938	*/
	939	void
	940	pf_detach_state(struct pf_state *s)
	941	{
	942	if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
	943	s->key[PF_SK_WIRE] = NULL;
	944
	945	if (s->key[PF_SK_STACK] != NULL)
	946	pf_state_key_detach(s, PF_SK_STACK);
	947
	948	if (s->key[PF_SK_WIRE] != NULL)
	949	pf_state_key_detach(s, PF_SK_WIRE);
	950	}
	951
	952	/*
	953	* NOTE: Can only be called indirectly via the purge thread with pf_token
	954	* exclusively locked.
	955	*/
	956	void
	957	pf_state_key_detach(struct pf_state *s, int idx)
	958	{
	959	struct pf_state_item *si;
	960	int cpu;
	961
	962	/*
	963	* PFSTATE_STACK_GLOBAL is set for translations when the translated
	964	* address/port is not localized to the same cpu that the untranslated
	965	* address/port is on. The wire pf_state_key is managed on the global
	966	* statetbl tree for this case.
	967	*/
	968	if (s->state_flags & PFSTATE_STACK_GLOBAL) {
	969	cpu = ncpus;
	970	lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
	971	} else {
	972	cpu = mycpu->gd_cpuid;
	973	}
	974
	975	si = TAILQ_FIRST(&s->key[idx]->states);
	976	while (si && si->s != s)
	977	si = TAILQ_NEXT(si, entry);
	978
	979	if (si) {
	980	TAILQ_REMOVE(&s->key[idx]->states, si, entry);
	981	kfree(si, M_PFSTATEITEMPL);
	982	}
	983
	984	if (TAILQ_EMPTY(&s->key[idx]->states)) {
	985	RB_REMOVE(pf_state_tree, &pf_statetbl[cpu], s->key[idx]);
	986	if (s->key[idx]->reverse)
	987	s->key[idx]->reverse->reverse = NULL;
	988	if (s->key[idx]->inp)
	989	s->key[idx]->inp->inp_pf_sk = NULL;
	990	kfree(s->key[idx], M_PFSTATEKEYPL);
	991	}
	992	s->key[idx] = NULL;
	993
	994	if (s->state_flags & PFSTATE_STACK_GLOBAL)
	995	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	996	}
	997
	998	struct pf_state_key *
	999	pf_alloc_state_key(int pool_flags)
	1000	{
	1001	struct pf_state_key *sk;
	1002
	1003	sk = kmalloc(sizeof(struct pf_state_key), M_PFSTATEKEYPL, pool_flags);
	1004	if (sk) {
	1005	TAILQ_INIT(&sk->states);
	1006	}
	1007	return (sk);
	1008	}
	1009
	1010	int
	1011	pf_state_key_setup(struct pf_pdesc pd, struct pf_rule nr,
	1012	struct pf_state_key skw, struct pf_state_key sks,
	1013	struct pf_state_key skp, struct pf_state_key nkp,
	1014	struct pf_addr saddr, struct pf_addr daddr,
	1015	u_int16_t sport, u_int16_t dport)
	1016	{
	1017	KKASSERT((skp == NULL && nkp == NULL));
	1018
	1019	if ((*skp = pf_alloc_state_key(M_NOWAIT \| M_ZERO)) == NULL)
	1020	return (ENOMEM);
	1021
	1022	PF_ACPY(&(*skp)->addr[pd->sidx], saddr, pd->af);
	1023	PF_ACPY(&(*skp)->addr[pd->didx], daddr, pd->af);
	1024	(*skp)->port[pd->sidx] = sport;
	1025	(*skp)->port[pd->didx] = dport;
	1026	(*skp)->proto = pd->proto;
	1027	(*skp)->af = pd->af;
	1028
	1029	if (nr != NULL) {
	1030	if ((*nkp = pf_alloc_state_key(M_NOWAIT \| M_ZERO)) == NULL)
	1031	return (ENOMEM); /* caller must handle cleanup */
	1032
	1033	/* XXX maybe just bcopy and TAILQ_INIT(&(nkp)->states) /
	1034	PF_ACPY(&(nkp)->addr[0], &(skp)->addr[0], pd->af);
	1035	PF_ACPY(&(nkp)->addr[1], &(skp)->addr[1], pd->af);
	1036	(nkp)->port[0] = (skp)->port[0];
	1037	(nkp)->port[1] = (skp)->port[1];
	1038	(*nkp)->proto = pd->proto;
	1039	(*nkp)->af = pd->af;
	1040	} else {
	1041	nkp = skp;
	1042	}
	1043
	1044	if (pd->dir == PF_IN) {
	1045	skw = skp;
	1046	sks = nkp;
	1047	} else {
	1048	sks = skp;
	1049	skw = nkp;
	1050	}
	1051	return (0);
	1052	}
	1053
	1054	/*
	1055	* Insert pf_state with one or two state keys (allowing a reverse path lookup
	1056	* which is used by NAT). In the NAT case skw is the initiator (?) and
	1057	* sks is the target.
	1058	*/
	1059	int
	1060	pf_state_insert(struct pfi_kif kif, struct pf_state_key skw,
	1061	struct pf_state_key sks, struct pf_state s)
	1062	{
	1063	int cpu = mycpu->gd_cpuid;
	1064
	1065	s->kif = kif;
	1066	s->cpuid = cpu;
	1067
	1068	if (skw == sks) {
	1069	if (pf_state_key_attach(skw, s, PF_SK_WIRE))
	1070	return (-1);
	1071	s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
	1072	} else {
	1073	/*
	1074	skw->reverse = sks;
	1075	sks->reverse = skw;
	1076	*/
	1077	if (pf_state_key_attach(skw, s, PF_SK_WIRE)) {
	1078	kfree(sks, M_PFSTATEKEYPL);
	1079	return (-1);
	1080	}
	1081	if (pf_state_key_attach(sks, s, PF_SK_STACK)) {
	1082	pf_state_key_detach(s, PF_SK_WIRE);
	1083	return (-1);
	1084	}
	1085	}
	1086
	1087	if (s->id == 0 && s->creatorid == 0) {
	1088	u_int64_t sid;
	1089
	1090	sid = atomic_fetchadd_long(&pf_status.stateid, 1);
	1091	s->id = htobe64(sid);
	1092	s->creatorid = pf_status.hostid;
	1093	}
	1094
	1095	/*
	1096	* Calculate hash code for altq
	1097	*/
	1098	s->hash = crc32(s->key[PF_SK_WIRE], PF_STATE_KEY_HASH_LENGTH);
	1099
	1100	if (RB_INSERT(pf_state_tree_id, &tree_id[cpu], s) != NULL) {
	1101	if (pf_status.debug >= PF_DEBUG_MISC) {
	1102	kprintf("pf: state insert failed: "
	1103	"id: %016jx creatorid: %08x",
	1104	(uintmax_t)be64toh(s->id), ntohl(s->creatorid));
	1105	if (s->sync_flags & PFSTATE_FROMSYNC)
	1106	kprintf(" (from sync)");
	1107	kprintf("\n");
	1108	}
	1109	pf_detach_state(s);
	1110	return (-1);
	1111	}
	1112	TAILQ_INSERT_TAIL(&state_list[cpu], s, entry_list);
	1113	PF_INC_FCOUNTER(FCNT_STATE_INSERT);
	1114	atomic_add_int(&pf_status.states, 1);
	1115	pfi_kif_ref(kif, PFI_KIF_REF_STATE);
	1116	pfsync_insert_state(s);
	1117	return (0);
	1118	}
	1119
	1120	struct pf_state *
	1121	pf_find_state_byid(struct pf_state_cmp *key)
	1122	{
	1123	int cpu = mycpu->gd_cpuid;
	1124
	1125	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);
	1126
	1127	return (RB_FIND(pf_state_tree_id, &tree_id[cpu],
	1128	(struct pf_state *)key));
	1129	}
	1130
	1131	/*
	1132	* WARNING! May return a state structure that was localized to another cpu,
	1133	* destruction is typically protected by the callers pf_token.
	1134	* The element can only be destroyed
	1135	*/
	1136	struct pf_state *
	1137	pf_find_state(struct pfi_kif kif, struct pf_state_key_cmp key, u_int dir,
	1138	struct mbuf *m)
	1139	{
	1140	struct pf_state_key skey = (void )key;
	1141	struct pf_state_key *sk;
	1142	struct pf_state_item *si;
	1143	struct pf_state *s;
	1144	int cpu = mycpu->gd_cpuid;
	1145	int globalstl = 0;
	1146
	1147	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);
	1148
	1149	if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
	1150	((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse) {
	1151	sk = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse;
	1152	} else {
	1153	sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
	1154	if (sk == NULL) {
	1155	lockmgr(&pf_global_statetbl_lock, LK_SHARED);
	1156	sk = RB_FIND(pf_state_tree, &pf_statetbl[ncpus], skey);
	1157	if (sk == NULL) {
	1158	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	1159	return (NULL);
	1160	}
	1161	globalstl = 1;
	1162	}
	1163	if (dir == PF_OUT && m->m_pkthdr.pf.statekey) {
	1164	((struct pf_state_key *)
	1165	m->m_pkthdr.pf.statekey)->reverse = sk;
	1166	sk->reverse = m->m_pkthdr.pf.statekey;
	1167	}
	1168	}
	1169	if (dir == PF_OUT)
	1170	m->m_pkthdr.pf.statekey = NULL;
	1171
	1172	/* list is sorted, if-bound states before floating ones */
	1173	TAILQ_FOREACH(si, &sk->states, entry) {
	1174	if ((si->s->kif == pfi_all \|\| si->s->kif == kif) &&
	1175	sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
	1176	si->s->key[PF_SK_STACK])) {
	1177	break;
	1178	}
	1179	}
	1180
	1181	/*
	1182	* Extract state before potentially releasing the global statetbl
	1183	* lock. Ignore the state if the create is still in-progress as
	1184	* it can be deleted out from under us by the owning localized cpu.
	1185	* However, if CREATEINPROG is not set, state can only be deleted
	1186	* by the purge thread which we are protected from via our shared
	1187	* pf_token.
	1188	*/
	1189	if (si) {
	1190	s = si->s;
	1191	if (s && (s->state_flags & PFSTATE_CREATEINPROG))
	1192	s = NULL;
	1193	} else {
	1194	s = NULL;
	1195	}
	1196	if (globalstl)
	1197	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	1198	return s;
	1199	}
	1200
	1201	/*
	1202	* WARNING! May return a state structure that was localized to another cpu,
	1203	* destruction is typically protected by the callers pf_token.
	1204	*/
	1205	struct pf_state *
	1206	pf_find_state_all(struct pf_state_key_cmp key, u_int dir, int more)
	1207	{
	1208	struct pf_state_key skey = (void )key;
	1209	struct pf_state_key *sk;
	1210	struct pf_state_item si, ret = NULL;
	1211	struct pf_state *s;
	1212	int cpu = mycpu->gd_cpuid;
	1213	int globalstl = 0;
	1214
	1215	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);
	1216
	1217	sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
	1218	if (sk == NULL) {
	1219	lockmgr(&pf_global_statetbl_lock, LK_SHARED);
	1220	sk = RB_FIND(pf_state_tree, &pf_statetbl[ncpus], skey);
	1221	globalstl = 1;
	1222	}
	1223	if (sk != NULL) {
	1224	TAILQ_FOREACH(si, &sk->states, entry)
	1225	if (dir == PF_INOUT \|\|
	1226	(sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
	1227	si->s->key[PF_SK_STACK]))) {
	1228	if (more == NULL) {
	1229	ret = si;
	1230	break;
	1231	}
	1232	if (ret)
	1233	(*more)++;
	1234	else
	1235	ret = si;
	1236	}
	1237	}
	1238
	1239	/*
	1240	* Extract state before potentially releasing the global statetbl
	1241	* lock. Ignore the state if the create is still in-progress as
	1242	* it can be deleted out from under us by the owning localized cpu.
	1243	* However, if CREATEINPROG is not set, state can only be deleted
	1244	* by the purge thread which we are protected from via our shared
	1245	* pf_token.
	1246	*/
	1247	if (ret) {
	1248	s = ret->s;
	1249	if (s && (s->state_flags & PFSTATE_CREATEINPROG))
	1250	s = NULL;
	1251	} else {
	1252	s = NULL;
	1253	}
	1254	if (globalstl)
	1255	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	1256	return s;
	1257	}
	1258
	1259	/* END state table stuff */
	1260
	1261	void
	1262	pf_purge_thread(void *v)
	1263	{
	1264	globaldata_t save_gd = mycpu;
	1265	int nloops = 0;
	1266	int locked = 0;
	1267	int nn;
	1268	int endingit;
	1269
	1270	for (;;) {
	1271	tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
	1272
	1273	endingit = pf_end_threads;
	1274
	1275	for (nn = 0; nn < ncpus; ++nn) {
	1276	lwkt_setcpu_self(globaldata_find(nn));
	1277
	1278	lwkt_gettoken(&pf_token);
	1279	lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
	1280	crit_enter();
	1281
	1282	/*
	1283	* process a fraction of the state table every second
	1284	*/
	1285	if(!pf_purge_expired_states(
	1286	1 + (pf_status.states /
	1287	pf_default_rule.timeout[
	1288	PFTM_INTERVAL]), 0)) {
	1289	pf_purge_expired_states(
	1290	1 + (pf_status.states /
	1291	pf_default_rule.timeout[
	1292	PFTM_INTERVAL]), 1);
	1293	}
	1294
	1295	/*
	1296	* purge other expired types every PFTM_INTERVAL
	1297	* seconds
	1298	*/
	1299	if (++nloops >=
	1300	pf_default_rule.timeout[PFTM_INTERVAL]) {
	1301	pf_purge_expired_fragments();
	1302	if (!pf_purge_expired_src_nodes(locked)) {
	1303	pf_purge_expired_src_nodes(1);
	1304	}
	1305	nloops = 0;
	1306	}
	1307
	1308	/*
	1309	* If terminating the thread, clean everything out
	1310	* (on all cpus).
	1311	*/
	1312	if (endingit) {
	1313	pf_purge_expired_states(pf_status.states, 0);
	1314	pf_purge_expired_fragments();
	1315	pf_purge_expired_src_nodes(1);
	1316	}
	1317
	1318	crit_exit();
	1319	lockmgr(&pf_consistency_lock, LK_RELEASE);
	1320	lwkt_reltoken(&pf_token);
	1321	}
	1322	lwkt_setcpu_self(save_gd);
	1323	if (endingit)
	1324	break;
	1325	}
	1326
	1327	/*
	1328	* Thread termination
	1329	*/
	1330	pf_end_threads++;
	1331	wakeup(pf_purge_thread);
	1332	kthread_exit();
	1333	}
	1334
	1335	u_int32_t
	1336	pf_state_expires(const struct pf_state *state)
	1337	{
	1338	u_int32_t timeout;
	1339	u_int32_t start;
	1340	u_int32_t end;
	1341	u_int32_t states;
	1342
	1343	/* handle all PFTM_* > PFTM_MAX here */
	1344	if (state->timeout == PFTM_PURGE)
	1345	return (time_second);
	1346	if (state->timeout == PFTM_UNTIL_PACKET)
	1347	return (0);
	1348	KKASSERT(state->timeout != PFTM_UNLINKED);
	1349	KKASSERT(state->timeout < PFTM_MAX);
	1350	timeout = state->rule.ptr->timeout[state->timeout];
	1351	if (!timeout)
	1352	timeout = pf_default_rule.timeout[state->timeout];
	1353	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	1354	if (start) {
	1355	end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
	1356	states = state->rule.ptr->states_cur;
	1357	} else {
	1358	start = pf_default_rule.timeout[PFTM_ADAPTIVE_START];
	1359	end = pf_default_rule.timeout[PFTM_ADAPTIVE_END];
	1360	states = pf_status.states;
	1361	}
	1362
	1363	/*
	1364	* If the number of states exceeds allowed values, adaptively
	1365	* timeout the state more quickly. This can be very dangerous
	1366	* to legitimate connections, however, so defray the timeout
	1367	* based on the packet count.
	1368	*
	1369	* Retain from 0-100% based on number of states.
	1370	*
	1371	* Recover up to 50% of the lost portion if there was
	1372	* packet traffic (100 pkts = 50%).
	1373	*/
	1374	if (end && states > start && start < end) {
	1375	u_int32_t n; /* timeout retention 0-100% */
	1376	u_int64_t pkts;
	1377	#if 0
	1378	static struct krate boorate = { .freq = 1 };
	1379	#endif
	1380
	1381	/*
	1382	* Reduce timeout by n% (0-100)
	1383	*/
	1384	n = (states - start) * 100 / (end - start);
	1385	if (n > 100)
	1386	n = 0;
	1387	else
	1388	n = 100 - n;
	1389
	1390	/*
	1391	* But claw back some of the reduction based on packet
	1392	* count associated with the state.
	1393	*/
	1394	pkts = state->packets[0] + state->packets[1];
	1395	if (pkts > 100)
	1396	pkts = 100;
	1397	#if 0
	1398	krateprintf(&boorate, "timeout %-4u n=%u pkts=%-3lu -> %lu\n",
	1399	timeout, n, pkts, n + (100 - n) * pkts / 200);
	1400	#endif
	1401
	1402	n += (100 - n) * pkts / 200; /* recover by up-to 50% */
	1403	timeout = timeout * n / 100;
	1404
	1405	}
	1406	return (state->expire + timeout);
	1407	}
	1408
	1409	/*
	1410	* (called with exclusive pf_token)
	1411	*/
	1412	int
	1413	pf_purge_expired_src_nodes(int waslocked)
	1414	{
	1415	struct pf_src_node cur, next;
	1416	int locked = waslocked;
	1417	int cpu = mycpu->gd_cpuid;
	1418
	1419	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking[cpu]);
	1420	cur;
	1421	cur = next) {
	1422	next = RB_NEXT(pf_src_tree, &tree_src_tracking[cpu], cur);
	1423
	1424	if (cur->states <= 0 && cur->expire <= time_second) {
	1425	if (!locked) {
	1426	lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
	1427	next = RB_NEXT(pf_src_tree,
	1428	&tree_src_tracking[cpu], cur);
	1429	locked = 1;
	1430	}
	1431	if (cur->rule.ptr != NULL) {
	1432	/*
	1433	* decrements in rule should be ok, token is
	1434	* held exclusively in this code path.
	1435	*/
	1436	atomic_add_int(&cur->rule.ptr->src_nodes, -1);
	1437	if (cur->rule.ptr->states_cur <= 0 &&
	1438	cur->rule.ptr->max_src_nodes <= 0)
	1439	pf_rm_rule(NULL, cur->rule.ptr);
	1440	}
	1441	RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], cur);
	1442	PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
	1443	atomic_add_int(&pf_status.src_nodes, -1);
	1444	kfree(cur, M_PFSRCTREEPL);
	1445	}
	1446	}
	1447	if (locked && !waslocked)
	1448	lockmgr(&pf_consistency_lock, LK_RELEASE);
	1449	return(1);
	1450	}
	1451
	1452	void
	1453	pf_src_tree_remove_state(struct pf_state *s)
	1454	{
	1455	u_int32_t timeout;
	1456
	1457	if (s->src_node != NULL) {
	1458	if (s->src.tcp_est)
	1459	atomic_add_int(&s->src_node->conn, -1);
	1460	if (--s->src_node->states <= 0) {
	1461	timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
	1462	if (!timeout) {
	1463	timeout =
	1464	pf_default_rule.timeout[PFTM_SRC_NODE];
	1465	}
	1466	s->src_node->expire = time_second + timeout;
	1467	}
	1468	}
	1469	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
	1470	if (--s->nat_src_node->states <= 0) {
	1471	timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
	1472	if (!timeout)
	1473	timeout =
	1474	pf_default_rule.timeout[PFTM_SRC_NODE];
	1475	s->nat_src_node->expire = time_second + timeout;
	1476	}
	1477	}
	1478	s->src_node = s->nat_src_node = NULL;
	1479	}
	1480
	1481	/* callers should be at crit_enter() */
	1482	void
	1483	pf_unlink_state(struct pf_state *cur)
	1484	{
	1485	int cpu = mycpu->gd_cpuid;
	1486
	1487	if (cur->src.state == PF_TCPS_PROXY_DST) {
	1488	/* XXX wire key the right one? */
	1489	pf_send_tcp(cur->rule.ptr, cur->key[PF_SK_WIRE]->af,
	1490	&cur->key[PF_SK_WIRE]->addr[1],
	1491	&cur->key[PF_SK_WIRE]->addr[0],
	1492	cur->key[PF_SK_WIRE]->port[1],
	1493	cur->key[PF_SK_WIRE]->port[0],
	1494	cur->src.seqhi, cur->src.seqlo + 1,
	1495	TH_RST\|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
	1496	}
	1497	RB_REMOVE(pf_state_tree_id, &tree_id[cpu], cur);
	1498	if (cur->creatorid == pf_status.hostid)
	1499	pfsync_delete_state(cur);
	1500	cur->timeout = PFTM_UNLINKED;
	1501	pf_src_tree_remove_state(cur);
	1502	pf_detach_state(cur);
	1503	}
	1504
	1505	/*
	1506	* callers should be at crit_enter() and hold pf_consistency_lock exclusively.
	1507	* pf_token must also be held exclusively.
	1508	*/
	1509	void
	1510	pf_free_state(struct pf_state *cur)
	1511	{
	1512	int cpu = mycpu->gd_cpuid;
	1513
	1514	KKASSERT(cur->cpuid == cpu);
	1515
	1516	if (pfsyncif != NULL &&
	1517	(pfsyncif->sc_bulk_send_next == cur \|\|
	1518	pfsyncif->sc_bulk_terminator == cur))
	1519	return;
	1520	KKASSERT(cur->timeout == PFTM_UNLINKED);
	1521	/*
	1522	* decrements in rule should be ok, token is
	1523	* held exclusively in this code path.
	1524	*/
	1525	if (--cur->rule.ptr->states_cur <= 0 &&
	1526	cur->rule.ptr->src_nodes <= 0)
	1527	pf_rm_rule(NULL, cur->rule.ptr);
	1528	if (cur->nat_rule.ptr != NULL) {
	1529	if (--cur->nat_rule.ptr->states_cur <= 0 &&
	1530	cur->nat_rule.ptr->src_nodes <= 0) {
	1531	pf_rm_rule(NULL, cur->nat_rule.ptr);
	1532	}
	1533	}
	1534	if (cur->anchor.ptr != NULL) {
	1535	if (--cur->anchor.ptr->states_cur <= 0)
	1536	pf_rm_rule(NULL, cur->anchor.ptr);
	1537	}
	1538	pf_normalize_tcp_cleanup(cur);
	1539	pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
	1540
	1541	/*
	1542	* We may be freeing pf_purge_expired_states()'s saved scan entry,
	1543	* adjust it if necessary.
	1544	*/
	1545	if (purge_cur[cpu] == cur) {
	1546	kprintf("PURGE CONFLICT\n");
	1547	purge_cur[cpu] = TAILQ_NEXT(purge_cur[cpu], entry_list);
	1548	}
	1549	TAILQ_REMOVE(&state_list[cpu], cur, entry_list);
	1550	if (cur->tag)
	1551	pf_tag_unref(cur->tag);
	1552	kfree(cur, M_PFSTATEPL);
	1553	PF_INC_FCOUNTER(FCNT_STATE_REMOVALS);
	1554	atomic_add_int(&pf_status.states, -1);
	1555	}
	1556
	1557	int
	1558	pf_purge_expired_states(u_int32_t maxcheck, int waslocked)
	1559	{
	1560	struct pf_state *cur;
	1561	int locked = waslocked;
	1562	int cpu = mycpu->gd_cpuid;
	1563
	1564	while (maxcheck--) {
	1565	/*
	1566	* Wrap to start of list when we hit the end
	1567	*/
	1568	cur = purge_cur[cpu];
	1569	if (cur == NULL) {
	1570	cur = TAILQ_FIRST(&state_list[cpu]);
	1571	if (cur == NULL)
	1572	break; /* list empty */
	1573	}
	1574
	1575	/*
	1576	* Setup next (purge_cur) while we process this one. If
	1577	* we block and something else deletes purge_cur,
	1578	* pf_free_state() will adjust it further ahead.
	1579	*/
	1580	purge_cur[cpu] = TAILQ_NEXT(cur, entry_list);
	1581
	1582	if (cur->timeout == PFTM_UNLINKED) {
	1583	/* free unlinked state */
	1584	if (! locked) {
	1585	lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
	1586	locked = 1;
	1587	}
	1588	pf_free_state(cur);
	1589	} else if (pf_state_expires(cur) <= time_second) {
	1590	/* unlink and free expired state */
	1591	pf_unlink_state(cur);
	1592	if (! locked) {
	1593	if (!lockmgr(&pf_consistency_lock, LK_EXCLUSIVE))
	1594	return (0);
	1595	locked = 1;
	1596	}
	1597	pf_free_state(cur);
	1598	}
	1599	}
	1600
	1601	if (locked)
	1602	lockmgr(&pf_consistency_lock, LK_RELEASE);
	1603	return (1);
	1604	}
	1605
	1606	int
	1607	pf_tbladdr_setup(struct pf_ruleset rs, struct pf_addr_wrap aw)
	1608	{
	1609	if (aw->type != PF_ADDR_TABLE)
	1610	return (0);
	1611	if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL)
	1612	return (1);
	1613	return (0);
	1614	}
	1615
	1616	void
	1617	pf_tbladdr_remove(struct pf_addr_wrap *aw)
	1618	{
	1619	if (aw->type != PF_ADDR_TABLE \|\| aw->p.tbl == NULL)
	1620	return;
	1621	pfr_detach_table(aw->p.tbl);
	1622	aw->p.tbl = NULL;
	1623	}
	1624
	1625	void
	1626	pf_tbladdr_copyout(struct pf_addr_wrap *aw)
	1627	{
	1628	struct pfr_ktable *kt = aw->p.tbl;
	1629
	1630	if (aw->type != PF_ADDR_TABLE \|\| kt == NULL)
	1631	return;
	1632	if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
	1633	kt = kt->pfrkt_root;
	1634	aw->p.tbl = NULL;
	1635	aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
	1636	kt->pfrkt_cnt : -1;
	1637	}
	1638
	1639	void
	1640	pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
	1641	{
	1642	switch (af) {
	1643	#ifdef INET
	1644	case AF_INET: {
	1645	u_int32_t a = ntohl(addr->addr32[0]);
	1646	kprintf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
	1647	(a>>8)&255, a&255);
	1648	if (p) {
	1649	p = ntohs(p);
	1650	kprintf(":%u", p);
	1651	}
	1652	break;
	1653	}
	1654	#endif /* INET */
	1655	#ifdef INET6
	1656	case AF_INET6: {
	1657	u_int16_t b;
	1658	u_int8_t i, curstart, curend, maxstart, maxend;
	1659	curstart = curend = maxstart = maxend = 255;
	1660	for (i = 0; i < 8; i++) {
	1661	if (!addr->addr16[i]) {
	1662	if (curstart == 255)
	1663	curstart = i;
	1664	curend = i;
	1665	} else {
	1666	if ((curend - curstart) >
	1667	(maxend - maxstart)) {
	1668	maxstart = curstart;
	1669	maxend = curend;
	1670	}
	1671	curstart = curend = 255;
	1672	}
	1673	}
	1674	if ((curend - curstart) >
	1675	(maxend - maxstart)) {
	1676	maxstart = curstart;
	1677	maxend = curend;
	1678	}
	1679	for (i = 0; i < 8; i++) {
	1680	if (i >= maxstart && i <= maxend) {
	1681	if (i == 0)
	1682	kprintf(":");
	1683	if (i == maxend)
	1684	kprintf(":");
	1685	} else {
	1686	b = ntohs(addr->addr16[i]);
	1687	kprintf("%x", b);
	1688	if (i < 7)
	1689	kprintf(":");
	1690	}
	1691	}
	1692	if (p) {
	1693	p = ntohs(p);
	1694	kprintf("[%u]", p);
	1695	}
	1696	break;
	1697	}
	1698	#endif /* INET6 */
	1699	}
	1700	}
	1701
	1702	void
	1703	pf_print_state(struct pf_state *s)
	1704	{
	1705	pf_print_state_parts(s, NULL, NULL);
	1706	}
	1707
	1708	void
	1709	pf_print_state_parts(struct pf_state *s,
	1710	struct pf_state_key skwp, struct pf_state_key sksp)
	1711	{
	1712	struct pf_state_key skw, sks;
	1713	u_int8_t proto, dir;
	1714
	1715	/* Do our best to fill these, but they're skipped if NULL */
	1716	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	1717	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	1718	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	1719	dir = s ? s->direction : 0;
	1720
	1721	switch (proto) {
	1722	case IPPROTO_TCP:
	1723	kprintf("TCP ");
	1724	break;
	1725	case IPPROTO_UDP:
	1726	kprintf("UDP ");
	1727	break;
	1728	case IPPROTO_ICMP:
	1729	kprintf("ICMP ");
	1730	break;
	1731	case IPPROTO_ICMPV6:
	1732	kprintf("ICMPV6 ");
	1733	break;
	1734	default:
	1735	kprintf("%u ", skw->proto);
	1736	break;
	1737	}
	1738	switch (dir) {
	1739	case PF_IN:
	1740	kprintf(" in");
	1741	break;
	1742	case PF_OUT:
	1743	kprintf(" out");
	1744	break;
	1745	}
	1746	if (skw) {
	1747	kprintf(" wire: ");
	1748	pf_print_host(&skw->addr[0], skw->port[0], skw->af);
	1749	kprintf(" ");
	1750	pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	1751	}
	1752	if (sks) {
	1753	kprintf(" stack: ");
	1754	if (sks != skw) {
	1755	pf_print_host(&sks->addr[0], sks->port[0], sks->af);
	1756	kprintf(" ");
	1757	pf_print_host(&sks->addr[1], sks->port[1], sks->af);
	1758	} else
	1759	kprintf("-");
	1760	}
	1761	if (s) {
	1762	if (proto == IPPROTO_TCP) {
	1763	kprintf(" [lo=%u high=%u win=%u modulator=%u",
	1764	s->src.seqlo, s->src.seqhi,
	1765	s->src.max_win, s->src.seqdiff);
	1766	if (s->src.wscale && s->dst.wscale)
	1767	kprintf(" wscale=%u",
	1768	s->src.wscale & PF_WSCALE_MASK);
	1769	kprintf("]");
	1770	kprintf(" [lo=%u high=%u win=%u modulator=%u",
	1771	s->dst.seqlo, s->dst.seqhi,
	1772	s->dst.max_win, s->dst.seqdiff);
	1773	if (s->src.wscale && s->dst.wscale)
	1774	kprintf(" wscale=%u",
	1775	s->dst.wscale & PF_WSCALE_MASK);
	1776	kprintf("]");
	1777	}
	1778	kprintf(" %u:%u", s->src.state, s->dst.state);
	1779	}
	1780	}
	1781
	1782	void
	1783	pf_print_flags(u_int8_t f)
	1784	{
	1785	if (f)
	1786	kprintf(" ");
	1787	if (f & TH_FIN)
	1788	kprintf("F");
	1789	if (f & TH_SYN)
	1790	kprintf("S");
	1791	if (f & TH_RST)
	1792	kprintf("R");
	1793	if (f & TH_PUSH)
	1794	kprintf("P");
	1795	if (f & TH_ACK)
	1796	kprintf("A");
	1797	if (f & TH_URG)
	1798	kprintf("U");
	1799	if (f & TH_ECE)
	1800	kprintf("E");
	1801	if (f & TH_CWR)
	1802	kprintf("W");
	1803	}
	1804
	1805	#define PF_SET_SKIP_STEPS(i) \
	1806	do { \
	1807	while (head[i] != cur) { \
	1808	head[i]->skip[i].ptr = cur; \
	1809	head[i] = TAILQ_NEXT(head[i], entries); \
	1810	} \
	1811	} while (0)
	1812
	1813	void
	1814	pf_calc_skip_steps(struct pf_rulequeue *rules)
	1815	{
	1816	struct pf_rule cur, prev, *head[PF_SKIP_COUNT];
	1817	int i;
	1818
	1819	cur = TAILQ_FIRST(rules);
	1820	prev = cur;
	1821	for (i = 0; i < PF_SKIP_COUNT; ++i)
	1822	head[i] = cur;
	1823	while (cur != NULL) {
	1824
	1825	if (cur->kif != prev->kif \|\| cur->ifnot != prev->ifnot)
	1826	PF_SET_SKIP_STEPS(PF_SKIP_IFP);
	1827	if (cur->direction != prev->direction)
	1828	PF_SET_SKIP_STEPS(PF_SKIP_DIR);
	1829	if (cur->af != prev->af)
	1830	PF_SET_SKIP_STEPS(PF_SKIP_AF);
	1831	if (cur->proto != prev->proto)
	1832	PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
	1833	if (cur->src.neg != prev->src.neg \|\|
	1834	pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
	1835	PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
	1836	if (cur->src.port[0] != prev->src.port[0] \|\|
	1837	cur->src.port[1] != prev->src.port[1] \|\|
	1838	cur->src.port_op != prev->src.port_op)
	1839	PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
	1840	if (cur->dst.neg != prev->dst.neg \|\|
	1841	pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
	1842	PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
	1843	if (cur->dst.port[0] != prev->dst.port[0] \|\|
	1844	cur->dst.port[1] != prev->dst.port[1] \|\|
	1845	cur->dst.port_op != prev->dst.port_op)
	1846	PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
	1847
	1848	prev = cur;
	1849	cur = TAILQ_NEXT(cur, entries);
	1850	}
	1851	for (i = 0; i < PF_SKIP_COUNT; ++i)
	1852	PF_SET_SKIP_STEPS(i);
	1853	}
	1854
	1855	int
	1856	pf_addr_wrap_neq(struct pf_addr_wrap aw1, struct pf_addr_wrap aw2)
	1857	{
	1858	if (aw1->type != aw2->type)
	1859	return (1);
	1860	switch (aw1->type) {
	1861	case PF_ADDR_ADDRMASK:
	1862	case PF_ADDR_RANGE:
	1863	if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
	1864	return (1);
	1865	if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
	1866	return (1);
	1867	return (0);
	1868	case PF_ADDR_DYNIFTL:
	1869	return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
	1870	case PF_ADDR_NOROUTE:
	1871	case PF_ADDR_URPFFAILED:
	1872	return (0);
	1873	case PF_ADDR_TABLE:
	1874	return (aw1->p.tbl != aw2->p.tbl);
	1875	case PF_ADDR_RTLABEL:
	1876	return (aw1->v.rtlabel != aw2->v.rtlabel);
	1877	default:
	1878	kprintf("invalid address type: %d\n", aw1->type);
	1879	return (1);
	1880	}
	1881	}
	1882
	1883	u_int16_t
	1884	pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
	1885	{
	1886	u_int32_t l;
	1887
	1888	if (udp && !cksum)
	1889	return (0x0000);
	1890	l = cksum + old - new;
	1891	l = (l >> 16) + (l & 65535);
	1892	l = l & 65535;
	1893	if (udp && !l)
	1894	return (0xFFFF);
	1895	return (l);
	1896	}
	1897
	1898	void
	1899	pf_change_ap(struct pf_addr a, u_int16_t p, u_int16_t ic, u_int16_t pc,
	1900	struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
	1901	{
	1902	struct pf_addr ao;
	1903	u_int16_t po = *p;
	1904
	1905	PF_ACPY(&ao, a, af);
	1906	PF_ACPY(a, an, af);
	1907
	1908	*p = pn;
	1909
	1910	switch (af) {
	1911	#ifdef INET
	1912	case AF_INET:
	1913	ic = pf_cksum_fixup(pf_cksum_fixup(ic,
	1914	ao.addr16[0], an->addr16[0], 0),
	1915	ao.addr16[1], an->addr16[1], 0);
	1916	*p = pn;
	1917	pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(pc,
	1918	ao.addr16[0], an->addr16[0], u),
	1919	ao.addr16[1], an->addr16[1], u),
	1920	po, pn, u);
	1921	break;
	1922	#endif /* INET */
	1923	#ifdef INET6
	1924	case AF_INET6:
	1925	*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	1926	pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	1927	pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
	1928	ao.addr16[0], an->addr16[0], u),
	1929	ao.addr16[1], an->addr16[1], u),
	1930	ao.addr16[2], an->addr16[2], u),
	1931	ao.addr16[3], an->addr16[3], u),
	1932	ao.addr16[4], an->addr16[4], u),
	1933	ao.addr16[5], an->addr16[5], u),
	1934	ao.addr16[6], an->addr16[6], u),
	1935	ao.addr16[7], an->addr16[7], u),
	1936	po, pn, u);
	1937	break;
	1938	#endif /* INET6 */
	1939	}
	1940	}
	1941
	1942
	1943	/* Changes a u_int32_t. Uses a void * so there are no align restrictions */
	1944	void
	1945	pf_change_a(void a, u_int16_t c, u_int32_t an, u_int8_t u)
	1946	{
	1947	u_int32_t ao;
	1948
	1949	memcpy(&ao, a, sizeof(ao));
	1950	memcpy(a, &an, sizeof(u_int32_t));
	1951	c = pf_cksum_fixup(pf_cksum_fixup(c, ao / 65536, an / 65536, u),
	1952	ao % 65536, an % 65536, u);
	1953	}
	1954
	1955	#ifdef INET6
	1956	void
	1957	pf_change_a6(struct pf_addr a, u_int16_t c, struct pf_addr *an, u_int8_t u)
	1958	{
	1959	struct pf_addr ao;
	1960
	1961	PF_ACPY(&ao, a, AF_INET6);
	1962	PF_ACPY(a, an, AF_INET6);
	1963
	1964	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	1965	pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	1966	pf_cksum_fixup(pf_cksum_fixup(*c,
	1967	ao.addr16[0], an->addr16[0], u),
	1968	ao.addr16[1], an->addr16[1], u),
	1969	ao.addr16[2], an->addr16[2], u),
	1970	ao.addr16[3], an->addr16[3], u),
	1971	ao.addr16[4], an->addr16[4], u),
	1972	ao.addr16[5], an->addr16[5], u),
	1973	ao.addr16[6], an->addr16[6], u),
	1974	ao.addr16[7], an->addr16[7], u);
	1975	}
	1976	#endif /* INET6 */
	1977
	1978	void
	1979	pf_change_icmp(struct pf_addr ia, u_int16_t ip, struct pf_addr *oa,
	1980	struct pf_addr na, u_int16_t np, u_int16_t pc, u_int16_t *h2c,
	1981	u_int16_t ic, u_int16_t hc, u_int8_t u, sa_family_t af)
	1982	{
	1983	struct pf_addr oia, ooa;
	1984
	1985	PF_ACPY(&oia, ia, af);
	1986	if (oa)
	1987	PF_ACPY(&ooa, oa, af);
	1988
	1989	/* Change inner protocol port, fix inner protocol checksum. */
	1990	if (ip != NULL) {
	1991	u_int16_t oip = *ip;
	1992	u_int32_t opc = 0;
	1993
	1994	if (pc != NULL)
	1995	opc = *pc;
	1996	*ip = np;
	1997	if (pc != NULL)
	1998	pc = pf_cksum_fixup(pc, oip, *ip, u);
	1999	ic = pf_cksum_fixup(ic, oip, *ip, 0);
	2000	if (pc != NULL)
	2001	ic = pf_cksum_fixup(ic, opc, *pc, 0);
	2002	}
	2003	/* Change inner ip address, fix inner ip and icmp checksums. */
	2004	PF_ACPY(ia, na, af);
	2005	switch (af) {
	2006	#ifdef INET
	2007	case AF_INET: {
	2008	u_int32_t oh2c = *h2c;
	2009
	2010	h2c = pf_cksum_fixup(pf_cksum_fixup(h2c,
	2011	oia.addr16[0], ia->addr16[0], 0),
	2012	oia.addr16[1], ia->addr16[1], 0);
	2013	ic = pf_cksum_fixup(pf_cksum_fixup(ic,
	2014	oia.addr16[0], ia->addr16[0], 0),
	2015	oia.addr16[1], ia->addr16[1], 0);
	2016	ic = pf_cksum_fixup(ic, oh2c, *h2c, 0);
	2017	break;
	2018	}
	2019	#endif /* INET */
	2020	#ifdef INET6
	2021	case AF_INET6:
	2022	*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	2023	pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	2024	pf_cksum_fixup(pf_cksum_fixup(*ic,
	2025	oia.addr16[0], ia->addr16[0], u),
	2026	oia.addr16[1], ia->addr16[1], u),
	2027	oia.addr16[2], ia->addr16[2], u),
	2028	oia.addr16[3], ia->addr16[3], u),
	2029	oia.addr16[4], ia->addr16[4], u),
	2030	oia.addr16[5], ia->addr16[5], u),
	2031	oia.addr16[6], ia->addr16[6], u),
	2032	oia.addr16[7], ia->addr16[7], u);
	2033	break;
	2034	#endif /* INET6 */
	2035	}
	2036	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
	2037	if (oa) {
	2038	PF_ACPY(oa, na, af);
	2039	switch (af) {
	2040	#ifdef INET
	2041	case AF_INET:
	2042	hc = pf_cksum_fixup(pf_cksum_fixup(hc,
	2043	ooa.addr16[0], oa->addr16[0], 0),
	2044	ooa.addr16[1], oa->addr16[1], 0);
	2045	break;
	2046	#endif /* INET */
	2047	#ifdef INET6
	2048	case AF_INET6:
	2049	*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	2050	pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
	2051	pf_cksum_fixup(pf_cksum_fixup(*ic,
	2052	ooa.addr16[0], oa->addr16[0], u),
	2053	ooa.addr16[1], oa->addr16[1], u),
	2054	ooa.addr16[2], oa->addr16[2], u),
	2055	ooa.addr16[3], oa->addr16[3], u),
	2056	ooa.addr16[4], oa->addr16[4], u),
	2057	ooa.addr16[5], oa->addr16[5], u),
	2058	ooa.addr16[6], oa->addr16[6], u),
	2059	ooa.addr16[7], oa->addr16[7], u);
	2060	break;
	2061	#endif /* INET6 */
	2062	}
	2063	}
	2064	}
	2065
	2066
	2067	/*
	2068	* Need to modulate the sequence numbers in the TCP SACK option
	2069	* (credits to Krzysztof Pfaff for report and patch)
	2070	*/
	2071	int
	2072	pf_modulate_sack(struct mbuf m, int off, struct pf_pdesc pd,
	2073	struct tcphdr th, struct pf_state_peer dst)
	2074	{
	2075	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
	2076	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
	2077	int copyback = 0, i, olen;
	2078	struct raw_sackblock sack;
	2079
	2080	#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
	2081	if (hlen < TCPOLEN_SACKLEN \|\|
	2082	!pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
	2083	return 0;
	2084
	2085	while (hlen >= TCPOLEN_SACKLEN) {
	2086	olen = opt[1];
	2087	switch (*opt) {
	2088	case TCPOPT_EOL: /* FALLTHROUGH */
	2089	case TCPOPT_NOP:
	2090	opt++;
	2091	hlen--;
	2092	break;
	2093	case TCPOPT_SACK:
	2094	if (olen > hlen)
	2095	olen = hlen;
	2096	if (olen >= TCPOLEN_SACKLEN) {
	2097	for (i = 2; i + TCPOLEN_SACK <= olen;
	2098	i += TCPOLEN_SACK) {
	2099	memcpy(&sack, &opt[i], sizeof(sack));
	2100	pf_change_a(&sack.rblk_start, &th->th_sum,
	2101	htonl(ntohl(sack.rblk_start) -
	2102	dst->seqdiff), 0);
	2103	pf_change_a(&sack.rblk_end, &th->th_sum,
	2104	htonl(ntohl(sack.rblk_end) -
	2105	dst->seqdiff), 0);
	2106	memcpy(&opt[i], &sack, sizeof(sack));
	2107	}
	2108	copyback = 1;
	2109	}
	2110	/* FALLTHROUGH */
	2111	default:
	2112	if (olen < 2)
	2113	olen = 2;
	2114	hlen -= olen;
	2115	opt += olen;
	2116	}
	2117	}
	2118
	2119	if (copyback)
	2120	m_copyback(m, off + sizeof(*th), thoptlen, opts);
	2121	return (copyback);
	2122	}
	2123
	2124	void
	2125	pf_send_tcp(const struct pf_rule *r, sa_family_t af,
	2126	const struct pf_addr saddr, const struct pf_addr daddr,
	2127	u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
	2128	u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
	2129	u_int16_t rtag, struct ether_header eh, struct ifnet ifp)
	2130	{
	2131	struct mbuf *m;
	2132	int len = 0, tlen;
	2133	#ifdef INET
	2134	struct ip *h = NULL;
	2135	#endif /* INET */
	2136	#ifdef INET6
	2137	struct ip6_hdr *h6 = NULL;
	2138	#endif /* INET6 */
	2139	struct tcphdr *th = NULL;
	2140	char *opt;
	2141
	2142	ASSERT_LWKT_TOKEN_HELD(&pf_token);
	2143
	2144	/* maximum segment size tcp option */
	2145	tlen = sizeof(struct tcphdr);
	2146	if (mss)
	2147	tlen += 4;
	2148
	2149	switch (af) {
	2150	#ifdef INET
	2151	case AF_INET:
	2152	len = sizeof(struct ip) + tlen;
	2153	break;
	2154	#endif /* INET */
	2155	#ifdef INET6
	2156	case AF_INET6:
	2157	len = sizeof(struct ip6_hdr) + tlen;
	2158	break;
	2159	#endif /* INET6 */
	2160	}
	2161
	2162	/*
	2163	* Create outgoing mbuf.
	2164	*
	2165	* DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
	2166	* so make sure pf.flags is clear.
	2167	*/
	2168	m = m_gethdr(M_NOWAIT, MT_HEADER);
	2169	if (m == NULL) {
	2170	return;
	2171	}
	2172	if (tag)
	2173	m->m_pkthdr.fw_flags \|= PF_MBUF_TAGGED;
	2174	m->m_pkthdr.pf.flags = 0;
	2175	m->m_pkthdr.pf.tag = rtag;
	2176	/* XXX Recheck when upgrading to > 4.4 */
	2177	m->m_pkthdr.pf.statekey = NULL;
	2178	if (r != NULL && r->rtableid >= 0)
	2179	m->m_pkthdr.pf.rtableid = r->rtableid;
	2180
	2181	#ifdef ALTQ
	2182	if (r != NULL && r->qid) {
	2183	m->m_pkthdr.fw_flags \|= PF_MBUF_STRUCTURE;
	2184	m->m_pkthdr.pf.qid = r->qid;
	2185	m->m_pkthdr.pf.ecn_af = af;
	2186	m->m_pkthdr.pf.hdr = mtod(m, struct ip *);
	2187	}
	2188	#endif /* ALTQ */
	2189	m->m_data += max_linkhdr;
	2190	m->m_pkthdr.len = m->m_len = len;
	2191	m->m_pkthdr.rcvif = NULL;
	2192	bzero(m->m_data, len);
	2193	switch (af) {
	2194	#ifdef INET
	2195	case AF_INET:
	2196	h = mtod(m, struct ip *);
	2197
	2198	/* IP header fields included in the TCP checksum */
	2199	h->ip_p = IPPROTO_TCP;
	2200	h->ip_len = htons(tlen);
	2201	h->ip_src.s_addr = saddr->v4.s_addr;
	2202	h->ip_dst.s_addr = daddr->v4.s_addr;
	2203
	2204	th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
	2205	break;
	2206	#endif /* INET */
	2207	#ifdef INET6
	2208	case AF_INET6:
	2209	h6 = mtod(m, struct ip6_hdr *);
	2210
	2211	/* IP header fields included in the TCP checksum */
	2212	h6->ip6_nxt = IPPROTO_TCP;
	2213	h6->ip6_plen = htons(tlen);
	2214	memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
	2215	memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
	2216
	2217	th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
	2218	break;
	2219	#endif /* INET6 */
	2220	}
	2221
	2222	/* TCP header */
	2223	th->th_sport = sport;
	2224	th->th_dport = dport;
	2225	th->th_seq = htonl(seq);
	2226	th->th_ack = htonl(ack);
	2227	th->th_off = tlen >> 2;
	2228	th->th_flags = flags;
	2229	th->th_win = htons(win);
	2230
	2231	if (mss) {
	2232	opt = (char *)(th + 1);
	2233	opt[0] = TCPOPT_MAXSEG;
	2234	opt[1] = 4;
	2235	mss = htons(mss);
	2236	bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
	2237	}
	2238
	2239	switch (af) {
	2240	#ifdef INET
	2241	case AF_INET:
	2242	/* TCP checksum */
	2243	th->th_sum = in_cksum(m, len);
	2244
	2245	/* Finish the IP header */
	2246	h->ip_v = 4;
	2247	h->ip_hl = sizeof(*h) >> 2;
	2248	h->ip_tos = IPTOS_LOWDELAY;
	2249	h->ip_len = htons(len);
	2250	h->ip_off = path_mtu_discovery ? htons(IP_DF) : 0;
	2251	h->ip_ttl = ttl ? ttl : ip_defttl;
	2252	h->ip_sum = 0;
	2253	if (eh == NULL) {
	2254	lwkt_reltoken(&pf_token);
	2255	ip_output(m, NULL, NULL, 0, NULL, NULL);
	2256	lwkt_gettoken(&pf_token);
	2257	} else {
	2258	struct route ro;
	2259	struct rtentry rt;
	2260	struct ether_header e = (void )ro.ro_dst.sa_data;
	2261
	2262	if (ifp == NULL) {
	2263	m_freem(m);
	2264	return;
	2265	}
	2266	rt.rt_ifp = ifp;
	2267	ro.ro_rt = &rt;
	2268	ro.ro_dst.sa_len = sizeof(ro.ro_dst);
	2269	ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
	2270	bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
	2271	bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
	2272	e->ether_type = eh->ether_type;
	2273	/* XXX_IMPORT: later */
	2274	lwkt_reltoken(&pf_token);
	2275	ip_output(m, NULL, &ro, 0, NULL, NULL);
	2276	lwkt_gettoken(&pf_token);
	2277	}
	2278	break;
	2279	#endif /* INET */
	2280	#ifdef INET6
	2281	case AF_INET6:
	2282	/* TCP checksum */
	2283	th->th_sum = in6_cksum(m, IPPROTO_TCP,
	2284	sizeof(struct ip6_hdr), tlen);
	2285
	2286	h6->ip6_vfc \|= IPV6_VERSION;
	2287	h6->ip6_hlim = IPV6_DEFHLIM;
	2288
	2289	lwkt_reltoken(&pf_token);
	2290	ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
	2291	lwkt_gettoken(&pf_token);
	2292	break;
	2293	#endif /* INET6 */
	2294	}
	2295	}
	2296
	2297	void
	2298	pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
	2299	struct pf_rule *r)
	2300	{
	2301	struct mbuf *m0;
	2302
	2303	/*
	2304	* DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
	2305	* so make sure pf.flags is clear.
	2306	*/
	2307	if ((m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
	2308	return;
	2309
	2310	m0->m_pkthdr.fw_flags \|= PF_MBUF_TAGGED;
	2311	m0->m_pkthdr.pf.flags = 0;
	2312	/* XXX Re-Check when Upgrading to > 4.4 */
	2313	m0->m_pkthdr.pf.statekey = NULL;
	2314
	2315	if (r->rtableid >= 0)
	2316	m0->m_pkthdr.pf.rtableid = r->rtableid;
	2317
	2318	#ifdef ALTQ
	2319	if (r->qid) {
	2320	m->m_pkthdr.fw_flags \|= PF_MBUF_STRUCTURE;
	2321	m0->m_pkthdr.pf.qid = r->qid;
	2322	m0->m_pkthdr.pf.ecn_af = af;
	2323	m0->m_pkthdr.pf.hdr = mtod(m0, struct ip *);
	2324	}
	2325	#endif /* ALTQ */
	2326
	2327	switch (af) {
	2328	#ifdef INET
	2329	case AF_INET:
	2330	icmp_error(m0, type, code, 0, 0);
	2331	break;
	2332	#endif /* INET */
	2333	#ifdef INET6
	2334	case AF_INET6:
	2335	icmp6_error(m0, type, code, 0);
	2336	break;
	2337	#endif /* INET6 */
	2338	}
	2339	}
	2340
	2341	/*
	2342	* Return 1 if the addresses a and b match (with mask m), otherwise return 0.
	2343	* If n is 0, they match if they are equal. If n is != 0, they match if they
	2344	* are different.
	2345	*/
	2346	int
	2347	pf_match_addr(u_int8_t n, struct pf_addr a, struct pf_addr m,
	2348	struct pf_addr *b, sa_family_t af)
	2349	{
	2350	int match = 0;
	2351
	2352	switch (af) {
	2353	#ifdef INET
	2354	case AF_INET:
	2355	if ((a->addr32[0] & m->addr32[0]) ==
	2356	(b->addr32[0] & m->addr32[0]))
	2357	match++;
	2358	break;
	2359	#endif /* INET */
	2360	#ifdef INET6
	2361	case AF_INET6:
	2362	if (((a->addr32[0] & m->addr32[0]) ==
	2363	(b->addr32[0] & m->addr32[0])) &&
	2364	((a->addr32[1] & m->addr32[1]) ==
	2365	(b->addr32[1] & m->addr32[1])) &&
	2366	((a->addr32[2] & m->addr32[2]) ==
	2367	(b->addr32[2] & m->addr32[2])) &&
	2368	((a->addr32[3] & m->addr32[3]) ==
	2369	(b->addr32[3] & m->addr32[3])))
	2370	match++;
	2371	break;
	2372	#endif /* INET6 */
	2373	}
	2374	if (match) {
	2375	if (n)
	2376	return (0);
	2377	else
	2378	return (1);
	2379	} else {
	2380	if (n)
	2381	return (1);
	2382	else
	2383	return (0);
	2384	}
	2385	}
	2386
	2387	/*
	2388	* Return 1 if b <= a <= e, otherwise return 0.
	2389	*/
	2390	int
	2391	pf_match_addr_range(struct pf_addr b, struct pf_addr e,
	2392	struct pf_addr *a, sa_family_t af)
	2393	{
	2394	switch (af) {
	2395	#ifdef INET
	2396	case AF_INET:
	2397	if ((a->addr32[0] < b->addr32[0]) \|\|
	2398	(a->addr32[0] > e->addr32[0]))
	2399	return (0);
	2400	break;
	2401	#endif /* INET */
	2402	#ifdef INET6
	2403	case AF_INET6: {
	2404	int i;
	2405
	2406	/* check a >= b */
	2407	for (i = 0; i < 4; ++i)
	2408	if (a->addr32[i] > b->addr32[i])
	2409	break;
	2410	else if (a->addr32[i] < b->addr32[i])
	2411	return (0);
	2412	/* check a <= e */
	2413	for (i = 0; i < 4; ++i)
	2414	if (a->addr32[i] < e->addr32[i])
	2415	break;
	2416	else if (a->addr32[i] > e->addr32[i])
	2417	return (0);
	2418	break;
	2419	}
	2420	#endif /* INET6 */
	2421	}
	2422	return (1);
	2423	}
	2424
	2425	int
	2426	pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
	2427	{
	2428	switch (op) {
	2429	case PF_OP_IRG:
	2430	return ((p > a1) && (p < a2));
	2431	case PF_OP_XRG:
	2432	return ((p < a1) \|\| (p > a2));
	2433	case PF_OP_RRG:
	2434	return ((p >= a1) && (p <= a2));
	2435	case PF_OP_EQ:
	2436	return (p == a1);
	2437	case PF_OP_NE:
	2438	return (p != a1);
	2439	case PF_OP_LT:
	2440	return (p < a1);
	2441	case PF_OP_LE:
	2442	return (p <= a1);
	2443	case PF_OP_GT:
	2444	return (p > a1);
	2445	case PF_OP_GE:
	2446	return (p >= a1);
	2447	}
	2448	return (0); /* never reached */
	2449	}
	2450
	2451	int
	2452	pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
	2453	{
	2454	a1 = ntohs(a1);
	2455	a2 = ntohs(a2);
	2456	p = ntohs(p);
	2457	return (pf_match(op, a1, a2, p));
	2458	}
	2459
	2460	int
	2461	pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
	2462	{
	2463	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
	2464	return (0);
	2465	return (pf_match(op, a1, a2, u));
	2466	}
	2467
	2468	int
	2469	pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
	2470	{
	2471	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
	2472	return (0);
	2473	return (pf_match(op, a1, a2, g));
	2474	}
	2475
	2476	int
	2477	pf_match_tag(struct mbuf m, struct pf_rule r, int *tag)
	2478	{
	2479	if (*tag == -1)
	2480	*tag = m->m_pkthdr.pf.tag;
	2481
	2482	return ((!r->match_tag_not && r->match_tag == *tag) \|\|
	2483	(r->match_tag_not && r->match_tag != *tag));
	2484	}
	2485
	2486	int
	2487	pf_tag_packet(struct mbuf *m, int tag, int rtableid)
	2488	{
	2489	if (tag <= 0 && rtableid < 0)
	2490	return (0);
	2491
	2492	if (tag > 0)
	2493	m->m_pkthdr.pf.tag = tag;
	2494	if (rtableid >= 0)
	2495	m->m_pkthdr.pf.rtableid = rtableid;
	2496
	2497	return (0);
	2498	}
	2499
	2500	void
	2501	pf_step_into_anchor(int depth, struct pf_ruleset *rs, int n,
	2502	struct pf_rule r, struct pf_rule a, int *match)
	2503	{
	2504	struct pf_anchor_stackframe *f;
	2505
	2506	(*r)->anchor->match = 0;
	2507	if (match)
	2508	*match = 0;
	2509	if (*depth >= NELEM(pf_anchor_stack)) {
	2510	kprintf("pf_step_into_anchor: stack overflow\n");
	2511	r = TAILQ_NEXT(r, entries);
	2512	return;
	2513	} else if (*depth == 0 && a != NULL)
	2514	a = r;
	2515	f = pf_anchor_stack + (*depth)++;
	2516	f->rs = *rs;
	2517	f->r = *r;
	2518	if ((*r)->anchor_wildcard) {
	2519	f->parent = &(*r)->anchor->children;
	2520	if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
	2521	NULL) {
	2522	*r = NULL;
	2523	return;
	2524	}
	2525	*rs = &f->child->ruleset;
	2526	} else {
	2527	f->parent = NULL;
	2528	f->child = NULL;
	2529	rs = &(r)->anchor->ruleset;
	2530	}
	2531	r = TAILQ_FIRST((rs)->rules[n].active.ptr);
	2532	}
	2533
	2534	int
	2535	pf_step_out_of_anchor(int depth, struct pf_ruleset *rs, int n,
	2536	struct pf_rule r, struct pf_rule a, int *match)
	2537	{
	2538	struct pf_anchor_stackframe *f;
	2539	int quick = 0;
	2540
	2541	do {
	2542	if (*depth <= 0)
	2543	break;
	2544	f = pf_anchor_stack + *depth - 1;
	2545	if (f->parent != NULL && f->child != NULL) {
	2546	if (f->child->match \|\|
	2547	(match != NULL && *match)) {
	2548	f->r->anchor->match = 1;
	2549	*match = 0;
	2550	}
	2551	f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
	2552	if (f->child != NULL) {
	2553	*rs = &f->child->ruleset;
	2554	r = TAILQ_FIRST((rs)->rules[n].active.ptr);
	2555	if (*r == NULL)
	2556	continue;
	2557	else
	2558	break;
	2559	}
	2560	}
	2561	(*depth)--;
	2562	if (*depth == 0 && a != NULL)
	2563	*a = NULL;
	2564	*rs = f->rs;
	2565	if (f->r->anchor->match \|\| (match != NULL && *match))
	2566	quick = f->r->quick;
	2567	*r = TAILQ_NEXT(f->r, entries);
	2568	} while (*r == NULL);
	2569
	2570	return (quick);
	2571	}
	2572
	2573	#ifdef INET6
	2574	void
	2575	pf_poolmask(struct pf_addr naddr, struct pf_addr raddr,
	2576	struct pf_addr rmask, struct pf_addr saddr, sa_family_t af)
	2577	{
	2578	switch (af) {
	2579	#ifdef INET
	2580	case AF_INET:
	2581	naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) \|
	2582	((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
	2583	break;
	2584	#endif /* INET */
	2585	case AF_INET6:
	2586	naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) \|
	2587	((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
	2588	naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) \|
	2589	((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
	2590	naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) \|
	2591	((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
	2592	naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) \|
	2593	((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
	2594	break;
	2595	}
	2596	}
	2597
	2598	void
	2599	pf_addr_inc(struct pf_addr *addr, sa_family_t af)
	2600	{
	2601	switch (af) {
	2602	#ifdef INET
	2603	case AF_INET:
	2604	addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
	2605	break;
	2606	#endif /* INET */
	2607	case AF_INET6:
	2608	if (addr->addr32[3] == 0xffffffff) {
	2609	addr->addr32[3] = 0;
	2610	if (addr->addr32[2] == 0xffffffff) {
	2611	addr->addr32[2] = 0;
	2612	if (addr->addr32[1] == 0xffffffff) {
	2613	addr->addr32[1] = 0;
	2614	addr->addr32[0] =
	2615	htonl(ntohl(addr->addr32[0]) + 1);
	2616	} else
	2617	addr->addr32[1] =
	2618	htonl(ntohl(addr->addr32[1]) + 1);
	2619	} else
	2620	addr->addr32[2] =
	2621	htonl(ntohl(addr->addr32[2]) + 1);
	2622	} else
	2623	addr->addr32[3] =
	2624	htonl(ntohl(addr->addr32[3]) + 1);
	2625	break;
	2626	}
	2627	}
	2628	#endif /* INET6 */
	2629
	2630	#define mix(a,b,c) \
	2631	do { \
	2632	a -= b; a -= c; a ^= (c >> 13); \
	2633	b -= c; b -= a; b ^= (a << 8); \
	2634	c -= a; c -= b; c ^= (b >> 13); \
	2635	a -= b; a -= c; a ^= (c >> 12); \
	2636	b -= c; b -= a; b ^= (a << 16); \
	2637	c -= a; c -= b; c ^= (b >> 5); \
	2638	a -= b; a -= c; a ^= (c >> 3); \
	2639	b -= c; b -= a; b ^= (a << 10); \
	2640	c -= a; c -= b; c ^= (b >> 15); \
	2641	} while (0)
	2642
	2643	/*
	2644	* hash function based on bridge_hash in if_bridge.c
	2645	*/
	2646	void
	2647	pf_hash(struct pf_addr inaddr, struct pf_addr hash,
	2648	struct pf_poolhashkey *key, sa_family_t af)
	2649	{
	2650	u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
	2651
	2652	switch (af) {
	2653	#ifdef INET
	2654	case AF_INET:
	2655	a += inaddr->addr32[0];
	2656	b += key->key32[1];
	2657	mix(a, b, c);
	2658	hash->addr32[0] = c + key->key32[2];
	2659	break;
	2660	#endif /* INET */
	2661	#ifdef INET6
	2662	case AF_INET6:
	2663	a += inaddr->addr32[0];
	2664	b += inaddr->addr32[2];
	2665	mix(a, b, c);
	2666	hash->addr32[0] = c;
	2667	a += inaddr->addr32[1];
	2668	b += inaddr->addr32[3];
	2669	c += key->key32[1];
	2670	mix(a, b, c);
	2671	hash->addr32[1] = c;
	2672	a += inaddr->addr32[2];
	2673	b += inaddr->addr32[1];
	2674	c += key->key32[2];
	2675	mix(a, b, c);
	2676	hash->addr32[2] = c;
	2677	a += inaddr->addr32[3];
	2678	b += inaddr->addr32[0];
	2679	c += key->key32[3];
	2680	mix(a, b, c);
	2681	hash->addr32[3] = c;
	2682	break;
	2683	#endif /* INET6 */
	2684	}
	2685	}
	2686
	2687	int
	2688	pf_map_addr(sa_family_t af, struct pf_rule r, struct pf_addr saddr,
	2689	struct pf_addr naddr, struct pf_addr init_addr, struct pf_src_node **sn)
	2690	{
	2691	unsigned char hash[16];
	2692	struct pf_pool *rpool = &r->rpool;
	2693	struct pf_pooladdr *acur = rpool->cur;
	2694	struct pf_pooladdr *cur;
	2695	struct pf_addr *raddr;
	2696	struct pf_addr *rmask;
	2697	struct pf_addr counter;
	2698	struct pf_src_node k;
	2699	int cpu = mycpu->gd_cpuid;
	2700	int tblidx;
	2701
	2702	bzero(hash, sizeof(hash)); /* avoid gcc warnings */
	2703
	2704	/*
	2705	* NOTE! rpool->cur and rpool->tblidx can be iterators and thus
	2706	* may represent a SMP race due to the shared nature of the
	2707	* rpool structure. We allow the race and ensure that updates
	2708	* do not create a fatal condition.
	2709	*/
	2710	cpu_ccfence();
	2711	cur = acur;
	2712	raddr = &cur->addr.v.a.addr;
	2713	rmask = &cur->addr.v.a.mask;
	2714
	2715	if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
	2716	(r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
	2717	k.af = af;
	2718	PF_ACPY(&k.addr, saddr, af);
	2719	if (r->rule_flag & PFRULE_RULESRCTRACK \|\|
	2720	r->rpool.opts & PF_POOL_STICKYADDR)
	2721	k.rule.ptr = r;
	2722	else
	2723	k.rule.ptr = NULL;
	2724	PF_INC_SCOUNTER(SCNT_SRC_NODE_SEARCH);
	2725	*sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
	2726	if (sn != NULL && !PF_AZERO(&(sn)->raddr, af)) {
	2727	PF_ACPY(naddr, &(*sn)->raddr, af);
	2728	if (pf_status.debug >= PF_DEBUG_MISC) {
	2729	kprintf("pf_map_addr: src tracking maps ");
	2730	pf_print_host(&k.addr, 0, af);
	2731	kprintf(" to ");
	2732	pf_print_host(naddr, 0, af);
	2733	kprintf("\n");
	2734	}
	2735	return (0);
	2736	}
	2737	}
	2738
	2739	if (cur->addr.type == PF_ADDR_NOROUTE)
	2740	return (1);
	2741	if (cur->addr.type == PF_ADDR_DYNIFTL) {
	2742	switch (af) {
	2743	#ifdef INET
	2744	case AF_INET:
	2745	if (cur->addr.p.dyn->pfid_acnt4 < 1 &&
	2746	(rpool->opts & PF_POOL_TYPEMASK) !=
	2747	PF_POOL_ROUNDROBIN)
	2748	return (1);
	2749	raddr = &cur->addr.p.dyn->pfid_addr4;
	2750	rmask = &cur->addr.p.dyn->pfid_mask4;
	2751	break;
	2752	#endif /* INET */
	2753	#ifdef INET6
	2754	case AF_INET6:
	2755	if (cur->addr.p.dyn->pfid_acnt6 < 1 &&
	2756	(rpool->opts & PF_POOL_TYPEMASK) !=
	2757	PF_POOL_ROUNDROBIN)
	2758	return (1);
	2759	raddr = &cur->addr.p.dyn->pfid_addr6;
	2760	rmask = &cur->addr.p.dyn->pfid_mask6;
	2761	break;
	2762	#endif /* INET6 */
	2763	}
	2764	} else if (cur->addr.type == PF_ADDR_TABLE) {
	2765	if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
	2766	return (1); /* unsupported */
	2767	} else {
	2768	raddr = &cur->addr.v.a.addr;
	2769	rmask = &cur->addr.v.a.mask;
	2770	}
	2771
	2772	switch (rpool->opts & PF_POOL_TYPEMASK) {
	2773	case PF_POOL_NONE:
	2774	PF_ACPY(naddr, raddr, af);
	2775	break;
	2776	case PF_POOL_BITMASK:
	2777	PF_POOLMASK(naddr, raddr, rmask, saddr, af);
	2778	break;
	2779	case PF_POOL_RANDOM:
	2780	if (init_addr != NULL && PF_AZERO(init_addr, af)) {
	2781	switch (af) {
	2782	#ifdef INET
	2783	case AF_INET:
	2784	counter.addr32[0] = htonl(karc4random());
	2785	break;
	2786	#endif /* INET */
	2787	#ifdef INET6
	2788	case AF_INET6:
	2789	if (rmask->addr32[3] != 0xffffffff)
	2790	counter.addr32[3] =
	2791	htonl(karc4random());
	2792	else
	2793	break;
	2794	if (rmask->addr32[2] != 0xffffffff)
	2795	counter.addr32[2] =
	2796	htonl(karc4random());
	2797	else
	2798	break;
	2799	if (rmask->addr32[1] != 0xffffffff)
	2800	counter.addr32[1] =
	2801	htonl(karc4random());
	2802	else
	2803	break;
	2804	if (rmask->addr32[0] != 0xffffffff)
	2805	counter.addr32[0] =
	2806	htonl(karc4random());
	2807	break;
	2808	#endif /* INET6 */
	2809	}
	2810	PF_POOLMASK(naddr, raddr, rmask, &counter, af);
	2811	PF_ACPY(init_addr, naddr, af);
	2812
	2813	} else {
	2814	counter = rpool->counter;
	2815	cpu_ccfence();
	2816	PF_AINC(&counter, af);
	2817	PF_POOLMASK(naddr, raddr, rmask, &counter, af);
	2818	rpool->counter = counter;
	2819	}
	2820	break;
	2821	case PF_POOL_SRCHASH:
	2822	pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
	2823	PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
	2824	break;
	2825	case PF_POOL_ROUNDROBIN:
	2826	tblidx = rpool->tblidx;
	2827	counter = rpool->counter;
	2828	if (cur->addr.type == PF_ADDR_TABLE) {
	2829	if (!pfr_pool_get(cur->addr.p.tbl,
	2830	&tblidx, &counter,
	2831	&raddr, &rmask, af)) {
	2832	goto get_addr;
	2833	}
	2834	} else if (cur->addr.type == PF_ADDR_DYNIFTL) {
	2835	if (!pfr_pool_get(cur->addr.p.dyn->pfid_kt,
	2836	&tblidx, &counter,
	2837	&raddr, &rmask, af)) {
	2838	goto get_addr;
	2839	}
	2840	} else if (pf_match_addr(0, raddr, rmask,
	2841	&counter, af)) {
	2842	goto get_addr;
	2843	}
	2844
	2845	try_next:
	2846	if ((cur = TAILQ_NEXT(cur, entries)) == NULL)
	2847	cur = TAILQ_FIRST(&rpool->list);
	2848	if (cur->addr.type == PF_ADDR_TABLE) {
	2849	tblidx = -1;
	2850	if (pfr_pool_get(cur->addr.p.tbl,
	2851	&tblidx, &counter,
	2852	&raddr, &rmask, af)) {
	2853	/* table contains no address of type 'af' */
	2854	if (cur != acur)
	2855	goto try_next;
	2856	return (1);
	2857	}
	2858	} else if (cur->addr.type == PF_ADDR_DYNIFTL) {
	2859	tblidx = -1;
	2860	if (pfr_pool_get(cur->addr.p.dyn->pfid_kt,
	2861	&tblidx, &counter,
	2862	&raddr, &rmask, af)) {
	2863	/* table contains no address of type 'af' */
	2864	if (cur != acur)
	2865	goto try_next;
	2866	return (1);
	2867	}
	2868	} else {
	2869	raddr = &cur->addr.v.a.addr;
	2870	rmask = &cur->addr.v.a.mask;
	2871	PF_ACPY(&counter, raddr, af);
	2872	}
	2873
	2874	get_addr:
	2875	rpool->cur = cur;
	2876	rpool->tblidx = tblidx;
	2877	PF_ACPY(naddr, &counter, af);
	2878	if (init_addr != NULL && PF_AZERO(init_addr, af))
	2879	PF_ACPY(init_addr, naddr, af);
	2880	PF_AINC(&counter, af);
	2881	rpool->counter = counter;
	2882	break;
	2883	}
	2884	if (*sn != NULL)
	2885	PF_ACPY(&(*sn)->raddr, naddr, af);
	2886
	2887	if (pf_status.debug >= PF_DEBUG_MISC &&
	2888	(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
	2889	kprintf("pf_map_addr: selected address ");
	2890	pf_print_host(naddr, 0, af);
	2891	kprintf("\n");
	2892	}
	2893
	2894	return (0);
	2895	}
	2896
	2897	int
	2898	pf_get_sport(struct pf_pdesc *pd, sa_family_t af,
	2899	u_int8_t proto, struct pf_rule *r,
	2900	struct pf_addr saddr, struct pf_addr daddr,
	2901	u_int16_t sport, u_int16_t dport,
	2902	struct pf_addr naddr, u_int16_t nport,
	2903	u_int16_t low, u_int16_t high, struct pf_src_node **sn)
	2904	{
	2905	struct pf_state_key_cmp key;
	2906	struct pf_addr init_addr;
	2907	u_int16_t cut;
	2908	u_int32_t hash_base = 0;
	2909	int do_hash = 0;
	2910
	2911	bzero(&init_addr, sizeof(init_addr));
	2912	if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
	2913	return (1);
	2914
	2915	if (proto == IPPROTO_ICMP) {
	2916	low = 1;
	2917	high = 65535;
	2918	}
	2919
	2920	bzero(&key, sizeof(key));
	2921	key.af = af;
	2922	key.proto = proto;
	2923	key.port[0] = dport;
	2924	PF_ACPY(&key.addr[0], daddr, key.af);
	2925
	2926	do {
	2927	PF_ACPY(&key.addr[1], naddr, key.af);
	2928
	2929	/*
	2930	* We want to select a port that calculates to a toeplitz hash
	2931	* that masks to the same cpu, otherwise the response may
	2932	* not see the new state.
	2933	*
	2934	* We can still do this even if the kernel is disregarding
	2935	* the hash and vectoring the packets to a specific cpu,
	2936	* but it will reduce the number of ports we can use.
	2937	*/
	2938	switch(af) {
	2939	case AF_INET:
	2940	if (proto == IPPROTO_TCP) {
	2941	do_hash = 1;
	2942	hash_base = toeplitz_piecemeal_port(dport) ^
	2943	toeplitz_piecemeal_addr(daddr->v4.s_addr) ^
	2944	toeplitz_piecemeal_addr(naddr->v4.s_addr);
	2945	}
	2946	break;
	2947	case AF_INET6:
	2948	/* XXX TODO XXX */
	2949	default:
	2950	/* XXX TODO XXX */
	2951	break;
	2952	}
	2953
	2954	/*
	2955	* port search; start random, step;
	2956	* similar 2 portloop in in_pcbbind
	2957	*
	2958	* WARNING! We try to match such that the kernel will
	2959	* dispatch the translated host/port to the same
	2960	* cpu, but this might not be possible.
	2961	*
	2962	* In the case where the port is fixed, or for the
	2963	* UDP case (whos toeplitz does not incorporate the
	2964	* port), we set not_cpu_localized which ultimately
	2965	* causes the pf_state_tree element
	2966	*
	2967	* XXX fixed ports present a problem for cpu localization.
	2968	*/
	2969	if (!(proto == IPPROTO_TCP \|\|
	2970	proto == IPPROTO_UDP \|\|
	2971	proto == IPPROTO_ICMP)) {
	2972	/*
	2973	* non-specific protocol, leave port intact.
	2974	*/
	2975	key.port[1] = sport;
	2976	if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
	2977	*nport = sport;
	2978	pd->not_cpu_localized = 1;
	2979	return (0);
	2980	}
	2981	} else if (low == 0 && high == 0) {
	2982	/*
	2983	* static-port same as originator.
	2984	*/
	2985	key.port[1] = sport;
	2986	if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
	2987	*nport = sport;
	2988	pd->not_cpu_localized = 1;
	2989	return (0);
	2990	}
	2991	} else if (low == high) {
	2992	/*
	2993	* specific port as specified.
	2994	*/
	2995	key.port[1] = htons(low);
	2996	if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
	2997	*nport = htons(low);
	2998	pd->not_cpu_localized = 1;
	2999	return (0);
	3000	}
	3001	} else {
	3002	/*
	3003	* normal dynamic port
	3004	*/
	3005	u_int16_t tmp;
	3006
	3007	if (low > high) {
	3008	tmp = low;
	3009	low = high;
	3010	high = tmp;
	3011	}
	3012	/* low < high */
	3013	cut = htonl(karc4random()) % (1 + high - low) + low;
	3014	/* low <= cut <= high */
	3015	for (tmp = cut; tmp <= high; ++(tmp)) {
	3016	key.port[1] = htons(tmp);
	3017	if (do_hash) {
	3018	uint32_t hash;
	3019
	3020	hash = hash_base ^
	3021	toeplitz_piecemeal_port(key.port[1]);
	3022	if (netisr_hashcpu(hash) != mycpuid)
	3023	continue;
	3024	}
	3025	if (pf_find_state_all(&key, PF_IN, NULL) ==
	3026	NULL && !in_baddynamic(tmp, proto)) {
	3027	if (proto == IPPROTO_UDP)
	3028	pd->not_cpu_localized = 1;
	3029	*nport = htons(tmp);
	3030	return (0);
	3031	}
	3032	}
	3033	for (tmp = cut - 1; tmp >= low; --(tmp)) {
	3034	key.port[1] = htons(tmp);
	3035	if (do_hash) {
	3036	uint32_t hash;
	3037
	3038	hash = hash_base ^
	3039	toeplitz_piecemeal_port(key.port[1]);
	3040	if (netisr_hashcpu(hash) != mycpuid)
	3041	continue;
	3042	}
	3043	if (pf_find_state_all(&key, PF_IN, NULL) ==
	3044	NULL && !in_baddynamic(tmp, proto)) {
	3045	if (proto == IPPROTO_UDP)
	3046	pd->not_cpu_localized = 1;
	3047	*nport = htons(tmp);
	3048	return (0);
	3049	}
	3050	}
	3051	}
	3052
	3053	/*
	3054	* Next address
	3055	*/
	3056	switch (r->rpool.opts & PF_POOL_TYPEMASK) {
	3057	case PF_POOL_RANDOM:
	3058	case PF_POOL_ROUNDROBIN:
	3059	if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
	3060	return (1);
	3061	break;
	3062	case PF_POOL_NONE:
	3063	case PF_POOL_SRCHASH:
	3064	case PF_POOL_BITMASK:
	3065	default:
	3066	return (1);
	3067	}
	3068	} while (! PF_AEQ(&init_addr, naddr, af) );
	3069	return (1); /* none available */
	3070	}
	3071
	3072	struct pf_rule *
	3073	pf_match_translation(struct pf_pdesc pd, struct mbuf m, int off,
	3074	int direction, struct pfi_kif kif, struct pf_addr saddr, u_int16_t sport,
	3075	struct pf_addr *daddr, u_int16_t dport, int rs_num)
	3076	{
	3077	struct pf_rule r, rm = NULL;
	3078	struct pf_ruleset *ruleset = NULL;
	3079	int tag = -1;
	3080	int rtableid = -1;
	3081	int asd = 0;
	3082
	3083	r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
	3084	while (r && rm == NULL) {
	3085	struct pf_rule_addr src = NULL, dst = NULL;
	3086	struct pf_addr_wrap *xdst = NULL;
	3087	struct pf_pooladdr *cur;
	3088
	3089	if (r->action == PF_BINAT && direction == PF_IN) {
	3090	src = &r->dst;
	3091	cur = r->rpool.cur; /* SMP race possible */
	3092	cpu_ccfence();
	3093	if (cur)
	3094	xdst = &cur->addr;
	3095	} else {
	3096	src = &r->src;
	3097	dst = &r->dst;
	3098	}
	3099
	3100	r->evaluations++;
	3101	if (pfi_kif_match(r->kif, kif) == r->ifnot)
	3102	r = r->skip[PF_SKIP_IFP].ptr;
	3103	else if (r->direction && r->direction != direction)
	3104	r = r->skip[PF_SKIP_DIR].ptr;
	3105	else if (r->af && r->af != pd->af)
	3106	r = r->skip[PF_SKIP_AF].ptr;
	3107	else if (r->proto && r->proto != pd->proto)
	3108	r = r->skip[PF_SKIP_PROTO].ptr;
	3109	else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
	3110	src->neg, kif))
	3111	r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
	3112	PF_SKIP_DST_ADDR].ptr;
	3113	else if (src->port_op && !pf_match_port(src->port_op,
	3114	src->port[0], src->port[1], sport))
	3115	r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
	3116	PF_SKIP_DST_PORT].ptr;
	3117	else if (dst != NULL &&
	3118	PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
	3119	r = r->skip[PF_SKIP_DST_ADDR].ptr;
	3120	else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
	3121	0, NULL))
	3122	r = TAILQ_NEXT(r, entries);
	3123	else if (dst != NULL && dst->port_op &&
	3124	!pf_match_port(dst->port_op, dst->port[0],
	3125	dst->port[1], dport))
	3126	r = r->skip[PF_SKIP_DST_PORT].ptr;
	3127	else if (r->match_tag && !pf_match_tag(m, r, &tag))
	3128	r = TAILQ_NEXT(r, entries);
	3129	else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
	3130	IPPROTO_TCP \|\| !pf_osfp_match(pf_osfp_fingerprint(pd, m,
	3131	off, pd->hdr.tcp), r->os_fingerprint)))
	3132	r = TAILQ_NEXT(r, entries);
	3133	else {
	3134	if (r->tag)
	3135	tag = r->tag;
	3136	if (r->rtableid >= 0)
	3137	rtableid = r->rtableid;
	3138	if (r->anchor == NULL) {
	3139	rm = r;
	3140	} else
	3141	pf_step_into_anchor(&asd, &ruleset, rs_num,
	3142	&r, NULL, NULL);
	3143	}
	3144	if (r == NULL)
	3145	pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
	3146	NULL, NULL);
	3147	}
	3148	if (pf_tag_packet(m, tag, rtableid))
	3149	return (NULL);
	3150	if (rm != NULL && (rm->action == PF_NONAT \|\|
	3151	rm->action == PF_NORDR \|\| rm->action == PF_NOBINAT))
	3152	return (NULL);
	3153	return (rm);
	3154	}
	3155
	3156	struct pf_rule *
	3157	pf_get_translation(struct pf_pdesc pd, struct mbuf m, int off, int direction,
	3158	struct pfi_kif kif, struct pf_src_node *sn,
	3159	struct pf_state_key skw, struct pf_state_key sks,
	3160	struct pf_state_key skp, struct pf_state_key nkp,
	3161	struct pf_addr saddr, struct pf_addr daddr,
	3162	u_int16_t sport, u_int16_t dport)
	3163	{
	3164	struct pf_rule *r = NULL;
	3165
	3166	if (direction == PF_OUT) {
	3167	r = pf_match_translation(pd, m, off, direction, kif, saddr,
	3168	sport, daddr, dport, PF_RULESET_BINAT);
	3169	if (r == NULL)
	3170	r = pf_match_translation(pd, m, off, direction, kif,
	3171	saddr, sport, daddr, dport, PF_RULESET_NAT);
	3172	} else {
	3173	r = pf_match_translation(pd, m, off, direction, kif, saddr,
	3174	sport, daddr, dport, PF_RULESET_RDR);
	3175	if (r == NULL)
	3176	r = pf_match_translation(pd, m, off, direction, kif,
	3177	saddr, sport, daddr, dport, PF_RULESET_BINAT);
	3178	}
	3179
	3180	if (r != NULL) {
	3181	struct pf_addr *naddr;
	3182	u_int16_t *nport;
	3183
	3184	if (pf_state_key_setup(pd, r, skw, sks, skp, nkp,
	3185	saddr, daddr, sport, dport))
	3186	return r;
	3187
	3188	/* XXX We only modify one side for now. */
	3189	naddr = &(*nkp)->addr[1];
	3190	nport = &(*nkp)->port[1];
	3191
	3192	/*
	3193	* NOTE: Currently all translations will clear
	3194	* BRIDGE_MBUF_TAGGED, telling the bridge to
	3195	* ignore the original input encapsulation.
	3196	*/
	3197	switch (r->action) {
	3198	case PF_NONAT:
	3199	case PF_NOBINAT:
	3200	case PF_NORDR:
	3201	return (NULL);
	3202	case PF_NAT:
	3203	m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
	3204	if (pf_get_sport(pd, pd->af, pd->proto, r,
	3205	saddr, daddr, sport, dport,
	3206	naddr, nport, r->rpool.proxy_port[0],
	3207	r->rpool.proxy_port[1], sn)) {
	3208	DPFPRINTF(PF_DEBUG_MISC,
	3209	("pf: NAT proxy port allocation "
	3210	"(%u-%u) failed\n",
	3211	r->rpool.proxy_port[0],
	3212	r->rpool.proxy_port[1]));
	3213	return (NULL);
	3214	}
	3215	break;
	3216	case PF_BINAT:
	3217	m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
	3218	switch (direction) {
	3219	case PF_OUT:
	3220	if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
	3221	switch (pd->af) {
	3222	#ifdef INET
	3223	case AF_INET:
	3224	if (r->rpool.cur->addr.p.dyn->
	3225	pfid_acnt4 < 1)
	3226	return (NULL);
	3227	PF_POOLMASK(naddr,
	3228	&r->rpool.cur->addr.p.dyn->
	3229	pfid_addr4,
	3230	&r->rpool.cur->addr.p.dyn->
	3231	pfid_mask4,
	3232	saddr, AF_INET);
	3233	break;
	3234	#endif /* INET */
	3235	#ifdef INET6
	3236	case AF_INET6:
	3237	if (r->rpool.cur->addr.p.dyn->
	3238	pfid_acnt6 < 1)
	3239	return (NULL);
	3240	PF_POOLMASK(naddr,
	3241	&r->rpool.cur->addr.p.dyn->
	3242	pfid_addr6,
	3243	&r->rpool.cur->addr.p.dyn->
	3244	pfid_mask6,
	3245	saddr, AF_INET6);
	3246	break;
	3247	#endif /* INET6 */
	3248	}
	3249	} else
	3250	PF_POOLMASK(naddr,
	3251	&r->rpool.cur->addr.v.a.addr,
	3252	&r->rpool.cur->addr.v.a.mask,
	3253	saddr, pd->af);
	3254	break;
	3255	case PF_IN:
	3256	if (r->src.addr.type == PF_ADDR_DYNIFTL) {
	3257	switch (pd->af) {
	3258	#ifdef INET
	3259	case AF_INET:
	3260	if (r->src.addr.p.dyn->
	3261	pfid_acnt4 < 1)
	3262	return (NULL);
	3263	PF_POOLMASK(naddr,
	3264	&r->src.addr.p.dyn->
	3265	pfid_addr4,
	3266	&r->src.addr.p.dyn->
	3267	pfid_mask4,
	3268	daddr, AF_INET);
	3269	break;
	3270	#endif /* INET */
	3271	#ifdef INET6
	3272	case AF_INET6:
	3273	if (r->src.addr.p.dyn->
	3274	pfid_acnt6 < 1)
	3275	return (NULL);
	3276	PF_POOLMASK(naddr,
	3277	&r->src.addr.p.dyn->
	3278	pfid_addr6,
	3279	&r->src.addr.p.dyn->
	3280	pfid_mask6,
	3281	daddr, AF_INET6);
	3282	break;
	3283	#endif /* INET6 */
	3284	}
	3285	} else
	3286	PF_POOLMASK(naddr,
	3287	&r->src.addr.v.a.addr,
	3288	&r->src.addr.v.a.mask, daddr,
	3289	pd->af);
	3290	break;
	3291	}
	3292	break;
	3293	case PF_RDR: {
	3294	m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
	3295	if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
	3296	return (NULL);
	3297	if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
	3298	PF_POOL_BITMASK)
	3299	PF_POOLMASK(naddr, naddr,
	3300	&r->rpool.cur->addr.v.a.mask, daddr,
	3301	pd->af);
	3302
	3303	if (r->rpool.proxy_port[1]) {
	3304	u_int32_t tmp_nport;
	3305
	3306	tmp_nport = ((ntohs(dport) -
	3307	ntohs(r->dst.port[0])) %
	3308	(r->rpool.proxy_port[1] -
	3309	r->rpool.proxy_port[0] + 1)) +
	3310	r->rpool.proxy_port[0];
	3311
	3312	/* wrap around if necessary */
	3313	if (tmp_nport > 65535)
	3314	tmp_nport -= 65535;
	3315	*nport = htons((u_int16_t)tmp_nport);
	3316	} else if (r->rpool.proxy_port[0]) {
	3317	*nport = htons(r->rpool.proxy_port[0]);
	3318	}
	3319	pd->not_cpu_localized = 1;
	3320	break;
	3321	}
	3322	default:
	3323	return (NULL);
	3324	}
	3325	}
	3326
	3327	return (r);
	3328	}
	3329
	3330	struct netmsg_hashlookup {
	3331	struct netmsg_base base;
	3332	struct inpcb **nm_pinp;
	3333	struct inpcbinfo *nm_pcbinfo;
	3334	struct pf_addr *nm_saddr;
	3335	struct pf_addr *nm_daddr;
	3336	uint16_t nm_sport;
	3337	uint16_t nm_dport;
	3338	sa_family_t nm_af;
	3339	};
	3340
	3341	#ifdef PF_SOCKET_LOOKUP_DOMSG
	3342	static void
	3343	in_pcblookup_hash_handler(netmsg_t msg)
	3344	{
	3345	struct netmsg_hashlookup rmsg = (struct netmsg_hashlookup )msg;
	3346
	3347	if (rmsg->nm_af == AF_INET)
	3348	*rmsg->nm_pinp = in_pcblookup_hash(rmsg->nm_pcbinfo,
	3349	rmsg->nm_saddr->v4, rmsg->nm_sport, rmsg->nm_daddr->v4,
	3350	rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
	3351	#ifdef INET6
	3352	else
	3353	*rmsg->nm_pinp = in6_pcblookup_hash(rmsg->nm_pcbinfo,
	3354	&rmsg->nm_saddr->v6, rmsg->nm_sport, &rmsg->nm_daddr->v6,
	3355	rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
	3356	#endif /* INET6 */
	3357	lwkt_replymsg(&rmsg->base.lmsg, 0);
	3358	}
	3359	#endif /* PF_SOCKET_LOOKUP_DOMSG */
	3360
	3361	int
	3362	pf_socket_lookup(int direction, struct pf_pdesc *pd)
	3363	{
	3364	struct pf_addr saddr, daddr;
	3365	u_int16_t sport, dport;
	3366	struct inpcbinfo *pi;
	3367	struct inpcb *inp;
	3368	struct netmsg_hashlookup *msg = NULL;
	3369	#ifdef PF_SOCKET_LOOKUP_DOMSG
	3370	struct netmsg_hashlookup msg0;
	3371	#endif
	3372	int pi_cpu = 0;
	3373
	3374	if (pd == NULL)
	3375	return (-1);
	3376	pd->lookup.uid = UID_MAX;
	3377	pd->lookup.gid = GID_MAX;
	3378	pd->lookup.pid = NO_PID;
	3379	if (direction == PF_IN) {
	3380	saddr = pd->src;
	3381	daddr = pd->dst;
	3382	} else {
	3383	saddr = pd->dst;
	3384	daddr = pd->src;
	3385	}
	3386	switch (pd->proto) {
	3387	case IPPROTO_TCP:
	3388	if (pd->hdr.tcp == NULL)
	3389	return (-1);
	3390	sport = pd->hdr.tcp->th_sport;
	3391	dport = pd->hdr.tcp->th_dport;
	3392
	3393	pi_cpu = tcp_addrcpu(saddr->v4.s_addr, sport, daddr->v4.s_addr, dport);
	3394	pi = &tcbinfo[pi_cpu];
	3395	/*
	3396	* Our netstack runs lockless on MP systems
	3397	* (only for TCP connections at the moment).
	3398	*
	3399	* As we are not allowed to read another CPU's tcbinfo,
	3400	* we have to ask that CPU via remote call to search the
	3401	* table for us.
	3402	*
	3403	* Prepare a msg iff data belongs to another CPU.
	3404	*/
	3405	if (pi_cpu != mycpu->gd_cpuid) {
	3406	#ifdef PF_SOCKET_LOOKUP_DOMSG
	3407	/*
	3408	* NOTE:
	3409	*
	3410	* Following lwkt_domsg() is dangerous and could
	3411	* lockup the network system, e.g.
	3412	*
	3413	* On 2 CPU system:
	3414	* netisr0 domsg to netisr1 (due to lookup)
	3415	* netisr1 domsg to netisr0 (due to lookup)
	3416	*
	3417	* We simply return -1 here, since we are probably
	3418	* called before NAT, so the TCP packet should
	3419	* already be on the correct CPU.
	3420	*/
	3421	msg = &msg0;
	3422	netmsg_init(&msg->base, NULL, &curthread->td_msgport,
	3423	0, in_pcblookup_hash_handler);
	3424	msg->nm_pinp = &inp;
	3425	msg->nm_pcbinfo = pi;
	3426	msg->nm_saddr = saddr;
	3427	msg->nm_sport = sport;
	3428	msg->nm_daddr = daddr;
	3429	msg->nm_dport = dport;
	3430	msg->nm_af = pd->af;
	3431	#else /* !PF_SOCKET_LOOKUP_DOMSG */
	3432	kprintf("pf_socket_lookup: tcp packet not on the "
	3433	"correct cpu %d, cur cpu %d\n",
	3434	pi_cpu, mycpuid);
	3435	print_backtrace(-1);
	3436	return -1;
	3437	#endif /* PF_SOCKET_LOOKUP_DOMSG */
	3438	}
	3439	break;
	3440	case IPPROTO_UDP:
	3441	if (pd->hdr.udp == NULL)
	3442	return (-1);
	3443	sport = pd->hdr.udp->uh_sport;
	3444	dport = pd->hdr.udp->uh_dport;
	3445	pi = &udbinfo[mycpuid];
	3446	break;
	3447	default:
	3448	return (-1);
	3449	}
	3450	if (direction != PF_IN) {
	3451	u_int16_t p;
	3452
	3453	p = sport;
	3454	sport = dport;
	3455	dport = p;
	3456	}
	3457	switch (pd->af) {
	3458	#ifdef INET6
	3459	case AF_INET6:
	3460	/*
	3461	* Query other CPU, second part
	3462	*
	3463	* msg only gets initialized when:
	3464	* 1) packet is TCP
	3465	* 2) the info belongs to another CPU
	3466	*
	3467	* Use some switch/case magic to avoid code duplication.
	3468	*/
	3469	if (msg == NULL) {
	3470	inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
	3471	&daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
	3472
	3473	if (inp == NULL)
	3474	return (-1);
	3475	break;
	3476	}
	3477	/* FALLTHROUGH if SMP and on other CPU */
	3478	#endif /* INET6 */
	3479	case AF_INET:
	3480	if (msg != NULL) {
	3481	lwkt_domsg(netisr_cpuport(pi_cpu),
	3482	&msg->base.lmsg, 0);
	3483	} else
	3484	{
	3485	inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
	3486	dport, INPLOOKUP_WILDCARD, NULL);
	3487	}
	3488	if (inp == NULL)
	3489	return (-1);
	3490	break;
	3491
	3492	default:
	3493	return (-1);
	3494	}
	3495	pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
	3496	pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
	3497	return (1);
	3498	}
	3499
	3500	u_int8_t
	3501	pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
	3502	{
	3503	int hlen;
	3504	u_int8_t hdr[60];
	3505	u_int8_t *opt, optlen;
	3506	u_int8_t wscale = 0;
	3507
	3508	hlen = th_off << 2; /* hlen <= sizeof(hdr) */
	3509	if (hlen <= sizeof(struct tcphdr))
	3510	return (0);
	3511	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
	3512	return (0);
	3513	opt = hdr + sizeof(struct tcphdr);
	3514	hlen -= sizeof(struct tcphdr);
	3515	while (hlen >= 3) {
	3516	switch (*opt) {
	3517	case TCPOPT_EOL:
	3518	case TCPOPT_NOP:
	3519	++opt;
	3520	--hlen;
	3521	break;
	3522	case TCPOPT_WINDOW:
	3523	wscale = opt[2];
	3524	if (wscale > TCP_MAX_WINSHIFT)
	3525	wscale = TCP_MAX_WINSHIFT;
	3526	wscale \|= PF_WSCALE_FLAG;
	3527	/* FALLTHROUGH */
	3528	default:
	3529	optlen = opt[1];
	3530	if (optlen < 2)
	3531	optlen = 2;
	3532	hlen -= optlen;
	3533	opt += optlen;
	3534	break;
	3535	}
	3536	}
	3537	return (wscale);
	3538	}
	3539
	3540	u_int16_t
	3541	pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
	3542	{
	3543	int hlen;
	3544	u_int8_t hdr[60];
	3545	u_int8_t *opt, optlen;
	3546	u_int16_t mss = tcp_mssdflt;
	3547
	3548	hlen = th_off << 2; /* hlen <= sizeof(hdr) */
	3549	if (hlen <= sizeof(struct tcphdr))
	3550	return (0);
	3551	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
	3552	return (0);
	3553	opt = hdr + sizeof(struct tcphdr);
	3554	hlen -= sizeof(struct tcphdr);
	3555	while (hlen >= TCPOLEN_MAXSEG) {
	3556	switch (*opt) {
	3557	case TCPOPT_EOL:
	3558	case TCPOPT_NOP:
	3559	++opt;
	3560	--hlen;
	3561	break;
	3562	case TCPOPT_MAXSEG:
	3563	bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
	3564	/* FALLTHROUGH */
	3565	default:
	3566	optlen = opt[1];
	3567	if (optlen < 2)
	3568	optlen = 2;
	3569	hlen -= optlen;
	3570	opt += optlen;
	3571	break;
	3572	}
	3573	}
	3574	return (mss);
	3575	}
	3576
	3577	u_int16_t
	3578	pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
	3579	{
	3580	#ifdef INET
	3581	struct sockaddr_in *dst;
	3582	struct route ro;
	3583	#endif /* INET */
	3584	#ifdef INET6
	3585	struct sockaddr_in6 *dst6;
	3586	struct route_in6 ro6;
	3587	#endif /* INET6 */
	3588	struct rtentry *rt = NULL;
	3589	int hlen = 0;
	3590	u_int16_t mss = tcp_mssdflt;
	3591
	3592	switch (af) {
	3593	#ifdef INET
	3594	case AF_INET:
	3595	hlen = sizeof(struct ip);
	3596	bzero(&ro, sizeof(ro));
	3597	dst = (struct sockaddr_in *)&ro.ro_dst;
	3598	dst->sin_family = AF_INET;
	3599	dst->sin_len = sizeof(*dst);
	3600	dst->sin_addr = addr->v4;
	3601	rtalloc_ign(&ro, (RTF_CLONING \| RTF_PRCLONING));
	3602	rt = ro.ro_rt;
	3603	break;
	3604	#endif /* INET */
	3605	#ifdef INET6
	3606	case AF_INET6:
	3607	hlen = sizeof(struct ip6_hdr);
	3608	bzero(&ro6, sizeof(ro6));
	3609	dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
	3610	dst6->sin6_family = AF_INET6;
	3611	dst6->sin6_len = sizeof(*dst6);
	3612	dst6->sin6_addr = addr->v6;
	3613	rtalloc_ign((struct route *)&ro6, (RTF_CLONING \| RTF_PRCLONING));
	3614	rt = ro6.ro_rt;
	3615	break;
	3616	#endif /* INET6 */
	3617	}
	3618
	3619	if (rt && rt->rt_ifp) {
	3620	mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
	3621	mss = max(tcp_mssdflt, mss);
	3622	RTFREE(rt);
	3623	}
	3624	mss = min(mss, offer);
	3625	mss = max(mss, 64); /* sanity - at least max opt space */
	3626	return (mss);
	3627	}
	3628
	3629	void
	3630	pf_set_rt_ifp(struct pf_state s, struct pf_addr saddr)
	3631	{
	3632	struct pf_rule *r = s->rule.ptr;
	3633
	3634	s->rt_kif = NULL;
	3635	if (!r->rt \|\| r->rt == PF_FASTROUTE)
	3636	return;
	3637	switch (s->key[PF_SK_WIRE]->af) {
	3638	#ifdef INET
	3639	case AF_INET:
	3640	pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL,
	3641	&s->nat_src_node);
	3642	s->rt_kif = r->rpool.cur->kif;
	3643	break;
	3644	#endif /* INET */
	3645	#ifdef INET6
	3646	case AF_INET6:
	3647	pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL,
	3648	&s->nat_src_node);
	3649	s->rt_kif = r->rpool.cur->kif;
	3650	break;
	3651	#endif /* INET6 */
	3652	}
	3653	}
	3654
	3655	u_int32_t
	3656	pf_tcp_iss(struct pf_pdesc *pd)
	3657	{
	3658	MD5_CTX ctx;
	3659	u_int32_t digest[4];
	3660
	3661	if (pf_tcp_secret_init == 0) {
	3662	lwkt_gettoken(&pf_gtoken);
	3663	if (pf_tcp_secret_init == 0) {
	3664	karc4random_buf(pf_tcp_secret, sizeof(pf_tcp_secret));
	3665	MD5Init(&pf_tcp_secret_ctx);
	3666	MD5Update(&pf_tcp_secret_ctx, pf_tcp_secret,
	3667	sizeof(pf_tcp_secret));
	3668	pf_tcp_secret_init = 1;
	3669	}
	3670	lwkt_reltoken(&pf_gtoken);
	3671	}
	3672	ctx = pf_tcp_secret_ctx;
	3673
	3674	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
	3675	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
	3676	if (pd->af == AF_INET6) {
	3677	MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
	3678	MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
	3679	} else {
	3680	MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
	3681	MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
	3682	}
	3683	MD5Final((u_char *)digest, &ctx);
	3684	pf_tcp_iss_off += 4096;
	3685
	3686	return (digest[0] + pd->hdr.tcp->th_seq + pf_tcp_iss_off);
	3687	}
	3688
	3689	int
	3690	pf_test_rule(struct pf_rule rm, struct pf_state sm, int direction,
	3691	struct pfi_kif kif, struct mbuf m, int off, void *h,
	3692	struct pf_pdesc pd, struct pf_rule am, struct pf_ruleset *rsm,
	3693	struct ifqueue ifq, struct inpcb inp)
	3694	{
	3695	struct pf_rule *nr = NULL;
	3696	struct pf_addr saddr = pd->src, daddr = pd->dst;
	3697	sa_family_t af = pd->af;
	3698	struct pf_rule r, a = NULL;
	3699	struct pf_ruleset *ruleset = NULL;
	3700	struct pf_src_node *nsn = NULL;
	3701	struct tcphdr *th = pd->hdr.tcp;
	3702	struct pf_state_key skw = NULL, sks = NULL;
	3703	struct pf_state_key sk = NULL, nk = NULL;
	3704	u_short reason;
	3705	int rewrite = 0, hdrlen = 0;
	3706	int tag = -1, rtableid = -1;
	3707	int asd = 0;
	3708	int match = 0;
	3709	int state_icmp = 0;
	3710	u_int16_t sport = 0, dport = 0;
	3711	u_int16_t bproto_sum = 0, bip_sum = 0;
	3712	u_int8_t icmptype = 0, icmpcode = 0;
	3713
	3714
	3715	if (direction == PF_IN && pf_check_congestion(ifq)) {
	3716	REASON_SET(&reason, PFRES_CONGEST);
	3717	return (PF_DROP);
	3718	}
	3719
	3720	if (inp != NULL)
	3721	pd->lookup.done = pf_socket_lookup(direction, pd);
	3722	else if (debug_pfugidhack) {
	3723	DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
	3724	pd->lookup.done = pf_socket_lookup(direction, pd);
	3725	}
	3726
	3727	switch (pd->proto) {
	3728	case IPPROTO_TCP:
	3729	sport = th->th_sport;
	3730	dport = th->th_dport;
	3731	hdrlen = sizeof(*th);
	3732	break;
	3733	case IPPROTO_UDP:
	3734	sport = pd->hdr.udp->uh_sport;
	3735	dport = pd->hdr.udp->uh_dport;
	3736	hdrlen = sizeof(*pd->hdr.udp);
	3737	break;
	3738	#ifdef INET
	3739	case IPPROTO_ICMP:
	3740	if (pd->af != AF_INET)
	3741	break;
	3742	sport = dport = pd->hdr.icmp->icmp_id;
	3743	hdrlen = sizeof(*pd->hdr.icmp);
	3744	icmptype = pd->hdr.icmp->icmp_type;
	3745	icmpcode = pd->hdr.icmp->icmp_code;
	3746
	3747	if (icmptype == ICMP_UNREACH \|\|
	3748	icmptype == ICMP_SOURCEQUENCH \|\|
	3749	icmptype == ICMP_REDIRECT \|\|
	3750	icmptype == ICMP_TIMXCEED \|\|
	3751	icmptype == ICMP_PARAMPROB)
	3752	state_icmp++;
	3753	break;
	3754	#endif /* INET */
	3755	#ifdef INET6
	3756	case IPPROTO_ICMPV6:
	3757	if (af != AF_INET6)
	3758	break;
	3759	sport = dport = pd->hdr.icmp6->icmp6_id;
	3760	hdrlen = sizeof(*pd->hdr.icmp6);
	3761	icmptype = pd->hdr.icmp6->icmp6_type;
	3762	icmpcode = pd->hdr.icmp6->icmp6_code;
	3763
	3764	if (icmptype == ICMP6_DST_UNREACH \|\|
	3765	icmptype == ICMP6_PACKET_TOO_BIG \|\|
	3766	icmptype == ICMP6_TIME_EXCEEDED \|\|
	3767	icmptype == ICMP6_PARAM_PROB)
	3768	state_icmp++;
	3769	break;
	3770	#endif /* INET6 */
	3771	default:
	3772	sport = dport = hdrlen = 0;
	3773	break;
	3774	}
	3775
	3776	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
	3777
	3778	/* check packet for BINAT/NAT/RDR */
	3779	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn,
	3780	&skw, &sks, &sk, &nk, saddr, daddr, sport, dport)) != NULL) {
	3781	if (nk == NULL \|\| sk == NULL) {
	3782	REASON_SET(&reason, PFRES_MEMORY);
	3783	goto cleanup;
	3784	}
	3785
	3786	if (pd->ip_sum)
	3787	bip_sum = *pd->ip_sum;
	3788
	3789	m->m_flags &= ~M_HASH;
	3790	switch (pd->proto) {
	3791	case IPPROTO_TCP:
	3792	bproto_sum = th->th_sum;
	3793	pd->proto_sum = &th->th_sum;
	3794
	3795	if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) \|\|
	3796	nk->port[pd->sidx] != sport) {
	3797	pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
	3798	&th->th_sum, &nk->addr[pd->sidx],
	3799	nk->port[pd->sidx], 0, af);
	3800	pd->sport = &th->th_sport;
	3801	sport = th->th_sport;
	3802	}
	3803
	3804	if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) \|\|
	3805	nk->port[pd->didx] != dport) {
	3806	pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
	3807	&th->th_sum, &nk->addr[pd->didx],
	3808	nk->port[pd->didx], 0, af);
	3809	dport = th->th_dport;
	3810	pd->dport = &th->th_dport;
	3811	}
	3812	rewrite++;
	3813	break;
	3814	case IPPROTO_UDP:
	3815	bproto_sum = pd->hdr.udp->uh_sum;
	3816	pd->proto_sum = &pd->hdr.udp->uh_sum;
	3817
	3818	if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) \|\|
	3819	nk->port[pd->sidx] != sport) {
	3820	pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
	3821	pd->ip_sum, &pd->hdr.udp->uh_sum,
	3822	&nk->addr[pd->sidx],
	3823	nk->port[pd->sidx], 1, af);
	3824	sport = pd->hdr.udp->uh_sport;
	3825	pd->sport = &pd->hdr.udp->uh_sport;
	3826	}
	3827
	3828	if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) \|\|
	3829	nk->port[pd->didx] != dport) {
	3830	pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
	3831	pd->ip_sum, &pd->hdr.udp->uh_sum,
	3832	&nk->addr[pd->didx],
	3833	nk->port[pd->didx], 1, af);
	3834	dport = pd->hdr.udp->uh_dport;
	3835	pd->dport = &pd->hdr.udp->uh_dport;
	3836	}
	3837	rewrite++;
	3838	break;
	3839	#ifdef INET
	3840	case IPPROTO_ICMP:
	3841	nk->port[0] = nk->port[1];
	3842	if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
	3843	pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
	3844	nk->addr[pd->sidx].v4.s_addr, 0);
	3845
	3846	if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
	3847	pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
	3848	nk->addr[pd->didx].v4.s_addr, 0);
	3849
	3850	if (nk->port[1] != pd->hdr.icmp->icmp_id) {
	3851	pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
	3852	pd->hdr.icmp->icmp_cksum, sport,
	3853	nk->port[1], 0);
	3854	pd->hdr.icmp->icmp_id = nk->port[1];
	3855	pd->sport = &pd->hdr.icmp->icmp_id;
	3856	}
	3857	m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
	3858	break;
	3859	#endif /* INET */
	3860	#ifdef INET6
	3861	case IPPROTO_ICMPV6:
	3862	nk->port[0] = nk->port[1];
	3863	if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
	3864	pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
	3865	&nk->addr[pd->sidx], 0);
	3866
	3867	if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
	3868	pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
	3869	&nk->addr[pd->didx], 0);
	3870	rewrite++;
	3871	break;
	3872	#endif /* INET */
	3873	default:
	3874	switch (af) {
	3875	#ifdef INET
	3876	case AF_INET:
	3877	if (PF_ANEQ(saddr,
	3878	&nk->addr[pd->sidx], AF_INET))
	3879	pf_change_a(&saddr->v4.s_addr,
	3880	pd->ip_sum,
	3881	nk->addr[pd->sidx].v4.s_addr, 0);
	3882
	3883	if (PF_ANEQ(daddr,
	3884	&nk->addr[pd->didx], AF_INET))
	3885	pf_change_a(&daddr->v4.s_addr,
	3886	pd->ip_sum,
	3887	nk->addr[pd->didx].v4.s_addr, 0);
	3888	break;
	3889	#endif /* INET */
	3890	#ifdef INET6
	3891	case AF_INET6:
	3892	if (PF_ANEQ(saddr,
	3893	&nk->addr[pd->sidx], AF_INET6))
	3894	PF_ACPY(saddr, &nk->addr[pd->sidx], af);
	3895
	3896	if (PF_ANEQ(daddr,
	3897	&nk->addr[pd->didx], AF_INET6))
	3898	PF_ACPY(saddr, &nk->addr[pd->didx], af);
	3899	break;
	3900	#endif /* INET */
	3901	}
	3902	break;
	3903	}
	3904	if (nr->natpass)
	3905	r = NULL;
	3906	pd->nat_rule = nr;
	3907	}
	3908
	3909	while (r != NULL) {
	3910	r->evaluations++;
	3911	if (pfi_kif_match(r->kif, kif) == r->ifnot)
	3912	r = r->skip[PF_SKIP_IFP].ptr;
	3913	else if (r->direction && r->direction != direction)
	3914	r = r->skip[PF_SKIP_DIR].ptr;
	3915	else if (r->af && r->af != af)
	3916	r = r->skip[PF_SKIP_AF].ptr;
	3917	else if (r->proto && r->proto != pd->proto)
	3918	r = r->skip[PF_SKIP_PROTO].ptr;
	3919	else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
	3920	r->src.neg, kif))
	3921	r = r->skip[PF_SKIP_SRC_ADDR].ptr;
	3922	/* tcp/udp only. port_op always 0 in other cases */
	3923	else if (r->src.port_op && !pf_match_port(r->src.port_op,
	3924	r->src.port[0], r->src.port[1], sport))
	3925	r = r->skip[PF_SKIP_SRC_PORT].ptr;
	3926	else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
	3927	r->dst.neg, NULL))
	3928	r = r->skip[PF_SKIP_DST_ADDR].ptr;
	3929	/* tcp/udp only. port_op always 0 in other cases */
	3930	else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
	3931	r->dst.port[0], r->dst.port[1], dport))
	3932	r = r->skip[PF_SKIP_DST_PORT].ptr;
	3933	/* icmp only. type always 0 in other cases */
	3934	else if (r->type && r->type != icmptype + 1)
	3935	r = TAILQ_NEXT(r, entries);
	3936	/* icmp only. type always 0 in other cases */
	3937	else if (r->code && r->code != icmpcode + 1)
	3938	r = TAILQ_NEXT(r, entries);
	3939	else if (r->tos && !(r->tos == pd->tos))
	3940	r = TAILQ_NEXT(r, entries);
	3941	else if (r->rule_flag & PFRULE_FRAGMENT)
	3942	r = TAILQ_NEXT(r, entries);
	3943	else if (pd->proto == IPPROTO_TCP &&
	3944	(r->flagset & th->th_flags) != r->flags)
	3945	r = TAILQ_NEXT(r, entries);
	3946	/* tcp/udp only. uid.op always 0 in other cases */
	3947	else if (r->uid.op && (pd->lookup.done \|\| (pd->lookup.done =
	3948	pf_socket_lookup(direction, pd), 1)) &&
	3949	!pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
	3950	pd->lookup.uid))
	3951	r = TAILQ_NEXT(r, entries);
	3952	/* tcp/udp only. gid.op always 0 in other cases */
	3953	else if (r->gid.op && (pd->lookup.done \|\| (pd->lookup.done =
	3954	pf_socket_lookup(direction, pd), 1)) &&
	3955	!pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
	3956	pd->lookup.gid))
	3957	r = TAILQ_NEXT(r, entries);
	3958	else if (r->prob &&
	3959	r->prob <= karc4random())
	3960	r = TAILQ_NEXT(r, entries);
	3961	else if (r->match_tag && !pf_match_tag(m, r, &tag))
	3962	r = TAILQ_NEXT(r, entries);
	3963	else if (r->os_fingerprint != PF_OSFP_ANY &&
	3964	(pd->proto != IPPROTO_TCP \|\| !pf_osfp_match(
	3965	pf_osfp_fingerprint(pd, m, off, th),
	3966	r->os_fingerprint)))
	3967	r = TAILQ_NEXT(r, entries);
	3968	else {
	3969	if (r->tag)
	3970	tag = r->tag;
	3971	if (r->rtableid >= 0)
	3972	rtableid = r->rtableid;
	3973	if (r->anchor == NULL) {
	3974	match = 1;
	3975	*rm = r;
	3976	*am = a;
	3977	*rsm = ruleset;
	3978	if ((*rm)->quick)
	3979	break;
	3980	r = TAILQ_NEXT(r, entries);
	3981	} else
	3982	pf_step_into_anchor(&asd, &ruleset,
	3983	PF_RULESET_FILTER, &r, &a, &match);
	3984	}
	3985	if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
	3986	PF_RULESET_FILTER, &r, &a, &match))
	3987	break;
	3988	}
	3989	r = *rm;
	3990	a = *am;
	3991	ruleset = *rsm;
	3992
	3993	REASON_SET(&reason, PFRES_MATCH);
	3994
	3995	if (r->log \|\| (nr != NULL && nr->log)) {
	3996	if (rewrite)
	3997	m_copyback(m, off, hdrlen, pd->hdr.any);
	3998	PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
	3999	a, ruleset, pd);
	4000	}
	4001
	4002	if ((r->action == PF_DROP) &&
	4003	((r->rule_flag & PFRULE_RETURNRST) \|\|
	4004	(r->rule_flag & PFRULE_RETURNICMP) \|\|
	4005	(r->rule_flag & PFRULE_RETURN))) {
	4006	/* undo NAT changes, if they have taken place */
	4007	if (nr != NULL) {
	4008	PF_ACPY(saddr, &sk->addr[pd->sidx], af);
	4009	PF_ACPY(daddr, &sk->addr[pd->didx], af);
	4010	if (pd->sport)
	4011	*pd->sport = sk->port[pd->sidx];
	4012	if (pd->dport)
	4013	*pd->dport = sk->port[pd->didx];
	4014	if (pd->proto_sum)
	4015	*pd->proto_sum = bproto_sum;
	4016	if (pd->ip_sum)
	4017	*pd->ip_sum = bip_sum;
	4018	m_copyback(m, off, hdrlen, pd->hdr.any);
	4019	}
	4020	if (pd->proto == IPPROTO_TCP &&
	4021	((r->rule_flag & PFRULE_RETURNRST) \|\|
	4022	(r->rule_flag & PFRULE_RETURN)) &&
	4023	!(th->th_flags & TH_RST)) {
	4024	u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
	4025	int len = 0;
	4026	struct ip *h4;
	4027	#ifdef INET6
	4028	struct ip6_hdr *h6;
	4029	#endif
	4030	switch (af) {
	4031	case AF_INET:
	4032	h4 = mtod(m, struct ip *);
	4033	len = ntohs(h4->ip_len) - off;
	4034	break;
	4035	#ifdef INET6
	4036	case AF_INET6:
	4037	h6 = mtod(m, struct ip6_hdr *);
	4038	len = h6->ip6_plen - (off - sizeof(*h6));
	4039	break;
	4040	#endif
	4041	}
	4042
	4043	if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
	4044	REASON_SET(&reason, PFRES_PROTCKSUM);
	4045	else {
	4046	if (th->th_flags & TH_SYN)
	4047	ack++;
	4048	if (th->th_flags & TH_FIN)
	4049	ack++;
	4050	pf_send_tcp(r, af, pd->dst,
	4051	pd->src, th->th_dport, th->th_sport,
	4052	ntohl(th->th_ack), ack, TH_RST\|TH_ACK, 0, 0,
	4053	r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
	4054	}
	4055	} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
	4056	r->return_icmp)
	4057	pf_send_icmp(m, r->return_icmp >> 8,
	4058	r->return_icmp & 255, af, r);
	4059	else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
	4060	r->return_icmp6)
	4061	pf_send_icmp(m, r->return_icmp6 >> 8,
	4062	r->return_icmp6 & 255, af, r);
	4063	}
	4064
	4065	if (r->action == PF_DROP)
	4066	goto cleanup;
	4067
	4068	if (pf_tag_packet(m, tag, rtableid)) {
	4069	REASON_SET(&reason, PFRES_MEMORY);
	4070	goto cleanup;
	4071	}
	4072
	4073	if (!state_icmp && (r->keep_state \|\| nr != NULL \|\|
	4074	(pd->flags & PFDESC_TCP_NORM))) {
	4075	int action;
	4076	action = pf_create_state(r, nr, a, pd, nsn, skw, sks, nk, sk, m,
	4077	off, sport, dport, &rewrite, kif, sm, tag, bproto_sum,
	4078	bip_sum, hdrlen);
	4079	if (action != PF_PASS)
	4080	return (action);
	4081	}
	4082
	4083	/* copy back packet headers if we performed NAT operations */
	4084	if (rewrite)
	4085	m_copyback(m, off, hdrlen, pd->hdr.any);
	4086
	4087	return (PF_PASS);
	4088
	4089	cleanup:
	4090	if (sk != NULL)
	4091	kfree(sk, M_PFSTATEKEYPL);
	4092	if (nk != NULL)
	4093	kfree(nk, M_PFSTATEKEYPL);
	4094	return (PF_DROP);
	4095	}
	4096
	4097	static __inline int
	4098	pf_create_state(struct pf_rule r, struct pf_rule nr, struct pf_rule *a,
	4099	struct pf_pdesc pd, struct pf_src_node nsn, struct pf_state_key *skw,
	4100	struct pf_state_key sks, struct pf_state_key nk, struct pf_state_key *sk,
	4101	struct mbuf m, int off, u_int16_t sport, u_int16_t dport, int rewrite,
	4102	struct pfi_kif kif, struct pf_state *sm, int tag, u_int16_t bproto_sum,
	4103	u_int16_t bip_sum, int hdrlen)
	4104	{
	4105	struct pf_state *s = NULL;
	4106	struct pf_src_node *sn = NULL;
	4107	struct tcphdr *th = pd->hdr.tcp;
	4108	u_int16_t mss = tcp_mssdflt;
	4109	u_short reason;
	4110	int cpu = mycpu->gd_cpuid;
	4111
	4112	/* check maximums */
	4113	if (r->max_states && (r->states_cur >= r->max_states)) {
	4114	PF_INC_LCOUNTER(LCNT_STATES);
	4115	REASON_SET(&reason, PFRES_MAXSTATES);
	4116	return (PF_DROP);
	4117	}
	4118	/* src node for filter rule */
	4119	if ((r->rule_flag & PFRULE_SRCTRACK \|\|
	4120	r->rpool.opts & PF_POOL_STICKYADDR) &&
	4121	pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
	4122	REASON_SET(&reason, PFRES_SRCLIMIT);
	4123	goto csfailed;
	4124	}
	4125	/* src node for translation rule */
	4126	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
	4127	pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
	4128	REASON_SET(&reason, PFRES_SRCLIMIT);
	4129	goto csfailed;
	4130	}
	4131	s = kmalloc(sizeof(struct pf_state), M_PFSTATEPL, M_NOWAIT\|M_ZERO);
	4132	if (s == NULL) {
	4133	REASON_SET(&reason, PFRES_MEMORY);
	4134	goto csfailed;
	4135	}
	4136	lockinit(&s->lk, "pfstlk", 0, 0);
	4137	s->id = 0; /* XXX Do we really need that? not in OpenBSD */
	4138	s->creatorid = 0;
	4139	s->rule.ptr = r;
	4140	s->nat_rule.ptr = nr;
	4141	s->anchor.ptr = a;
	4142	s->state_flags = PFSTATE_CREATEINPROG;
	4143	STATE_INC_COUNTERS(s);
	4144	if (r->allow_opts)
	4145	s->state_flags \|= PFSTATE_ALLOWOPTS;
	4146	if (r->rule_flag & PFRULE_STATESLOPPY)
	4147	s->state_flags \|= PFSTATE_SLOPPY;
	4148	if (pd->not_cpu_localized)
	4149	s->state_flags \|= PFSTATE_STACK_GLOBAL;
	4150
	4151	s->log = r->log & PF_LOG_ALL;
	4152	if (nr != NULL)
	4153	s->log \|= nr->log & PF_LOG_ALL;
	4154	switch (pd->proto) {
	4155	case IPPROTO_TCP:
	4156	s->src.seqlo = ntohl(th->th_seq);
	4157	s->src.seqhi = s->src.seqlo + pd->p_len + 1;
	4158	if ((th->th_flags & (TH_SYN\|TH_ACK)) == TH_SYN &&
	4159	r->keep_state == PF_STATE_MODULATE) {
	4160	/* Generate sequence number modulator */
	4161	if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
	4162	0)
	4163	s->src.seqdiff = 1;
	4164	pf_change_a(&th->th_seq, &th->th_sum,
	4165	htonl(s->src.seqlo + s->src.seqdiff), 0);
	4166	*rewrite = 1;
	4167	} else
	4168	s->src.seqdiff = 0;
	4169	if (th->th_flags & TH_SYN) {
	4170	s->src.seqhi++;
	4171	s->src.wscale = pf_get_wscale(m, off,
	4172	th->th_off, pd->af);
	4173	}
	4174	s->src.max_win = MAX(ntohs(th->th_win), 1);
	4175	if (s->src.wscale & PF_WSCALE_MASK) {
	4176	/* Remove scale factor from initial window */
	4177	int win = s->src.max_win;
	4178	win += 1 << (s->src.wscale & PF_WSCALE_MASK);
	4179	s->src.max_win = (win - 1) >>
	4180	(s->src.wscale & PF_WSCALE_MASK);
	4181	}
	4182	if (th->th_flags & TH_FIN)
	4183	s->src.seqhi++;
	4184	s->dst.seqhi = 1;
	4185	s->dst.max_win = 1;
	4186	s->src.state = TCPS_SYN_SENT;
	4187	s->dst.state = TCPS_CLOSED;
	4188	s->timeout = PFTM_TCP_FIRST_PACKET;
	4189	break;
	4190	case IPPROTO_UDP:
	4191	s->src.state = PFUDPS_SINGLE;
	4192	s->dst.state = PFUDPS_NO_TRAFFIC;
	4193	s->timeout = PFTM_UDP_FIRST_PACKET;
	4194	break;
	4195	case IPPROTO_ICMP:
	4196	#ifdef INET6
	4197	case IPPROTO_ICMPV6:
	4198	#endif
	4199	s->timeout = PFTM_ICMP_FIRST_PACKET;
	4200	break;
	4201	default:
	4202	s->src.state = PFOTHERS_SINGLE;
	4203	s->dst.state = PFOTHERS_NO_TRAFFIC;
	4204	s->timeout = PFTM_OTHER_FIRST_PACKET;
	4205	}
	4206
	4207	s->creation = time_second;
	4208	s->expire = time_second;
	4209
	4210	if (sn != NULL) {
	4211	s->src_node = sn;
	4212	s->src_node->states++;
	4213	}
	4214	if (nsn != NULL) {
	4215	/* XXX We only modify one side for now. */
	4216	PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
	4217	s->nat_src_node = nsn;
	4218	s->nat_src_node->states++;
	4219	}
	4220	if (pd->proto == IPPROTO_TCP) {
	4221	if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
	4222	off, pd, th, &s->src, &s->dst)) {
	4223	REASON_SET(&reason, PFRES_MEMORY);
	4224	pf_src_tree_remove_state(s);
	4225	STATE_DEC_COUNTERS(s);
	4226	kfree(s, M_PFSTATEPL);
	4227	return (PF_DROP);
	4228	}
	4229	if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
	4230	pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
	4231	&s->src, &s->dst, rewrite)) {
	4232	/* This really shouldn't happen!!! */
	4233	DPFPRINTF(PF_DEBUG_URGENT,
	4234	("pf_normalize_tcp_stateful failed on first pkt"));
	4235	pf_normalize_tcp_cleanup(s);
	4236	pf_src_tree_remove_state(s);
	4237	STATE_DEC_COUNTERS(s);
	4238	kfree(s, M_PFSTATEPL);
	4239	return (PF_DROP);
	4240	}
	4241	}
	4242	s->direction = pd->dir;
	4243
	4244	if (sk == NULL && pf_state_key_setup(pd, nr, &skw, &sks, &sk, &nk,
	4245	pd->src, pd->dst, sport, dport)) {
	4246	REASON_SET(&reason, PFRES_MEMORY);
	4247	goto csfailed;
	4248	}
	4249
	4250	if (pf_state_insert(BOUND_IFACE(r, kif), skw, sks, s)) {
	4251	if (pd->proto == IPPROTO_TCP)
	4252	pf_normalize_tcp_cleanup(s);
	4253	REASON_SET(&reason, PFRES_STATEINS);
	4254	pf_src_tree_remove_state(s);
	4255	STATE_DEC_COUNTERS(s);
	4256	kfree(s, M_PFSTATEPL);
	4257	return (PF_DROP);
	4258	} else
	4259	*sm = s;
	4260
	4261	pf_set_rt_ifp(s, pd->src); /* needs s->state_key set */
	4262	if (tag > 0) {
	4263	pf_tag_ref(tag);
	4264	s->tag = tag;
	4265	}
	4266	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN\|TH_ACK)) ==
	4267	TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
	4268	s->src.state = PF_TCPS_PROXY_SRC;
	4269	/* undo NAT changes, if they have taken place */
	4270	if (nr != NULL) {
	4271	struct pf_state_key *skt = s->key[PF_SK_WIRE];
	4272	if (pd->dir == PF_OUT)
	4273	skt = s->key[PF_SK_STACK];
	4274	PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
	4275	PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
	4276	if (pd->sport)
	4277	*pd->sport = skt->port[pd->sidx];
	4278	if (pd->dport)
	4279	*pd->dport = skt->port[pd->didx];
	4280	if (pd->proto_sum)
	4281	*pd->proto_sum = bproto_sum;
	4282	if (pd->ip_sum)
	4283	*pd->ip_sum = bip_sum;
	4284	m->m_flags &= ~M_HASH;
	4285	m_copyback(m, off, hdrlen, pd->hdr.any);
	4286	}
	4287	s->src.seqhi = htonl(karc4random());
	4288	/* Find mss option */
	4289	mss = pf_get_mss(m, off, th->th_off, pd->af);
	4290	mss = pf_calc_mss(pd->src, pd->af, mss);
	4291	mss = pf_calc_mss(pd->dst, pd->af, mss);
	4292	s->src.mss = mss;
	4293	s->state_flags &= ~PFSTATE_CREATEINPROG;
	4294	pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
	4295	th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
	4296	TH_SYN\|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
	4297	REASON_SET(&reason, PFRES_SYNPROXY);
	4298	return (PF_SYNPROXY_DROP);
	4299	}
	4300
	4301	s->state_flags &= ~PFSTATE_CREATEINPROG;
	4302	return (PF_PASS);
	4303
	4304	csfailed:
	4305	if (sk != NULL)
	4306	kfree(sk, M_PFSTATEKEYPL);
	4307	if (nk != NULL)
	4308	kfree(nk, M_PFSTATEKEYPL);
	4309
	4310	if (sn != NULL && sn->states == 0 && sn->expire == 0) {
	4311	RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], sn);
	4312	PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
	4313	atomic_add_int(&pf_status.src_nodes, -1);
	4314	kfree(sn, M_PFSRCTREEPL);
	4315	}
	4316	if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
	4317	RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], nsn);
	4318	PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
	4319	atomic_add_int(&pf_status.src_nodes, -1);
	4320	kfree(nsn, M_PFSRCTREEPL);
	4321	}
	4322	if (s) {
	4323	pf_src_tree_remove_state(s);
	4324	STATE_DEC_COUNTERS(s);
	4325	kfree(s, M_PFSTATEPL);
	4326	}
	4327
	4328	return (PF_DROP);
	4329	}
	4330
	4331	int
	4332	pf_test_fragment(struct pf_rule *rm, int direction, struct pfi_kif kif,
	4333	struct mbuf m, void h, struct pf_pdesc pd, struct pf_rule *am,
	4334	struct pf_ruleset **rsm)
	4335	{
	4336	struct pf_rule r, a = NULL;
	4337	struct pf_ruleset *ruleset = NULL;
	4338	sa_family_t af = pd->af;
	4339	u_short reason;
	4340	int tag = -1;
	4341	int asd = 0;
	4342	int match = 0;
	4343
	4344	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
	4345	while (r != NULL) {
	4346	r->evaluations++;
	4347	if (pfi_kif_match(r->kif, kif) == r->ifnot)
	4348	r = r->skip[PF_SKIP_IFP].ptr;
	4349	else if (r->direction && r->direction != direction)
	4350	r = r->skip[PF_SKIP_DIR].ptr;
	4351	else if (r->af && r->af != af)
	4352	r = r->skip[PF_SKIP_AF].ptr;
	4353	else if (r->proto && r->proto != pd->proto)
	4354	r = r->skip[PF_SKIP_PROTO].ptr;
	4355	else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
	4356	r->src.neg, kif))
	4357	r = r->skip[PF_SKIP_SRC_ADDR].ptr;
	4358	else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
	4359	r->dst.neg, NULL))
	4360	r = r->skip[PF_SKIP_DST_ADDR].ptr;
	4361	else if (r->tos && !(r->tos == pd->tos))
	4362	r = TAILQ_NEXT(r, entries);
	4363	else if (r->os_fingerprint != PF_OSFP_ANY)
	4364	r = TAILQ_NEXT(r, entries);
	4365	else if (pd->proto == IPPROTO_UDP &&
	4366	(r->src.port_op \|\| r->dst.port_op))
	4367	r = TAILQ_NEXT(r, entries);
	4368	else if (pd->proto == IPPROTO_TCP &&
	4369	(r->src.port_op \|\| r->dst.port_op \|\| r->flagset))
	4370	r = TAILQ_NEXT(r, entries);
	4371	else if ((pd->proto == IPPROTO_ICMP \|\|
	4372	pd->proto == IPPROTO_ICMPV6) &&
	4373	(r->type \|\| r->code))
	4374	r = TAILQ_NEXT(r, entries);
	4375	else if (r->prob && r->prob <= karc4random())
	4376	r = TAILQ_NEXT(r, entries);
	4377	else if (r->match_tag && !pf_match_tag(m, r, &tag))
	4378	r = TAILQ_NEXT(r, entries);
	4379	else {
	4380	if (r->anchor == NULL) {
	4381	match = 1;
	4382	*rm = r;
	4383	*am = a;
	4384	*rsm = ruleset;
	4385	if ((*rm)->quick)
	4386	break;
	4387	r = TAILQ_NEXT(r, entries);
	4388	} else
	4389	pf_step_into_anchor(&asd, &ruleset,
	4390	PF_RULESET_FILTER, &r, &a, &match);
	4391	}
	4392	if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
	4393	PF_RULESET_FILTER, &r, &a, &match))
	4394	break;
	4395	}
	4396	r = *rm;
	4397	a = *am;
	4398	ruleset = *rsm;
	4399
	4400	REASON_SET(&reason, PFRES_MATCH);
	4401
	4402	if (r->log)
	4403	PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset,
	4404	pd);
	4405
	4406	if (r->action != PF_PASS)
	4407	return (PF_DROP);
	4408
	4409	if (pf_tag_packet(m, tag, -1)) {
	4410	REASON_SET(&reason, PFRES_MEMORY);
	4411	return (PF_DROP);
	4412	}
	4413
	4414	return (PF_PASS);
	4415	}
	4416
	4417	/*
	4418	* Called with state locked
	4419	*/
	4420	int
	4421	pf_tcp_track_full(struct pf_state_peer src, struct pf_state_peer dst,
	4422	struct pf_state *state, struct pfi_kif kif, struct mbuf *m, int off,
	4423	struct pf_pdesc pd, u_short reason, int *copyback)
	4424	{
	4425	struct tcphdr *th = pd->hdr.tcp;
	4426	u_int16_t win = ntohs(th->th_win);
	4427	u_int32_t ack, end, seq, orig_seq;
	4428	u_int8_t sws, dws;
	4429	int ackskew;
	4430
	4431	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
	4432	sws = src->wscale & PF_WSCALE_MASK;
	4433	dws = dst->wscale & PF_WSCALE_MASK;
	4434	} else {
	4435	sws = dws = 0;
	4436	}
	4437
	4438	/*
	4439	* Sequence tracking algorithm from Guido van Rooij's paper:
	4440	* http://www.madison-gurkha.com/publications/tcp_filtering/
	4441	* tcp_filtering.ps
	4442	*/
	4443
	4444	orig_seq = seq = ntohl(th->th_seq);
	4445	if (src->seqlo == 0) {
	4446	/* First packet from this end. Set its state */
	4447
	4448	if ((pd->flags & PFDESC_TCP_NORM \|\| dst->scrub) &&
	4449	src->scrub == NULL) {
	4450	if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
	4451	REASON_SET(reason, PFRES_MEMORY);
	4452	return (PF_DROP);
	4453	}
	4454	}
	4455
	4456	/* Deferred generation of sequence number modulator */
	4457	if (dst->seqdiff && !src->seqdiff) {
	4458	/* use random iss for the TCP server */
	4459	while ((src->seqdiff = karc4random() - seq) == 0)
	4460	;
	4461	ack = ntohl(th->th_ack) - dst->seqdiff;
	4462	pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
	4463	src->seqdiff), 0);
	4464	pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
	4465	*copyback = 1;
	4466	} else {
	4467	ack = ntohl(th->th_ack);
	4468	}
	4469
	4470	end = seq + pd->p_len;
	4471	if (th->th_flags & TH_SYN) {
	4472	end++;
	4473	(*state)->sync_flags \|= PFSTATE_GOT_SYN2;
	4474	if (dst->wscale & PF_WSCALE_FLAG) {
	4475	src->wscale = pf_get_wscale(m, off, th->th_off,
	4476	pd->af);
	4477	if (src->wscale & PF_WSCALE_FLAG) {
	4478	/* Remove scale factor from initial
	4479	* window */
	4480	sws = src->wscale & PF_WSCALE_MASK;
	4481	win = ((u_int32_t)win + (1 << sws) - 1)
	4482	>> sws;
	4483	dws = dst->wscale & PF_WSCALE_MASK;
	4484	} else {
	4485	/* fixup other window */
	4486	dst->max_win <<= dst->wscale &
	4487	PF_WSCALE_MASK;
	4488	/* in case of a retrans SYN\|ACK */
	4489	dst->wscale = 0;
	4490	}
	4491	}
	4492	}
	4493	if (th->th_flags & TH_FIN)
	4494	end++;
	4495
	4496	src->seqlo = seq;
	4497	if (src->state < TCPS_SYN_SENT)
	4498	src->state = TCPS_SYN_SENT;
	4499
	4500	/*
	4501	* May need to slide the window (seqhi may have been set by
	4502	* the crappy stack check or if we picked up the connection
	4503	* after establishment)
	4504	*/
	4505	if (src->seqhi == 1 \|\|
	4506	SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
	4507	src->seqhi = end + MAX(1, dst->max_win << dws);
	4508	if (win > src->max_win)
	4509	src->max_win = win;
	4510
	4511	} else {
	4512	ack = ntohl(th->th_ack) - dst->seqdiff;
	4513	if (src->seqdiff) {
	4514	/* Modulate sequence numbers */
	4515	pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
	4516	src->seqdiff), 0);
	4517	pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
	4518	*copyback = 1;
	4519	}
	4520	end = seq + pd->p_len;
	4521	if (th->th_flags & TH_SYN)
	4522	end++;
	4523	if (th->th_flags & TH_FIN)
	4524	end++;
	4525	}
	4526
	4527	if ((th->th_flags & TH_ACK) == 0) {
	4528	/* Let it pass through the ack skew check */
	4529	ack = dst->seqlo;
	4530	} else if ((ack == 0 &&
	4531	(th->th_flags & (TH_ACK\|TH_RST)) == (TH_ACK\|TH_RST)) \|\|
	4532	/* broken tcp stacks do not set ack */
	4533	(dst->state < TCPS_SYN_SENT)) {
	4534	/*
	4535	* Many stacks (ours included) will set the ACK number in an
	4536	* FIN\|ACK if the SYN times out -- no sequence to ACK.
	4537	*/
	4538	ack = dst->seqlo;
	4539	}
	4540
	4541	if (seq == end) {
	4542	/* Ease sequencing restrictions on no data packets */
	4543	seq = src->seqlo;
	4544	end = seq;
	4545	}
	4546
	4547	ackskew = dst->seqlo - ack;
	4548
	4549
	4550	/*
	4551	* Need to demodulate the sequence numbers in any TCP SACK options
	4552	* (Selective ACK). We could optionally validate the SACK values
	4553	* against the current ACK window, either forwards or backwards, but
	4554	* I'm not confident that SACK has been implemented properly
	4555	* everywhere. It wouldn't surprise me if several stacks accidently
	4556	* SACK too far backwards of previously ACKed data. There really aren't
	4557	* any security implications of bad SACKing unless the target stack
	4558	* doesn't validate the option length correctly. Someone trying to
	4559	* spoof into a TCP connection won't bother blindly sending SACK
	4560	* options anyway.
	4561	*/
	4562	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
	4563	if (pf_modulate_sack(m, off, pd, th, dst))
	4564	*copyback = 1;
	4565	}
	4566
	4567
	4568	#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
	4569	if (SEQ_GEQ(src->seqhi, end) &&
	4570	/* Last octet inside other's window space */
	4571	SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
	4572	/* Retrans: not more than one window back */
	4573	(ackskew >= -MAXACKWINDOW) &&
	4574	/* Acking not more than one reassembled fragment backwards */
	4575	(ackskew <= (MAXACKWINDOW << sws)) &&
	4576	/* Acking not more than one window forward */
	4577	((th->th_flags & TH_RST) == 0 \|\| orig_seq == src->seqlo \|\|
	4578	(orig_seq == src->seqlo + 1) \|\| (orig_seq + 1 == src->seqlo)))
	4579	{
	4580	/*
	4581	* Require an exact/+1 sequence match on resets
	4582	* when possible
	4583	*/
	4584	if (dst->scrub \|\| src->scrub) {
	4585	if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
	4586	*state, src, dst, copyback))
	4587	return (PF_DROP);
	4588	}
	4589
	4590	/* update max window */
	4591	if (src->max_win < win)
	4592	src->max_win = win;
	4593	/* synchronize sequencing */
	4594	if (SEQ_GT(end, src->seqlo))
	4595	src->seqlo = end;
	4596	/* slide the window of what the other end can send */
	4597	if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
	4598	dst->seqhi = ack + MAX((win << sws), 1);
	4599
	4600
	4601	/* update states */
	4602	if (th->th_flags & TH_SYN)
	4603	if (src->state < TCPS_SYN_SENT)
	4604	src->state = TCPS_SYN_SENT;
	4605	if (th->th_flags & TH_FIN)
	4606	if (src->state < TCPS_CLOSING)
	4607	src->state = TCPS_CLOSING;
	4608	if (th->th_flags & TH_ACK) {
	4609	if (dst->state == TCPS_SYN_SENT) {
	4610	dst->state = TCPS_ESTABLISHED;
	4611	if (src->state == TCPS_ESTABLISHED &&
	4612	(*state)->src_node != NULL &&
	4613	pf_src_connlimit(*state)) {
	4614	REASON_SET(reason, PFRES_SRCLIMIT);
	4615	return (PF_DROP);
	4616	}
	4617	} else if (dst->state == TCPS_CLOSING)
	4618	dst->state = TCPS_FIN_WAIT_2;
	4619	}
	4620	if (th->th_flags & TH_RST)
	4621	src->state = dst->state = TCPS_TIME_WAIT;
	4622
	4623	/* update expire time */
	4624	(*state)->expire = time_second;
	4625	if (src->state >= TCPS_FIN_WAIT_2 &&
	4626	dst->state >= TCPS_FIN_WAIT_2)
	4627	(*state)->timeout = PFTM_TCP_CLOSED;
	4628	else if (src->state >= TCPS_CLOSING &&
	4629	dst->state >= TCPS_CLOSING)
	4630	(*state)->timeout = PFTM_TCP_FIN_WAIT;
	4631	else if (src->state < TCPS_ESTABLISHED \|\|
	4632	dst->state < TCPS_ESTABLISHED)
	4633	(*state)->timeout = PFTM_TCP_OPENING;
	4634	else if (src->state >= TCPS_CLOSING \|\|
	4635	dst->state >= TCPS_CLOSING)
	4636	(*state)->timeout = PFTM_TCP_CLOSING;
	4637	else if ((th->th_flags & TH_SYN) &&
	4638	((*state)->state_flags & PFSTATE_SLOPPY))
	4639	(*state)->timeout = PFTM_TCP_FIRST_PACKET;
	4640	else
	4641	(*state)->timeout = PFTM_TCP_ESTABLISHED;
	4642
	4643	/* Fall through to PASS packet */
	4644
	4645	} else if ((dst->state < TCPS_SYN_SENT \|\|
	4646	dst->state >= TCPS_FIN_WAIT_2 \|\|
	4647	src->state >= TCPS_FIN_WAIT_2) &&
	4648	SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
	4649	/* Within a window forward of the originating packet */
	4650	SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
	4651	/* Within a window backward of the originating packet */
	4652
	4653	/*
	4654	* This currently handles three situations:
	4655	* 1) Stupid stacks will shotgun SYNs before their peer
	4656	* replies.
	4657	* 2) When PF catches an already established stream (the
	4658	* firewall rebooted, the state table was flushed, routes
	4659	* changed...)
	4660	* 3) Packets get funky immediately after the connection
	4661	* closes (this should catch Solaris spurious ACK\|FINs
	4662	* that web servers like to spew after a close)
	4663	*
	4664	* This must be a little more careful than the above code
	4665	* since packet floods will also be caught here. We don't
	4666	* update the TTL here to mitigate the damage of a packet
	4667	* flood and so the same code can handle awkward establishment
	4668	* and a loosened connection close.
	4669	* In the establishment case, a correct peer response will
	4670	* validate the connection, go through the normal state code
	4671	* and keep updating the state TTL.
	4672	*/
	4673
	4674	if (pf_status.debug >= PF_DEBUG_MISC) {
	4675	kprintf("pf: loose state match: ");
	4676	pf_print_state(*state);
	4677	pf_print_flags(th->th_flags);
	4678	kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
	4679	"pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len,
	4680	ackskew, (unsigned long long)(*state)->packets[0],
	4681	(unsigned long long)(*state)->packets[1],
	4682	pd->dir == PF_IN ? "in" : "out",
	4683	pd->dir == (*state)->direction ? "fwd" : "rev");
	4684	}
	4685
	4686	if (dst->scrub \|\| src->scrub) {
	4687	if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
	4688	*state, src, dst, copyback))
	4689	return (PF_DROP);
	4690	}
	4691
	4692	/* update max window */
	4693	if (src->max_win < win)
	4694	src->max_win = win;
	4695	/* synchronize sequencing */
	4696	if (SEQ_GT(end, src->seqlo))
	4697	src->seqlo = end;
	4698	/* slide the window of what the other end can send */
	4699	if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
	4700	dst->seqhi = ack + MAX((win << sws), 1);
	4701
	4702	/*
	4703	* Cannot set dst->seqhi here since this could be a shotgunned
	4704	* SYN and not an already established connection.
	4705	*/
	4706
	4707	if (th->th_flags & TH_FIN)
	4708	if (src->state < TCPS_CLOSING)
	4709	src->state = TCPS_CLOSING;
	4710	if (th->th_flags & TH_RST)
	4711	src->state = dst->state = TCPS_TIME_WAIT;
	4712
	4713	/* Fall through to PASS packet */
	4714
	4715	} else if ((*state)->pickup_mode == PF_PICKUPS_HASHONLY \|\|
	4716	((*state)->pickup_mode == PF_PICKUPS_ENABLED &&
	4717	((*state)->sync_flags & PFSTATE_GOT_SYN_MASK) !=
	4718	PFSTATE_GOT_SYN_MASK)) {
	4719	/*
	4720	* If pickup mode is hash only, do not fail on sequence checks.
	4721	*
	4722	* If pickup mode is enabled and we did not see the SYN in
	4723	* both direction, do not fail on sequence checks because
	4724	* we do not have complete information on window scale.
	4725	*
	4726	* Adjust expiration and fall through to PASS packet.
	4727	* XXX Add a FIN check to reduce timeout?
	4728	*/
	4729	(*state)->expire = time_second;
	4730	} else {
	4731	/*
	4732	* Failure processing
	4733	*/
	4734	if ((*state)->dst.state == TCPS_SYN_SENT &&
	4735	(*state)->src.state == TCPS_SYN_SENT) {
	4736	/* Send RST for state mismatches during handshake */
	4737	if (!(th->th_flags & TH_RST))
	4738	pf_send_tcp((*state)->rule.ptr, pd->af,
	4739	pd->dst, pd->src, th->th_dport,
	4740	th->th_sport, ntohl(th->th_ack), 0,
	4741	TH_RST, 0, 0,
	4742	(*state)->rule.ptr->return_ttl, 1, 0,
	4743	pd->eh, kif->pfik_ifp);
	4744	src->seqlo = 0;
	4745	src->seqhi = 1;
	4746	src->max_win = 1;
	4747	} else if (pf_status.debug >= PF_DEBUG_MISC) {
	4748	kprintf("pf: BAD state: ");
	4749	pf_print_state(*state);
	4750	pf_print_flags(th->th_flags);
	4751	kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
	4752	"pkts=%llu:%llu dir=%s,%s\n",
	4753	seq, orig_seq, ack, pd->p_len, ackskew,
	4754	(unsigned long long)(*state)->packets[0],
	4755	(unsigned long long)(*state)->packets[1],
	4756	pd->dir == PF_IN ? "in" : "out",
	4757	pd->dir == (*state)->direction ? "fwd" : "rev");
	4758	kprintf("pf: State failure on: %c %c %c %c \| %c %c\n",
	4759	SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
	4760	SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
	4761	' ': '2',
	4762	(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
	4763	(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
	4764	SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
	4765	SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
	4766	}
	4767	REASON_SET(reason, PFRES_BADSTATE);
	4768	return (PF_DROP);
	4769	}
	4770
	4771	return (PF_PASS);
	4772	}
	4773
	4774	/*
	4775	* Called with state locked
	4776	*/
	4777	int
	4778	pf_tcp_track_sloppy(struct pf_state_peer src, struct pf_state_peer dst,
	4779	struct pf_state *state, struct pf_pdesc pd, u_short *reason)
	4780	{
	4781	struct tcphdr *th = pd->hdr.tcp;
	4782
	4783	if (th->th_flags & TH_SYN)
	4784	if (src->state < TCPS_SYN_SENT)
	4785	src->state = TCPS_SYN_SENT;
	4786	if (th->th_flags & TH_FIN)
	4787	if (src->state < TCPS_CLOSING)
	4788	src->state = TCPS_CLOSING;
	4789	if (th->th_flags & TH_ACK) {
	4790	if (dst->state == TCPS_SYN_SENT) {
	4791	dst->state = TCPS_ESTABLISHED;
	4792	if (src->state == TCPS_ESTABLISHED &&
	4793	(*state)->src_node != NULL &&
	4794	pf_src_connlimit(*state)) {
	4795	REASON_SET(reason, PFRES_SRCLIMIT);
	4796	return (PF_DROP);
	4797	}
	4798	} else if (dst->state == TCPS_CLOSING) {
	4799	dst->state = TCPS_FIN_WAIT_2;
	4800	} else if (src->state == TCPS_SYN_SENT &&
	4801	dst->state < TCPS_SYN_SENT) {
	4802	/*
	4803	* Handle a special sloppy case where we only see one
	4804	* half of the connection. If there is a ACK after
	4805	* the initial SYN without ever seeing a packet from
	4806	* the destination, set the connection to established.
	4807	*/
	4808	dst->state = src->state = TCPS_ESTABLISHED;
	4809	if ((*state)->src_node != NULL &&
	4810	pf_src_connlimit(*state)) {
	4811	REASON_SET(reason, PFRES_SRCLIMIT);
	4812	return (PF_DROP);
	4813	}
	4814	} else if (src->state == TCPS_CLOSING &&
	4815	dst->state == TCPS_ESTABLISHED &&
	4816	dst->seqlo == 0) {
	4817	/*
	4818	* Handle the closing of half connections where we
	4819	* don't see the full bidirectional FIN/ACK+ACK
	4820	* handshake.
	4821	*/
	4822	dst->state = TCPS_CLOSING;
	4823	}
	4824	}
	4825	if (th->th_flags & TH_RST)
	4826	src->state = dst->state = TCPS_TIME_WAIT;
	4827
	4828	/* update expire time */
	4829	(*state)->expire = time_second;
	4830	if (src->state >= TCPS_FIN_WAIT_2 &&
	4831	dst->state >= TCPS_FIN_WAIT_2)
	4832	(*state)->timeout = PFTM_TCP_CLOSED;
	4833	else if (src->state >= TCPS_CLOSING &&
	4834	dst->state >= TCPS_CLOSING)
	4835	(*state)->timeout = PFTM_TCP_FIN_WAIT;
	4836	else if (src->state < TCPS_ESTABLISHED \|\|
	4837	dst->state < TCPS_ESTABLISHED)
	4838	(*state)->timeout = PFTM_TCP_OPENING;
	4839	else if (src->state >= TCPS_CLOSING \|\|
	4840	dst->state >= TCPS_CLOSING)
	4841	(*state)->timeout = PFTM_TCP_CLOSING;
	4842	else if ((th->th_flags & TH_SYN) &&
	4843	((*state)->state_flags & PFSTATE_SLOPPY))
	4844	(*state)->timeout = PFTM_TCP_FIRST_PACKET;
	4845	else
	4846	(*state)->timeout = PFTM_TCP_ESTABLISHED;
	4847
	4848	return (PF_PASS);
	4849	}
	4850
	4851	/*
	4852	* Test TCP connection state. Caller must hold the state locked.
	4853	*/
	4854	int
	4855	pf_test_state_tcp(struct pf_state *state, int direction, struct pfi_kif kif,
	4856	struct mbuf m, int off, void h, struct pf_pdesc *pd,
	4857	u_short *reason)
	4858	{
	4859	struct pf_state_key_cmp key;
	4860	struct tcphdr *th = pd->hdr.tcp;
	4861	int copyback = 0;
	4862	int error;
	4863	struct pf_state_peer src, dst;
	4864	struct pf_state_key *sk;
	4865
	4866	bzero(&key, sizeof(key));
	4867	key.af = pd->af;
	4868	key.proto = IPPROTO_TCP;
	4869	if (direction == PF_IN) { /* wire side, straight */
	4870	PF_ACPY(&key.addr[0], pd->src, key.af);
	4871	PF_ACPY(&key.addr[1], pd->dst, key.af);
	4872	key.port[0] = th->th_sport;
	4873	key.port[1] = th->th_dport;
	4874	if (pf_status.debug >= PF_DEBUG_MISC) {
	4875	kprintf("test-tcp IN (%08x:%d) -> (%08x:%d)\n",
	4876	ntohl(key.addr[0].addr32[0]),
	4877	ntohs(key.port[0]),
	4878	ntohl(key.addr[1].addr32[0]),
	4879	ntohs(key.port[1]));
	4880	}
	4881	} else { /* stack side, reverse */
	4882	PF_ACPY(&key.addr[1], pd->src, key.af);
	4883	PF_ACPY(&key.addr[0], pd->dst, key.af);
	4884	key.port[1] = th->th_sport;
	4885	key.port[0] = th->th_dport;
	4886	if (pf_status.debug >= PF_DEBUG_MISC) {
	4887	kprintf("test-tcp OUT (%08x:%d) <- (%08x:%d)\n",
	4888	ntohl(key.addr[0].addr32[0]),
	4889	ntohs(key.port[0]),
	4890	ntohl(key.addr[1].addr32[0]),
	4891	ntohs(key.port[1]));
	4892	}
	4893	}
	4894
	4895	STATE_LOOKUP(kif, &key, direction, *state, m);
	4896	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	4897
	4898	if (direction == (*state)->direction) {
	4899	src = &(*state)->src;
	4900	dst = &(*state)->dst;
	4901	} else {
	4902	src = &(*state)->dst;
	4903	dst = &(*state)->src;
	4904	}
	4905
	4906	sk = (*state)->key[pd->didx];
	4907
	4908	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
	4909	if (direction != (*state)->direction) {
	4910	REASON_SET(reason, PFRES_SYNPROXY);
	4911	FAIL (PF_SYNPROXY_DROP);
	4912	}
	4913	if (th->th_flags & TH_SYN) {
	4914	if (ntohl(th->th_seq) != (*state)->src.seqlo) {
	4915	REASON_SET(reason, PFRES_SYNPROXY);
	4916	FAIL (PF_DROP);
	4917	}
	4918	pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
	4919	pd->src, th->th_dport, th->th_sport,
	4920	(*state)->src.seqhi, ntohl(th->th_seq) + 1,
	4921	TH_SYN\|TH_ACK, 0, (*state)->src.mss, 0, 1,
	4922	0, NULL, NULL);
	4923	REASON_SET(reason, PFRES_SYNPROXY);
	4924	FAIL (PF_SYNPROXY_DROP);
	4925	} else if (!(th->th_flags & TH_ACK) \|\|
	4926	(ntohl(th->th_ack) != (*state)->src.seqhi + 1) \|\|
	4927	(ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
	4928	REASON_SET(reason, PFRES_SYNPROXY);
	4929	FAIL (PF_DROP);
	4930	} else if ((*state)->src_node != NULL &&
	4931	pf_src_connlimit(*state)) {
	4932	REASON_SET(reason, PFRES_SRCLIMIT);
	4933	FAIL (PF_DROP);
	4934	} else
	4935	(*state)->src.state = PF_TCPS_PROXY_DST;
	4936	}
	4937	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
	4938	if (direction == (*state)->direction) {
	4939	if (((th->th_flags & (TH_SYN\|TH_ACK)) != TH_ACK) \|\|
	4940	(ntohl(th->th_ack) != (*state)->src.seqhi + 1) \|\|
	4941	(ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
	4942	REASON_SET(reason, PFRES_SYNPROXY);
	4943	FAIL (PF_DROP);
	4944	}
	4945	(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
	4946	if ((*state)->dst.seqhi == 1)
	4947	(*state)->dst.seqhi = htonl(karc4random());
	4948	pf_send_tcp((*state)->rule.ptr, pd->af,
	4949	&sk->addr[pd->sidx], &sk->addr[pd->didx],
	4950	sk->port[pd->sidx], sk->port[pd->didx],
	4951	(*state)->dst.seqhi, 0, TH_SYN, 0,
	4952	(state)->src.mss, 0, 0, (state)->tag, NULL, NULL);
	4953	REASON_SET(reason, PFRES_SYNPROXY);
	4954	FAIL (PF_SYNPROXY_DROP);
	4955	} else if (((th->th_flags & (TH_SYN\|TH_ACK)) !=
	4956	(TH_SYN\|TH_ACK)) \|\|
	4957	(ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
	4958	REASON_SET(reason, PFRES_SYNPROXY);
	4959	FAIL (PF_DROP);
	4960	} else {
	4961	(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
	4962	(*state)->dst.seqlo = ntohl(th->th_seq);
	4963	pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
	4964	pd->src, th->th_dport, th->th_sport,
	4965	ntohl(th->th_ack), ntohl(th->th_seq) + 1,
	4966	TH_ACK, (*state)->src.max_win, 0, 0, 0,
	4967	(*state)->tag, NULL, NULL);
	4968	pf_send_tcp((*state)->rule.ptr, pd->af,
	4969	&sk->addr[pd->sidx], &sk->addr[pd->didx],
	4970	sk->port[pd->sidx], sk->port[pd->didx],
	4971	(state)->src.seqhi + 1, (state)->src.seqlo + 1,
	4972	TH_ACK, (*state)->dst.max_win, 0, 0, 1,
	4973	0, NULL, NULL);
	4974	(state)->src.seqdiff = (state)->dst.seqhi -
	4975	(*state)->src.seqlo;
	4976	(state)->dst.seqdiff = (state)->src.seqhi -
	4977	(*state)->dst.seqlo;
	4978	(state)->src.seqhi = (state)->src.seqlo +
	4979	(*state)->dst.max_win;
	4980	(state)->dst.seqhi = (state)->dst.seqlo +
	4981	(*state)->src.max_win;
	4982	(state)->src.wscale = (state)->dst.wscale = 0;
	4983	(state)->src.state = (state)->dst.state =
	4984	TCPS_ESTABLISHED;
	4985	REASON_SET(reason, PFRES_SYNPROXY);
	4986	FAIL (PF_SYNPROXY_DROP);
	4987	}
	4988	}
	4989
	4990	/*
	4991	* Check for connection (addr+port pair) reuse. We can't actually
	4992	* unlink the state if we don't own it.
	4993	*/
	4994	if (((th->th_flags & (TH_SYN\|TH_ACK)) == TH_SYN) &&
	4995	dst->state >= TCPS_FIN_WAIT_2 &&
	4996	src->state >= TCPS_FIN_WAIT_2) {
	4997	if (pf_status.debug >= PF_DEBUG_MISC) {
	4998	kprintf("pf: state reuse ");
	4999	pf_print_state(*state);
	5000	pf_print_flags(th->th_flags);
	5001	kprintf("\n");
	5002	}
	5003	/* XXX make sure it's the same direction ?? */
	5004	(state)->src.state = (state)->dst.state = TCPS_CLOSED;
	5005	if ((*state)->cpuid == mycpu->gd_cpuid) {
	5006	pf_unlink_state(*state);
	5007	*state = NULL;
	5008	} else {
	5009	(*state)->timeout = PFTM_PURGE;
	5010	}
	5011	FAIL (PF_DROP);
	5012	}
	5013
	5014	if ((*state)->state_flags & PFSTATE_SLOPPY) {
	5015	if (pf_tcp_track_sloppy(src, dst, state, pd,
	5016	reason) == PF_DROP) {
	5017	FAIL (PF_DROP);
	5018	}
	5019	} else {
	5020	if (pf_tcp_track_full(src, dst, state, kif, m, off, pd,
	5021	reason, &copyback) == PF_DROP) {
	5022	FAIL (PF_DROP);
	5023	}
	5024	}
	5025
	5026	/* translate source/destination address, if necessary */
	5027	if ((state)->key[PF_SK_WIRE] != (state)->key[PF_SK_STACK]) {
	5028	struct pf_state_key nk = (state)->key[pd->didx];
	5029
	5030	if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) \|\|
	5031	nk->port[pd->sidx] != th->th_sport) {
	5032	/*
	5033	* The translated source address may be completely
	5034	* unrelated to the saved link header, make sure
	5035	* a bridge doesn't try to use it.
	5036	*/
	5037	m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
	5038	pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
	5039	&th->th_sum, &nk->addr[pd->sidx],
	5040	nk->port[pd->sidx], 0, pd->af);
	5041	}
	5042
	5043	if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) \|\|
	5044	nk->port[pd->didx] != th->th_dport) {
	5045	/*
	5046	* If we don't redispatch the packet will go into
	5047	* the protocol stack on the wrong cpu for the
	5048	* post-translated address.
	5049	*/
	5050	pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
	5051	&th->th_sum, &nk->addr[pd->didx],
	5052	nk->port[pd->didx], 0, pd->af);
	5053	}
	5054	copyback = 1;
	5055	}
	5056
	5057	/* Copyback sequence modulation or stateful scrub changes if needed */
	5058	if (copyback) {
	5059	m->m_flags &= ~M_HASH;
	5060	m_copyback(m, off, sizeof(*th), th);
	5061	}
	5062
	5063	pfsync_update_state(*state);
	5064	error = PF_PASS;
	5065	done:
	5066	if (*state)
	5067	lockmgr(&(*state)->lk, LK_RELEASE);
	5068	return (error);
	5069	}
	5070
	5071	/*
	5072	* Test UDP connection state. Caller must hold the state locked.
	5073	*/
	5074	int
	5075	pf_test_state_udp(struct pf_state *state, int direction, struct pfi_kif kif,
	5076	struct mbuf m, int off, void h, struct pf_pdesc *pd)
	5077	{
	5078	struct pf_state_peer src, dst;
	5079	struct pf_state_key_cmp key;
	5080	struct udphdr *uh = pd->hdr.udp;
	5081
	5082	bzero(&key, sizeof(key));
	5083	key.af = pd->af;
	5084	key.proto = IPPROTO_UDP;
	5085	if (direction == PF_IN) { /* wire side, straight */
	5086	PF_ACPY(&key.addr[0], pd->src, key.af);
	5087	PF_ACPY(&key.addr[1], pd->dst, key.af);
	5088	key.port[0] = uh->uh_sport;
	5089	key.port[1] = uh->uh_dport;
	5090	} else { /* stack side, reverse */
	5091	PF_ACPY(&key.addr[1], pd->src, key.af);
	5092	PF_ACPY(&key.addr[0], pd->dst, key.af);
	5093	key.port[1] = uh->uh_sport;
	5094	key.port[0] = uh->uh_dport;
	5095	}
	5096
	5097	STATE_LOOKUP(kif, &key, direction, *state, m);
	5098	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5099
	5100	if (direction == (*state)->direction) {
	5101	src = &(*state)->src;
	5102	dst = &(*state)->dst;
	5103	} else {
	5104	src = &(*state)->dst;
	5105	dst = &(*state)->src;
	5106	}
	5107
	5108	/* update states */
	5109	if (src->state < PFUDPS_SINGLE)
	5110	src->state = PFUDPS_SINGLE;
	5111	if (dst->state == PFUDPS_SINGLE)
	5112	dst->state = PFUDPS_MULTIPLE;
	5113
	5114	/* update expire time */
	5115	(*state)->expire = time_second;
	5116	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
	5117	(*state)->timeout = PFTM_UDP_MULTIPLE;
	5118	else
	5119	(*state)->timeout = PFTM_UDP_SINGLE;
	5120
	5121	/* translate source/destination address, if necessary */
	5122	if ((state)->key[PF_SK_WIRE] != (state)->key[PF_SK_STACK]) {
	5123	struct pf_state_key nk = (state)->key[pd->didx];
	5124
	5125	if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) \|\|
	5126	nk->port[pd->sidx] != uh->uh_sport) {
	5127	/*
	5128	* The translated source address may be completely
	5129	* unrelated to the saved link header, make sure
	5130	* a bridge doesn't try to use it.
	5131	*/
	5132	m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
	5133	m->m_flags &= ~M_HASH;
	5134	pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
	5135	&uh->uh_sum, &nk->addr[pd->sidx],
	5136	nk->port[pd->sidx], 1, pd->af);
	5137	}
	5138
	5139	if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) \|\|
	5140	nk->port[pd->didx] != uh->uh_dport) {
	5141	/*
	5142	* If we don't redispatch the packet will go into
	5143	* the protocol stack on the wrong cpu for the
	5144	* post-translated address.
	5145	*/
	5146	m->m_flags &= ~M_HASH;
	5147	pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
	5148	&uh->uh_sum, &nk->addr[pd->didx],
	5149	nk->port[pd->didx], 1, pd->af);
	5150	}
	5151	m_copyback(m, off, sizeof(*uh), uh);
	5152	}
	5153
	5154	pfsync_update_state(*state);
	5155	lockmgr(&(*state)->lk, LK_RELEASE);
	5156	return (PF_PASS);
	5157	}
	5158
	5159	/*
	5160	* Test ICMP connection state. Caller must hold the state locked.
	5161	*/
	5162	int
	5163	pf_test_state_icmp(struct pf_state *state, int direction, struct pfi_kif kif,
	5164	struct mbuf m, int off, void h, struct pf_pdesc *pd,
	5165	u_short *reason)
	5166	{
	5167	struct pf_addr saddr = pd->src, daddr = pd->dst;
	5168	u_int16_t icmpid = 0, *icmpsum = NULL;
	5169	u_int8_t icmptype = 0;
	5170	int state_icmp = 0;
	5171	int error;
	5172	struct pf_state_key_cmp key;
	5173
	5174	bzero(&key, sizeof(key));
	5175
	5176	switch (pd->proto) {
	5177	#ifdef INET
	5178	case IPPROTO_ICMP:
	5179	icmptype = pd->hdr.icmp->icmp_type;
	5180	icmpid = pd->hdr.icmp->icmp_id;
	5181	icmpsum = &pd->hdr.icmp->icmp_cksum;
	5182
	5183	if (icmptype == ICMP_UNREACH \|\|
	5184	icmptype == ICMP_SOURCEQUENCH \|\|
	5185	icmptype == ICMP_REDIRECT \|\|
	5186	icmptype == ICMP_TIMXCEED \|\|
	5187	icmptype == ICMP_PARAMPROB)
	5188	state_icmp++;
	5189	break;
	5190	#endif /* INET */
	5191	#ifdef INET6
	5192	case IPPROTO_ICMPV6:
	5193	icmptype = pd->hdr.icmp6->icmp6_type;
	5194	icmpid = pd->hdr.icmp6->icmp6_id;
	5195	icmpsum = &pd->hdr.icmp6->icmp6_cksum;
	5196
	5197	if (icmptype == ICMP6_DST_UNREACH \|\|
	5198	icmptype == ICMP6_PACKET_TOO_BIG \|\|
	5199	icmptype == ICMP6_TIME_EXCEEDED \|\|
	5200	icmptype == ICMP6_PARAM_PROB)
	5201	state_icmp++;
	5202	break;
	5203	#endif /* INET6 */
	5204	}
	5205
	5206	if (!state_icmp) {
	5207
	5208	/*
	5209	* ICMP query/reply message not related to a TCP/UDP packet.
	5210	* Search for an ICMP state.
	5211	*/
	5212	key.af = pd->af;
	5213	key.proto = pd->proto;
	5214	key.port[0] = key.port[1] = icmpid;
	5215	if (direction == PF_IN) { /* wire side, straight */
	5216	PF_ACPY(&key.addr[0], pd->src, key.af);
	5217	PF_ACPY(&key.addr[1], pd->dst, key.af);
	5218	} else { /* stack side, reverse */
	5219	PF_ACPY(&key.addr[1], pd->src, key.af);
	5220	PF_ACPY(&key.addr[0], pd->dst, key.af);
	5221	}
	5222
	5223	STATE_LOOKUP(kif, &key, direction, *state, m);
	5224	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5225
	5226	(*state)->expire = time_second;
	5227	(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
	5228
	5229	/* translate source/destination address, if necessary */
	5230	if ((state)->key[PF_SK_WIRE] != (state)->key[PF_SK_STACK]) {
	5231	struct pf_state_key nk = (state)->key[pd->didx];
	5232
	5233	switch (pd->af) {
	5234	#ifdef INET
	5235	case AF_INET:
	5236	if (PF_ANEQ(pd->src,
	5237	&nk->addr[pd->sidx], AF_INET))
	5238	pf_change_a(&saddr->v4.s_addr,
	5239	pd->ip_sum,
	5240	nk->addr[pd->sidx].v4.s_addr, 0);
	5241
	5242	if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
	5243	AF_INET))
	5244	pf_change_a(&daddr->v4.s_addr,
	5245	pd->ip_sum,
	5246	nk->addr[pd->didx].v4.s_addr, 0);
	5247
	5248	if (nk->port[0] !=
	5249	pd->hdr.icmp->icmp_id) {
	5250	pd->hdr.icmp->icmp_cksum =
	5251	pf_cksum_fixup(
	5252	pd->hdr.icmp->icmp_cksum, icmpid,
	5253	nk->port[pd->sidx], 0);
	5254	pd->hdr.icmp->icmp_id =
	5255	nk->port[pd->sidx];
	5256	}
	5257
	5258	m->m_flags &= ~M_HASH;
	5259	m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
	5260	break;
	5261	#endif /* INET */
	5262	#ifdef INET6
	5263	case AF_INET6:
	5264	if (PF_ANEQ(pd->src,
	5265	&nk->addr[pd->sidx], AF_INET6))
	5266	pf_change_a6(saddr,
	5267	&pd->hdr.icmp6->icmp6_cksum,
	5268	&nk->addr[pd->sidx], 0);
	5269
	5270	if (PF_ANEQ(pd->dst,
	5271	&nk->addr[pd->didx], AF_INET6))
	5272	pf_change_a6(daddr,
	5273	&pd->hdr.icmp6->icmp6_cksum,
	5274	&nk->addr[pd->didx], 0);
	5275
	5276	m->m_flags &= ~M_HASH;
	5277	m_copyback(m, off, sizeof(struct icmp6_hdr),
	5278	pd->hdr.icmp6);
	5279	break;
	5280	#endif /* INET6 */
	5281	}
	5282	}
	5283	} else {
	5284	/*
	5285	* ICMP error message in response to a TCP/UDP packet.
	5286	* Extract the inner TCP/UDP header and search for that state.
	5287	*/
	5288
	5289	struct pf_pdesc pd2;
	5290	#ifdef INET
	5291	struct ip h2;
	5292	#endif /* INET */
	5293	#ifdef INET6
	5294	struct ip6_hdr h2_6;
	5295	int terminal = 0;
	5296	#endif /* INET6 */
	5297	int ipoff2;
	5298	int off2;
	5299
	5300	pd2.not_cpu_localized = 1;
	5301	pd2.af = pd->af;
	5302	/* Payload packet is from the opposite direction. */
	5303	pd2.sidx = (direction == PF_IN) ? 1 : 0;
	5304	pd2.didx = (direction == PF_IN) ? 0 : 1;
	5305	switch (pd->af) {
	5306	#ifdef INET
	5307	case AF_INET:
	5308	/* offset of h2 in mbuf chain */
	5309	ipoff2 = off + ICMP_MINLEN;
	5310
	5311	if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
	5312	NULL, reason, pd2.af)) {
	5313	DPFPRINTF(PF_DEBUG_MISC,
	5314	("pf: ICMP error message too short "
	5315	"(ip)\n"));
	5316	FAIL (PF_DROP);
	5317	}
	5318	/*
	5319	* ICMP error messages don't refer to non-first
	5320	* fragments
	5321	*/
	5322	if (h2.ip_off & htons(IP_OFFMASK)) {
	5323	REASON_SET(reason, PFRES_FRAG);
	5324	FAIL (PF_DROP);
	5325	}
	5326
	5327	/* offset of protocol header that follows h2 */
	5328	off2 = ipoff2 + (h2.ip_hl << 2);
	5329
	5330	pd2.proto = h2.ip_p;
	5331	pd2.src = (struct pf_addr *)&h2.ip_src;
	5332	pd2.dst = (struct pf_addr *)&h2.ip_dst;
	5333	pd2.ip_sum = &h2.ip_sum;
	5334	break;
	5335	#endif /* INET */
	5336	#ifdef INET6
	5337	case AF_INET6:
	5338	ipoff2 = off + sizeof(struct icmp6_hdr);
	5339
	5340	if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
	5341	NULL, reason, pd2.af)) {
	5342	DPFPRINTF(PF_DEBUG_MISC,
	5343	("pf: ICMP error message too short "
	5344	"(ip6)\n"));
	5345	FAIL (PF_DROP);
	5346	}
	5347	pd2.proto = h2_6.ip6_nxt;
	5348	pd2.src = (struct pf_addr *)&h2_6.ip6_src;
	5349	pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
	5350	pd2.ip_sum = NULL;
	5351	off2 = ipoff2 + sizeof(h2_6);
	5352	do {
	5353	switch (pd2.proto) {
	5354	case IPPROTO_FRAGMENT:
	5355	/*
	5356	* ICMPv6 error messages for
	5357	* non-first fragments
	5358	*/
	5359	REASON_SET(reason, PFRES_FRAG);
	5360	FAIL (PF_DROP);
	5361	case IPPROTO_AH:
	5362	case IPPROTO_HOPOPTS:
	5363	case IPPROTO_ROUTING:
	5364	case IPPROTO_DSTOPTS: {
	5365	/* get next header and header length */
	5366	struct ip6_ext opt6;
	5367
	5368	if (!pf_pull_hdr(m, off2, &opt6,
	5369	sizeof(opt6), NULL, reason,
	5370	pd2.af)) {
	5371	DPFPRINTF(PF_DEBUG_MISC,
	5372	("pf: ICMPv6 short opt\n"));
	5373	FAIL (PF_DROP);
	5374	}
	5375	if (pd2.proto == IPPROTO_AH)
	5376	off2 += (opt6.ip6e_len + 2) * 4;
	5377	else
	5378	off2 += (opt6.ip6e_len + 1) * 8;
	5379	pd2.proto = opt6.ip6e_nxt;
	5380	/* goto the next header */
	5381	break;
	5382	}
	5383	default:
	5384	terminal++;
	5385	break;
	5386	}
	5387	} while (!terminal);
	5388	break;
	5389	#endif /* INET6 */
	5390	default:
	5391	DPFPRINTF(PF_DEBUG_MISC,
	5392	("pf: ICMP AF %d unknown (ip6)\n", pd->af));
	5393	FAIL (PF_DROP);
	5394	break;
	5395	}
	5396
	5397	switch (pd2.proto) {
	5398	case IPPROTO_TCP: {
	5399	struct tcphdr th;
	5400	u_int32_t seq;
	5401	struct pf_state_peer src, dst;
	5402	u_int8_t dws;
	5403	int copyback = 0;
	5404
	5405	/*
	5406	* Only the first 8 bytes of the TCP header can be
	5407	* expected. Don't access any TCP header fields after
	5408	* th_seq, an ackskew test is not possible.
	5409	*/
	5410	if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
	5411	pd2.af)) {
	5412	DPFPRINTF(PF_DEBUG_MISC,
	5413	("pf: ICMP error message too short "
	5414	"(tcp)\n"));
	5415	FAIL (PF_DROP);
	5416	}
	5417
	5418	key.af = pd2.af;
	5419	key.proto = IPPROTO_TCP;
	5420	PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
	5421	PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
	5422	key.port[pd2.sidx] = th.th_sport;
	5423	key.port[pd2.didx] = th.th_dport;
	5424
	5425	STATE_LOOKUP(kif, &key, direction, *state, m);
	5426	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5427
	5428	if (direction == (*state)->direction) {
	5429	src = &(*state)->dst;
	5430	dst = &(*state)->src;
	5431	} else {
	5432	src = &(*state)->src;
	5433	dst = &(*state)->dst;
	5434	}
	5435
	5436	if (src->wscale && dst->wscale)
	5437	dws = dst->wscale & PF_WSCALE_MASK;
	5438	else
	5439	dws = 0;
	5440
	5441	/* Demodulate sequence number */
	5442	seq = ntohl(th.th_seq) - src->seqdiff;
	5443	if (src->seqdiff) {
	5444	pf_change_a(&th.th_seq, icmpsum,
	5445	htonl(seq), 0);
	5446	copyback = 1;
	5447	}
	5448
	5449	if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
	5450	(!SEQ_GEQ(src->seqhi, seq) \|\|
	5451	!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
	5452	if (pf_status.debug >= PF_DEBUG_MISC) {
	5453	kprintf("pf: BAD ICMP %d:%d ",
	5454	icmptype, pd->hdr.icmp->icmp_code);
	5455	pf_print_host(pd->src, 0, pd->af);
	5456	kprintf(" -> ");
	5457	pf_print_host(pd->dst, 0, pd->af);
	5458	kprintf(" state: ");
	5459	pf_print_state(*state);
	5460	kprintf(" seq=%u\n", seq);
	5461	}
	5462	REASON_SET(reason, PFRES_BADSTATE);
	5463	FAIL (PF_DROP);
	5464	} else {
	5465	if (pf_status.debug >= PF_DEBUG_MISC) {
	5466	kprintf("pf: OK ICMP %d:%d ",
	5467	icmptype, pd->hdr.icmp->icmp_code);
	5468	pf_print_host(pd->src, 0, pd->af);
	5469	kprintf(" -> ");
	5470	pf_print_host(pd->dst, 0, pd->af);
	5471	kprintf(" state: ");
	5472	pf_print_state(*state);
	5473	kprintf(" seq=%u\n", seq);
	5474	}
	5475	}
	5476
	5477	/* translate source/destination address, if necessary */
	5478	if ((*state)->key[PF_SK_WIRE] !=
	5479	(*state)->key[PF_SK_STACK]) {
	5480	struct pf_state_key *nk =
	5481	(*state)->key[pd->didx];
	5482
	5483	if (PF_ANEQ(pd2.src,
	5484	&nk->addr[pd2.sidx], pd2.af) \|\|
	5485	nk->port[pd2.sidx] != th.th_sport)
	5486	pf_change_icmp(pd2.src, &th.th_sport,
	5487	daddr, &nk->addr[pd2.sidx],
	5488	nk->port[pd2.sidx], NULL,
	5489	pd2.ip_sum, icmpsum,
	5490	pd->ip_sum, 0, pd2.af);
	5491
	5492	if (PF_ANEQ(pd2.dst,
	5493	&nk->addr[pd2.didx], pd2.af) \|\|
	5494	nk->port[pd2.didx] != th.th_dport)
	5495	pf_change_icmp(pd2.dst, &th.th_dport,
	5496	NULL, /* XXX Inbound NAT? */
	5497	&nk->addr[pd2.didx],
	5498	nk->port[pd2.didx], NULL,
	5499	pd2.ip_sum, icmpsum,
	5500	pd->ip_sum, 0, pd2.af);
	5501	copyback = 1;
	5502	}
	5503
	5504	if (copyback) {
	5505	switch (pd2.af) {
	5506	#ifdef INET
	5507	case AF_INET:
	5508	m_copyback(m, off, ICMP_MINLEN,
	5509	pd->hdr.icmp);
	5510	m_copyback(m, ipoff2, sizeof(h2),
	5511	&h2);
	5512	break;
	5513	#endif /* INET */
	5514	#ifdef INET6
	5515	case AF_INET6:
	5516	m_copyback(m, off,
	5517	sizeof(struct icmp6_hdr),
	5518	pd->hdr.icmp6);
	5519	m_copyback(m, ipoff2, sizeof(h2_6),
	5520	&h2_6);
	5521	break;
	5522	#endif /* INET6 */
	5523	}
	5524	m->m_flags &= ~M_HASH;
	5525	m_copyback(m, off2, 8, &th);
	5526	}
	5527	break;
	5528	}
	5529	case IPPROTO_UDP: {
	5530	struct udphdr uh;
	5531
	5532	if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
	5533	NULL, reason, pd2.af)) {
	5534	DPFPRINTF(PF_DEBUG_MISC,
	5535	("pf: ICMP error message too short "
	5536	"(udp)\n"));
	5537	return (PF_DROP);
	5538	}
	5539
	5540	key.af = pd2.af;
	5541	key.proto = IPPROTO_UDP;
	5542	PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
	5543	PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
	5544	key.port[pd2.sidx] = uh.uh_sport;
	5545	key.port[pd2.didx] = uh.uh_dport;
	5546
	5547	STATE_LOOKUP(kif, &key, direction, *state, m);
	5548	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5549
	5550	/* translate source/destination address, if necessary */
	5551	if ((*state)->key[PF_SK_WIRE] !=
	5552	(*state)->key[PF_SK_STACK]) {
	5553	struct pf_state_key *nk =
	5554	(*state)->key[pd->didx];
	5555
	5556	if (PF_ANEQ(pd2.src,
	5557	&nk->addr[pd2.sidx], pd2.af) \|\|
	5558	nk->port[pd2.sidx] != uh.uh_sport)
	5559	pf_change_icmp(pd2.src, &uh.uh_sport,
	5560	daddr, &nk->addr[pd2.sidx],
	5561	nk->port[pd2.sidx], &uh.uh_sum,
	5562	pd2.ip_sum, icmpsum,
	5563	pd->ip_sum, 1, pd2.af);
	5564
	5565	if (PF_ANEQ(pd2.dst,
	5566	&nk->addr[pd2.didx], pd2.af) \|\|
	5567	nk->port[pd2.didx] != uh.uh_dport)
	5568	pf_change_icmp(pd2.dst, &uh.uh_dport,
	5569	NULL, /* XXX Inbound NAT? */
	5570	&nk->addr[pd2.didx],
	5571	nk->port[pd2.didx], &uh.uh_sum,
	5572	pd2.ip_sum, icmpsum,
	5573	pd->ip_sum, 1, pd2.af);
	5574
	5575	switch (pd2.af) {
	5576	#ifdef INET
	5577	case AF_INET:
	5578	m_copyback(m, off, ICMP_MINLEN,
	5579	pd->hdr.icmp);
	5580	m_copyback(m, ipoff2, sizeof(h2),
	5581	&h2);
	5582	break;
	5583	#endif /* INET */
	5584	#ifdef INET6
	5585	case AF_INET6:
	5586	m_copyback(m, off,
	5587	sizeof(struct icmp6_hdr),
	5588	pd->hdr.icmp6);
	5589	m_copyback(m, ipoff2, sizeof(h2_6),
	5590	&h2_6);
	5591	break;
	5592	#endif /* INET6 */
	5593	}
	5594	m->m_flags &= ~M_HASH;
	5595	m_copyback(m, off2, sizeof(uh), &uh);
	5596	}
	5597	break;
	5598	}
	5599	#ifdef INET
	5600	case IPPROTO_ICMP: {
	5601	struct icmp iih;
	5602
	5603	if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
	5604	NULL, reason, pd2.af)) {
	5605	DPFPRINTF(PF_DEBUG_MISC,
	5606	("pf: ICMP error message too short i"
	5607	"(icmp)\n"));
	5608	return (PF_DROP);
	5609	}
	5610
	5611	key.af = pd2.af;
	5612	key.proto = IPPROTO_ICMP;
	5613	PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
	5614	PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
	5615	key.port[0] = key.port[1] = iih.icmp_id;
	5616
	5617	STATE_LOOKUP(kif, &key, direction, *state, m);
	5618	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5619
	5620	/* translate source/destination address, if necessary */
	5621	if ((*state)->key[PF_SK_WIRE] !=
	5622	(*state)->key[PF_SK_STACK]) {
	5623	struct pf_state_key *nk =
	5624	(*state)->key[pd->didx];
	5625
	5626	if (PF_ANEQ(pd2.src,
	5627	&nk->addr[pd2.sidx], pd2.af) \|\|
	5628	nk->port[pd2.sidx] != iih.icmp_id)
	5629	pf_change_icmp(pd2.src, &iih.icmp_id,
	5630	daddr, &nk->addr[pd2.sidx],
	5631	nk->port[pd2.sidx], NULL,
	5632	pd2.ip_sum, icmpsum,
	5633	pd->ip_sum, 0, AF_INET);
	5634
	5635	if (PF_ANEQ(pd2.dst,
	5636	&nk->addr[pd2.didx], pd2.af) \|\|
	5637	nk->port[pd2.didx] != iih.icmp_id)
	5638	pf_change_icmp(pd2.dst, &iih.icmp_id,
	5639	NULL, /* XXX Inbound NAT? */
	5640	&nk->addr[pd2.didx],
	5641	nk->port[pd2.didx], NULL,
	5642	pd2.ip_sum, icmpsum,
	5643	pd->ip_sum, 0, AF_INET);
	5644
	5645	m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
	5646	m_copyback(m, ipoff2, sizeof(h2), &h2);
	5647	m_copyback(m, off2, ICMP_MINLEN, &iih);
	5648	m->m_flags &= ~M_HASH;
	5649	}
	5650	break;
	5651	}
	5652	#endif /* INET */
	5653	#ifdef INET6
	5654	case IPPROTO_ICMPV6: {
	5655	struct icmp6_hdr iih;
	5656
	5657	if (!pf_pull_hdr(m, off2, &iih,
	5658	sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
	5659	DPFPRINTF(PF_DEBUG_MISC,
	5660	("pf: ICMP error message too short "
	5661	"(icmp6)\n"));
	5662	FAIL (PF_DROP);
	5663	}
	5664
	5665	key.af = pd2.af;
	5666	key.proto = IPPROTO_ICMPV6;
	5667	PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
	5668	PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
	5669	key.port[0] = key.port[1] = iih.icmp6_id;
	5670
	5671	STATE_LOOKUP(kif, &key, direction, *state, m);
	5672	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5673
	5674	/* translate source/destination address, if necessary */
	5675	if ((*state)->key[PF_SK_WIRE] !=
	5676	(*state)->key[PF_SK_STACK]) {
	5677	struct pf_state_key *nk =
	5678	(*state)->key[pd->didx];
	5679
	5680	if (PF_ANEQ(pd2.src,
	5681	&nk->addr[pd2.sidx], pd2.af) \|\|
	5682	nk->port[pd2.sidx] != iih.icmp6_id)
	5683	pf_change_icmp(pd2.src, &iih.icmp6_id,
	5684	daddr, &nk->addr[pd2.sidx],
	5685	nk->port[pd2.sidx], NULL,
	5686	pd2.ip_sum, icmpsum,
	5687	pd->ip_sum, 0, AF_INET6);
	5688
	5689	if (PF_ANEQ(pd2.dst,
	5690	&nk->addr[pd2.didx], pd2.af) \|\|
	5691	nk->port[pd2.didx] != iih.icmp6_id)
	5692	pf_change_icmp(pd2.dst, &iih.icmp6_id,
	5693	NULL, /* XXX Inbound NAT? */
	5694	&nk->addr[pd2.didx],
	5695	nk->port[pd2.didx], NULL,
	5696	pd2.ip_sum, icmpsum,
	5697	pd->ip_sum, 0, AF_INET6);
	5698
	5699	m_copyback(m, off, sizeof(struct icmp6_hdr),
	5700	pd->hdr.icmp6);
	5701	m_copyback(m, ipoff2, sizeof(h2_6), &h2_6);
	5702	m_copyback(m, off2, sizeof(struct icmp6_hdr),
	5703	&iih);
	5704	m->m_flags &= ~M_HASH;
	5705	}
	5706	break;
	5707	}
	5708	#endif /* INET6 */
	5709	default: {
	5710	key.af = pd2.af;
	5711	key.proto = pd2.proto;
	5712	PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
	5713	PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
	5714	key.port[0] = key.port[1] = 0;
	5715
	5716	STATE_LOOKUP(kif, &key, direction, *state, m);
	5717	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5718
	5719	/* translate source/destination address, if necessary */
	5720	if ((*state)->key[PF_SK_WIRE] !=
	5721	(*state)->key[PF_SK_STACK]) {
	5722	struct pf_state_key *nk =
	5723	(*state)->key[pd->didx];
	5724
	5725	if (PF_ANEQ(pd2.src,
	5726	&nk->addr[pd2.sidx], pd2.af))
	5727	pf_change_icmp(pd2.src, NULL, daddr,
	5728	&nk->addr[pd2.sidx], 0, NULL,
	5729	pd2.ip_sum, icmpsum,
	5730	pd->ip_sum, 0, pd2.af);
	5731
	5732	if (PF_ANEQ(pd2.dst,
	5733	&nk->addr[pd2.didx], pd2.af))
	5734	pf_change_icmp(pd2.src, NULL,
	5735	NULL, /* XXX Inbound NAT? */
	5736	&nk->addr[pd2.didx], 0, NULL,
	5737	pd2.ip_sum, icmpsum,
	5738	pd->ip_sum, 0, pd2.af);
	5739
	5740	switch (pd2.af) {
	5741	#ifdef INET
	5742	case AF_INET:
	5743	m_copyback(m, off, ICMP_MINLEN,
	5744	pd->hdr.icmp);
	5745	m_copyback(m, ipoff2, sizeof(h2),
	5746	&h2);
	5747	m->m_flags &= ~M_HASH;
	5748	break;
	5749	#endif /* INET */
	5750	#ifdef INET6
	5751	case AF_INET6:
	5752	m_copyback(m, off,
	5753	sizeof(struct icmp6_hdr),
	5754	pd->hdr.icmp6);
	5755	m_copyback(m, ipoff2, sizeof(h2_6),
	5756	&h2_6);
	5757	m->m_flags &= ~M_HASH;
	5758	break;
	5759	#endif /* INET6 */
	5760	}
	5761	}
	5762	break;
	5763	}
	5764	}
	5765	}
	5766
	5767	pfsync_update_state(*state);
	5768	error = PF_PASS;
	5769	done:
	5770	if (*state)
	5771	lockmgr(&(*state)->lk, LK_RELEASE);
	5772	return (error);
	5773	}
	5774
	5775	/*
	5776	* Test other connection state. Caller must hold the state locked.
	5777	*/
	5778	int
	5779	pf_test_state_other(struct pf_state *state, int direction, struct pfi_kif kif,
	5780	struct mbuf m, struct pf_pdesc pd)
	5781	{
	5782	struct pf_state_peer src, dst;
	5783	struct pf_state_key_cmp key;
	5784
	5785	bzero(&key, sizeof(key));
	5786	key.af = pd->af;
	5787	key.proto = pd->proto;
	5788	if (direction == PF_IN) {
	5789	PF_ACPY(&key.addr[0], pd->src, key.af);
	5790	PF_ACPY(&key.addr[1], pd->dst, key.af);
	5791	key.port[0] = key.port[1] = 0;
	5792	} else {
	5793	PF_ACPY(&key.addr[1], pd->src, key.af);
	5794	PF_ACPY(&key.addr[0], pd->dst, key.af);
	5795	key.port[1] = key.port[0] = 0;
	5796	}
	5797
	5798	STATE_LOOKUP(kif, &key, direction, *state, m);
	5799	lockmgr(&(*state)->lk, LK_EXCLUSIVE);
	5800
	5801	if (direction == (*state)->direction) {
	5802	src = &(*state)->src;
	5803	dst = &(*state)->dst;
	5804	} else {
	5805	src = &(*state)->dst;
	5806	dst = &(*state)->src;
	5807	}
	5808
	5809	/* update states */
	5810	if (src->state < PFOTHERS_SINGLE)
	5811	src->state = PFOTHERS_SINGLE;
	5812	if (dst->state == PFOTHERS_SINGLE)
	5813	dst->state = PFOTHERS_MULTIPLE;
	5814
	5815	/* update expire time */
	5816	(*state)->expire = time_second;
	5817	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
	5818	(*state)->timeout = PFTM_OTHER_MULTIPLE;
	5819	else
	5820	(*state)->timeout = PFTM_OTHER_SINGLE;
	5821
	5822	/* translate source/destination address, if necessary */
	5823	if ((state)->key[PF_SK_WIRE] != (state)->key[PF_SK_STACK]) {
	5824	struct pf_state_key nk = (state)->key[pd->didx];
	5825
	5826	KKASSERT(nk);
	5827	KKASSERT(pd);
	5828	KKASSERT(pd->src);
	5829	KKASSERT(pd->dst);
	5830	switch (pd->af) {
	5831	#ifdef INET
	5832	case AF_INET:
	5833	if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
	5834	pf_change_a(&pd->src->v4.s_addr,
	5835	pd->ip_sum,
	5836	nk->addr[pd->sidx].v4.s_addr,
	5837	0);
	5838
	5839
	5840	if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
	5841	pf_change_a(&pd->dst->v4.s_addr,
	5842	pd->ip_sum,
	5843	nk->addr[pd->didx].v4.s_addr,
	5844	0);
	5845
	5846	break;
	5847	#endif /* INET */
	5848	#ifdef INET6
	5849	case AF_INET6:
	5850	if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
	5851	PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
	5852
	5853	if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
	5854	PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
	5855	#endif /* INET6 */
	5856	}
	5857	}
	5858
	5859	pfsync_update_state(*state);
	5860	lockmgr(&(*state)->lk, LK_RELEASE);
	5861	return (PF_PASS);
	5862	}
	5863
	5864	/*
	5865	* ipoff and off are measured from the start of the mbuf chain.
	5866	* h must be at "ipoff" on the mbuf chain.
	5867	*/
	5868	void *
	5869	pf_pull_hdr(struct mbuf m, int off, void p, int len,
	5870	u_short actionp, u_short reasonp, sa_family_t af)
	5871	{
	5872	switch (af) {
	5873	#ifdef INET
	5874	case AF_INET: {
	5875	struct ip h = mtod(m, struct ip );
	5876	u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
	5877
	5878	if (fragoff) {
	5879	if (fragoff >= len)
	5880	ACTION_SET(actionp, PF_PASS);
	5881	else {
	5882	ACTION_SET(actionp, PF_DROP);
	5883	REASON_SET(reasonp, PFRES_FRAG);
	5884	}
	5885	return (NULL);
	5886	}
	5887	if (m->m_pkthdr.len < off + len \|\|
	5888	ntohs(h->ip_len) < off + len) {
	5889	ACTION_SET(actionp, PF_DROP);
	5890	REASON_SET(reasonp, PFRES_SHORT);
	5891	return (NULL);
	5892	}
	5893	break;
	5894	}
	5895	#endif /* INET */
	5896	#ifdef INET6
	5897	case AF_INET6: {
	5898	struct ip6_hdr h = mtod(m, struct ip6_hdr );
	5899
	5900	if (m->m_pkthdr.len < off + len \|\|
	5901	(ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
	5902	(unsigned)(off + len)) {
	5903	ACTION_SET(actionp, PF_DROP);
	5904	REASON_SET(reasonp, PFRES_SHORT);
	5905	return (NULL);
	5906	}
	5907	break;
	5908	}
	5909	#endif /* INET6 */
	5910	}
	5911	m_copydata(m, off, len, p);
	5912	return (p);
	5913	}
	5914
	5915	int
	5916	pf_routable(struct pf_addr addr, sa_family_t af, struct pfi_kif kif)
	5917	{
	5918	struct sockaddr_in *dst;
	5919	int ret = 1;
	5920	int check_mpath;
	5921	#ifdef INET6
	5922	struct sockaddr_in6 *dst6;
	5923	struct route_in6 ro;
	5924	#else
	5925	struct route ro;
	5926	#endif
	5927	struct radix_node *rn;
	5928	struct rtentry *rt;
	5929	struct ifnet *ifp;
	5930
	5931	check_mpath = 0;
	5932	bzero(&ro, sizeof(ro));
	5933	switch (af) {
	5934	case AF_INET:
	5935	dst = satosin(&ro.ro_dst);
	5936	dst->sin_family = AF_INET;
	5937	dst->sin_len = sizeof(*dst);
	5938	dst->sin_addr = addr->v4;
	5939	break;
	5940	#ifdef INET6
	5941	case AF_INET6:
	5942	/*
	5943	* Skip check for addresses with embedded interface scope,
	5944	* as they would always match anyway.
	5945	*/
	5946	if (IN6_IS_SCOPE_EMBED(&addr->v6))
	5947	goto out;
	5948	dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
	5949	dst6->sin6_family = AF_INET6;
	5950	dst6->sin6_len = sizeof(*dst6);
	5951	dst6->sin6_addr = addr->v6;
	5952	break;
	5953	#endif /* INET6 */
	5954	default:
	5955	return (0);
	5956	}
	5957
	5958	/* Skip checks for ipsec interfaces */
	5959	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
	5960	goto out;
	5961
	5962	rtalloc_ign((struct route *)&ro, 0);
	5963
	5964	if (ro.ro_rt != NULL) {
	5965	/* No interface given, this is a no-route check */
	5966	if (kif == NULL)
	5967	goto out;
	5968
	5969	if (kif->pfik_ifp == NULL) {
	5970	ret = 0;
	5971	goto out;
	5972	}
	5973
	5974	/* Perform uRPF check if passed input interface */
	5975	ret = 0;
	5976	rn = (struct radix_node *)ro.ro_rt;
	5977	do {
	5978	rt = (struct rtentry *)rn;
	5979	ifp = rt->rt_ifp;
	5980
	5981	if (kif->pfik_ifp == ifp)
	5982	ret = 1;
	5983	rn = NULL;
	5984	} while (check_mpath == 1 && rn != NULL && ret == 0);
	5985	} else
	5986	ret = 0;
	5987	out:
	5988	if (ro.ro_rt != NULL)
	5989	RTFREE(ro.ro_rt);
	5990	return (ret);
	5991	}
	5992
	5993	int
	5994	pf_rtlabel_match(struct pf_addr addr, sa_family_t af, struct pf_addr_wrap aw)
	5995	{
	5996	struct sockaddr_in *dst;
	5997	#ifdef INET6
	5998	struct sockaddr_in6 *dst6;
	5999	struct route_in6 ro;
	6000	#else
	6001	struct route ro;
	6002	#endif
	6003	int ret = 0;
	6004
	6005	ASSERT_LWKT_TOKEN_HELD(&pf_token);
	6006
	6007	bzero(&ro, sizeof(ro));
	6008	switch (af) {
	6009	case AF_INET:
	6010	dst = satosin(&ro.ro_dst);
	6011	dst->sin_family = AF_INET;
	6012	dst->sin_len = sizeof(*dst);
	6013	dst->sin_addr = addr->v4;
	6014	break;
	6015	#ifdef INET6
	6016	case AF_INET6:
	6017	dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
	6018	dst6->sin6_family = AF_INET6;
	6019	dst6->sin6_len = sizeof(*dst6);
	6020	dst6->sin6_addr = addr->v6;
	6021	break;
	6022	#endif /* INET6 */
	6023	default:
	6024	return (0);
	6025	}
	6026
	6027	rtalloc_ign((struct route *)&ro, (RTF_CLONING \| RTF_PRCLONING));
	6028
	6029	if (ro.ro_rt != NULL) {
	6030	RTFREE(ro.ro_rt);
	6031	}
	6032
	6033	return (ret);
	6034	}
	6035
	6036	#ifdef INET
	6037	void
	6038	pf_route(struct mbuf *m, struct pf_rule r, int dir, struct ifnet *oifp,
	6039	struct pf_state s, struct pf_pdesc pd)
	6040	{
	6041	struct mbuf m0, m1;
	6042	struct route iproute;
	6043	struct route *ro = NULL;
	6044	struct sockaddr_in *dst;
	6045	struct ip *ip;
	6046	struct ifnet *ifp = NULL;
	6047	struct pf_addr naddr;
	6048	struct pf_src_node *sn = NULL;
	6049	int error = 0;
	6050	int sw_csum;
	6051
	6052	ASSERT_LWKT_TOKEN_HELD(&pf_token);
	6053
	6054	if (m == NULL \|\| *m == NULL \|\| r == NULL \|\|
	6055	(dir != PF_IN && dir != PF_OUT) \|\| oifp == NULL)
	6056	panic("pf_route: invalid parameters");
	6057
	6058	if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
	6059	(*m)->m_pkthdr.fw_flags \|= PF_MBUF_ROUTED;
	6060	(*m)->m_pkthdr.pf.routed = 1;
	6061	} else {
	6062	if ((*m)->m_pkthdr.pf.routed++ > 3) {
	6063	m0 = *m;
	6064	*m = NULL;
	6065	goto bad;
	6066	}
	6067	}
	6068
	6069	if (r->rt == PF_DUPTO) {
	6070	if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
	6071	return;
	6072	}
	6073	} else {
	6074	if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
	6075	return;
	6076	}
	6077	m0 = *m;
	6078	}
	6079
	6080	if (m0->m_len < sizeof(struct ip)) {
	6081	DPFPRINTF(PF_DEBUG_URGENT,
	6082	("pf_route: m0->m_len < sizeof(struct ip)\n"));
	6083	goto bad;
	6084	}
	6085
	6086	ip = mtod(m0, struct ip *);
	6087
	6088	ro = &iproute;
	6089	bzero((caddr_t)ro, sizeof(*ro));
	6090	dst = satosin(&ro->ro_dst);
	6091	dst->sin_family = AF_INET;
	6092	dst->sin_len = sizeof(*dst);
	6093	dst->sin_addr = ip->ip_dst;
	6094
	6095	if (r->rt == PF_FASTROUTE) {
	6096	rtalloc(ro);
	6097	if (ro->ro_rt == 0) {
	6098	ipstat.ips_noroute++;
	6099	goto bad;
	6100	}
	6101
	6102	ifp = ro->ro_rt->rt_ifp;
	6103	ro->ro_rt->rt_use++;
	6104
	6105	if (ro->ro_rt->rt_flags & RTF_GATEWAY)
	6106	dst = satosin(ro->ro_rt->rt_gateway);
	6107	} else {
	6108	if (TAILQ_EMPTY(&r->rpool.list)) {
	6109	DPFPRINTF(PF_DEBUG_URGENT,
	6110	("pf_route: TAILQ_EMPTY(&r->rpool.list)\n"));
	6111	goto bad;
	6112	}
	6113	if (s == NULL) {
	6114	pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
	6115	&naddr, NULL, &sn);
	6116	if (!PF_AZERO(&naddr, AF_INET))
	6117	dst->sin_addr.s_addr = naddr.v4.s_addr;
	6118	ifp = r->rpool.cur->kif ?
	6119	r->rpool.cur->kif->pfik_ifp : NULL;
	6120	} else {
	6121	if (!PF_AZERO(&s->rt_addr, AF_INET))
	6122	dst->sin_addr.s_addr =
	6123	s->rt_addr.v4.s_addr;
	6124	ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
	6125	}
	6126	}
	6127	if (ifp == NULL)
	6128	goto bad;
	6129
	6130	if (oifp != ifp) {
	6131	if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
	6132	goto bad;
	6133	} else if (m0 == NULL) {
	6134	goto done;
	6135	}
	6136	if (m0->m_len < sizeof(struct ip)) {
	6137	DPFPRINTF(PF_DEBUG_URGENT,
	6138	("pf_route: m0->m_len < sizeof(struct ip)\n"));
	6139	goto bad;
	6140	}
	6141	ip = mtod(m0, struct ip *);
	6142	}
	6143
	6144	/* Copied from FreeBSD 5.1-CURRENT ip_output. */
	6145	m0->m_pkthdr.csum_flags \|= CSUM_IP;
	6146	sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
	6147	if (sw_csum & CSUM_DELAY_DATA) {
	6148	in_delayed_cksum(m0);
	6149	sw_csum &= ~CSUM_DELAY_DATA;
	6150	}
	6151	m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
	6152	m0->m_pkthdr.csum_iphlen = (ip->ip_hl << 2);
	6153
	6154	/*
	6155	* WARNING! We cannot fragment if the packet was modified from an
	6156	* original which expected to be using TSO. In this
	6157	* situation we pray that the target interface is
	6158	* compatible with the originating interface.
	6159	*/
	6160	if (ntohs(ip->ip_len) <= ifp->if_mtu \|\|
	6161	(m0->m_pkthdr.csum_flags & CSUM_TSO) \|\|
	6162	((ifp->if_hwassist & CSUM_FRAGMENT) &&
	6163	(ip->ip_off & htons(IP_DF)) == 0)) {
	6164	ip->ip_sum = 0;
	6165	if (sw_csum & CSUM_DELAY_IP) {
	6166	/* From KAME */
	6167	if (ip->ip_v == IPVERSION &&
	6168	(ip->ip_hl << 2) == sizeof(*ip)) {
	6169	ip->ip_sum = in_cksum_hdr(ip);
	6170	} else {
	6171	ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
	6172	}
	6173	}
	6174	lwkt_reltoken(&pf_token);
	6175	error = ifp->if_output(ifp, m0, sintosa(dst), ro->ro_rt);
	6176	lwkt_gettoken(&pf_token);
	6177	goto done;
	6178	}
	6179
	6180	/*
	6181	* Too large for interface; fragment if possible.
	6182	* Must be able to put at least 8 bytes per fragment.
	6183	*/
	6184	if (ip->ip_off & htons(IP_DF)) {
	6185	ipstat.ips_cantfrag++;
	6186	if (r->rt != PF_DUPTO) {
	6187	icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
	6188	ifp->if_mtu);
	6189	goto done;
	6190	} else
	6191	goto bad;
	6192	}
	6193
	6194	m1 = m0;
	6195	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
	6196	if (error) {
	6197	goto bad;
	6198	}
	6199
	6200	for (m0 = m1; m0; m0 = m1) {
	6201	m1 = m0->m_nextpkt;
	6202	m0->m_nextpkt = 0;
	6203	if (error == 0) {
	6204	lwkt_reltoken(&pf_token);
	6205	error = (*ifp->if_output)(ifp, m0, sintosa(dst),
	6206	NULL);
	6207	lwkt_gettoken(&pf_token);
	6208	} else
	6209	m_freem(m0);
	6210	}
	6211
	6212	if (error == 0)
	6213	ipstat.ips_fragmented++;
	6214
	6215	done:
	6216	if (r->rt != PF_DUPTO)
	6217	*m = NULL;
	6218	if (ro == &iproute && ro->ro_rt)
	6219	RTFREE(ro->ro_rt);
	6220	return;
	6221
	6222	bad:
	6223	m_freem(m0);
	6224	goto done;
	6225	}
	6226	#endif /* INET */
	6227
	6228	#ifdef INET6
	6229	void
	6230	pf_route6(struct mbuf *m, struct pf_rule r, int dir, struct ifnet *oifp,
	6231	struct pf_state s, struct pf_pdesc pd)
	6232	{
	6233	struct mbuf *m0;
	6234	struct route_in6 ip6route;
	6235	struct route_in6 *ro;
	6236	struct sockaddr_in6 *dst;
	6237	struct ip6_hdr *ip6;
	6238	struct ifnet *ifp = NULL;
	6239	struct pf_addr naddr;
	6240	struct pf_src_node *sn = NULL;
	6241
	6242	if (m == NULL \|\| *m == NULL \|\| r == NULL \|\|
	6243	(dir != PF_IN && dir != PF_OUT) \|\| oifp == NULL)
	6244	panic("pf_route6: invalid parameters");
	6245
	6246	if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
	6247	(*m)->m_pkthdr.fw_flags \|= PF_MBUF_ROUTED;
	6248	(*m)->m_pkthdr.pf.routed = 1;
	6249	} else {
	6250	if ((*m)->m_pkthdr.pf.routed++ > 3) {
	6251	m0 = *m;
	6252	*m = NULL;
	6253	goto bad;
	6254	}
	6255	}
	6256
	6257	if (r->rt == PF_DUPTO) {
	6258	if ((m0 = m_dup(*m, M_NOWAIT)) == NULL)
	6259	return;
	6260	} else {
	6261	if ((r->rt == PF_REPLYTO) == (r->direction == dir))
	6262	return;
	6263	m0 = *m;
	6264	}
	6265
	6266	if (m0->m_len < sizeof(struct ip6_hdr)) {
	6267	DPFPRINTF(PF_DEBUG_URGENT,
	6268	("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
	6269	goto bad;
	6270	}
	6271	ip6 = mtod(m0, struct ip6_hdr *);
	6272
	6273	ro = &ip6route;
	6274	bzero((caddr_t)ro, sizeof(*ro));
	6275	dst = (struct sockaddr_in6 *)&ro->ro_dst;
	6276	dst->sin6_family = AF_INET6;
	6277	dst->sin6_len = sizeof(*dst);
	6278	dst->sin6_addr = ip6->ip6_dst;
	6279
	6280	/*
	6281	* DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
	6282	* so make sure pf.flags is clear.
	6283	*
	6284	* Cheat. XXX why only in the v6 case???
	6285	*/
	6286	if (r->rt == PF_FASTROUTE) {
	6287	m0->m_pkthdr.fw_flags \|= PF_MBUF_TAGGED;
	6288	m0->m_pkthdr.pf.flags = 0;
	6289	/* XXX Re-Check when Upgrading to > 4.4 */
	6290	m0->m_pkthdr.pf.statekey = NULL;
	6291	ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
	6292	return;
	6293	}
	6294
	6295	if (TAILQ_EMPTY(&r->rpool.list)) {
	6296	DPFPRINTF(PF_DEBUG_URGENT,
	6297	("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n"));
	6298	goto bad;
	6299	}
	6300	if (s == NULL) {
	6301	pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
	6302	&naddr, NULL, &sn);
	6303	if (!PF_AZERO(&naddr, AF_INET6))
	6304	PF_ACPY((struct pf_addr *)&dst->sin6_addr,
	6305	&naddr, AF_INET6);
	6306	ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
	6307	} else {
	6308	if (!PF_AZERO(&s->rt_addr, AF_INET6))
	6309	PF_ACPY((struct pf_addr *)&dst->sin6_addr,
	6310	&s->rt_addr, AF_INET6);
	6311	ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
	6312	}
	6313	if (ifp == NULL)
	6314	goto bad;
	6315
	6316	if (oifp != ifp) {
	6317	if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
	6318	goto bad;
	6319	} else if (m0 == NULL) {
	6320	goto done;
	6321	}
	6322	if (m0->m_len < sizeof(struct ip6_hdr)) {
	6323	DPFPRINTF(PF_DEBUG_URGENT,
	6324	("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
	6325	goto bad;
	6326	}
	6327	ip6 = mtod(m0, struct ip6_hdr *);
	6328	}
	6329
	6330	/*
	6331	* If the packet is too large for the outgoing interface,
	6332	* send back an icmp6 error.
	6333	*/
	6334	if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr))
	6335	dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
	6336	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
	6337	nd6_output(ifp, ifp, m0, dst, NULL);
	6338	} else {
	6339	in6_ifstat_inc(ifp, ifs6_in_toobig);
	6340	if (r->rt != PF_DUPTO)
	6341	icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
	6342	else
	6343	goto bad;
	6344	}
	6345
	6346	done:
	6347	if (r->rt != PF_DUPTO)
	6348	*m = NULL;
	6349	return;
	6350
	6351	bad:
	6352	m_freem(m0);
	6353	goto done;
	6354	}
	6355	#endif /* INET6 */
	6356
	6357
	6358	/*
	6359	* check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag
	6360	* off is the offset where the protocol header starts
	6361	* len is the total length of protocol header plus payload
	6362	* returns 0 when the checksum is valid, otherwise returns 1.
	6363	*/
	6364	/*
	6365	* XXX
	6366	* FreeBSD supports cksum offload for the following drivers.
	6367	* em(4), gx(4), lge(4), nge(4), ti(4), xl(4)
	6368	* If we can make full use of it we would outperform ipfw/ipfilter in
	6369	* very heavy traffic.
	6370	* I have not tested 'cause I don't have NICs that supports cksum offload.
	6371	* (There might be problems. Typical phenomena would be
	6372	* 1. No route message for UDP packet.
	6373	* 2. No connection acceptance from external hosts regardless of rule set.)
	6374	*/
	6375	int
	6376	pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p,
	6377	sa_family_t af)
	6378	{
	6379	u_int16_t sum = 0;
	6380	int hw_assist = 0;
	6381	struct ip *ip;
	6382
	6383	if (off < sizeof(struct ip) \|\| len < sizeof(struct udphdr))
	6384	return (1);
	6385	if (m->m_pkthdr.len < off + len)
	6386	return (1);
	6387
	6388	switch (p) {
	6389	case IPPROTO_TCP:
	6390	case IPPROTO_UDP:
	6391	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
	6392	if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
	6393	sum = m->m_pkthdr.csum_data;
	6394	} else {
	6395	ip = mtod(m, struct ip *);
	6396	sum = in_pseudo(ip->ip_src.s_addr,
	6397	ip->ip_dst.s_addr, htonl((u_short)len +
	6398	m->m_pkthdr.csum_data + p));
	6399	}
	6400	sum ^= 0xffff;
	6401	++hw_assist;
	6402	}
	6403	break;
	6404	case IPPROTO_ICMP:
	6405	#ifdef INET6
	6406	case IPPROTO_ICMPV6:
	6407	#endif /* INET6 */
	6408	break;
	6409	default:
	6410	return (1);
	6411	}
	6412
	6413	if (!hw_assist) {
	6414	switch (af) {
	6415	case AF_INET:
	6416	if (p == IPPROTO_ICMP) {
	6417	if (m->m_len < off)
	6418	return (1);
	6419	m->m_data += off;
	6420	m->m_len -= off;
	6421	sum = in_cksum(m, len);
	6422	m->m_data -= off;
	6423	m->m_len += off;
	6424	} else {
	6425	if (m->m_len < sizeof(struct ip))
	6426	return (1);
	6427	sum = in_cksum_range(m, p, off, len);
	6428	if (sum == 0) {
	6429	m->m_pkthdr.csum_flags \|=
	6430	(CSUM_DATA_VALID \|
	6431	CSUM_PSEUDO_HDR);
	6432	m->m_pkthdr.csum_data = 0xffff;
	6433	}
	6434	}
	6435	break;
	6436	#ifdef INET6
	6437	case AF_INET6:
	6438	if (m->m_len < sizeof(struct ip6_hdr))
	6439	return (1);
	6440	sum = in6_cksum(m, p, off, len);
	6441	/*
	6442	* XXX
	6443	* IPv6 H/W cksum off-load not supported yet!
	6444	*
	6445	* if (sum == 0) {
	6446	* m->m_pkthdr.csum_flags \|=
	6447	* (CSUM_DATA_VALID\|CSUM_PSEUDO_HDR);
	6448	* m->m_pkthdr.csum_data = 0xffff;
	6449	*}
	6450	*/
	6451	break;
	6452	#endif /* INET6 */
	6453	default:
	6454	return (1);
	6455	}
	6456	}
	6457	if (sum) {
	6458	switch (p) {
	6459	case IPPROTO_TCP:
	6460	tcpstat.tcps_rcvbadsum++;
	6461	break;
	6462	case IPPROTO_UDP:
	6463	udp_stat.udps_badsum++;
	6464	break;
	6465	case IPPROTO_ICMP:
	6466	icmpstat.icps_checksum++;
	6467	break;
	6468	#ifdef INET6
	6469	case IPPROTO_ICMPV6:
	6470	icmp6stat.icp6s_checksum++;
	6471	break;
	6472	#endif /* INET6 */
	6473	}
	6474	return (1);
	6475	}
	6476	return (0);
	6477	}
	6478
	6479	struct pf_divert *
	6480	pf_find_divert(struct mbuf *m)
	6481	{
	6482	struct m_tag *mtag;
	6483
	6484	if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL)
	6485	return (NULL);
	6486
	6487	return ((struct pf_divert *)(mtag + 1));
	6488	}
	6489
	6490	struct pf_divert *
	6491	pf_get_divert(struct mbuf *m)
	6492	{
	6493	struct m_tag *mtag;
	6494
	6495	if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL) {
	6496	mtag = m_tag_get(PACKET_TAG_PF_DIVERT, sizeof(struct pf_divert),
	6497	M_NOWAIT);
	6498	if (mtag == NULL)
	6499	return (NULL);
	6500	bzero(mtag + 1, sizeof(struct pf_divert));
	6501	m_tag_prepend(m, mtag);
	6502	}
	6503
	6504	return ((struct pf_divert *)(mtag + 1));
	6505	}
	6506
	6507	#ifdef INET
	6508
	6509	/*
	6510	* WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
	6511	*/
	6512	int
	6513	pf_test(int dir, struct ifnet ifp, struct mbuf *m0,
	6514	struct ether_header eh, struct inpcb inp)
	6515	{
	6516	struct pfi_kif *kif;
	6517	u_short action, reason = 0, log = 0;
	6518	struct mbuf m = m0;
	6519	struct ip *h = NULL;
	6520	struct pf_rule a = NULL, r = &pf_default_rule, tr, nr;
	6521	struct pf_state *s = NULL;
	6522	struct pf_ruleset *ruleset = NULL;
	6523	struct pf_pdesc pd;
	6524	int off, dirndx;
	6525	#ifdef ALTQ
	6526	int pqid = 0;
	6527	#endif
	6528
	6529	if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
	6530	/* Skip us; continue in ipfw. */
	6531	return (PF_PASS);
	6532	}
	6533
	6534	if (!pf_status.running)
	6535	return (PF_PASS);
	6536
	6537	memset(&pd, 0, sizeof(pd));
	6538	#ifdef foo
	6539	if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
	6540	kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
	6541	else
	6542	#endif
	6543	kif = (struct pfi_kif *)ifp->if_pf_kif;
	6544
	6545	if (kif == NULL) {
	6546	DPFPRINTF(PF_DEBUG_URGENT,
	6547	("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
	6548	return (PF_DROP);
	6549	}
	6550	if (kif->pfik_flags & PFI_IFLAG_SKIP)
	6551	return (PF_PASS);
	6552
	6553	#ifdef DIAGNOSTIC
	6554	if ((m->m_flags & M_PKTHDR) == 0)
	6555	panic("non-M_PKTHDR is passed to pf_test");
	6556	#endif /* DIAGNOSTIC */
	6557
	6558	if (m->m_pkthdr.len < (int)sizeof(*h)) {
	6559	action = PF_DROP;
	6560	REASON_SET(&reason, PFRES_SHORT);
	6561	log = 1;
	6562	goto done;
	6563	}
	6564
	6565	/*
	6566	* DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
	6567	* so make sure pf.flags is clear.
	6568	*/
	6569	if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
	6570	return (PF_PASS);
	6571	m->m_pkthdr.pf.flags = 0;
	6572	/* Re-Check when updating to > 4.4 */
	6573	m->m_pkthdr.pf.statekey = NULL;
	6574
	6575	/* We do IP header normalization and packet reassembly here */
	6576	if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
	6577	action = PF_DROP;
	6578	goto done;
	6579	}
	6580	m = m0; / pf_normalize messes with m0 */
	6581	h = mtod(m, struct ip *);
	6582
	6583	off = h->ip_hl << 2;
	6584	if (off < (int)sizeof(*h)) {
	6585	action = PF_DROP;
	6586	REASON_SET(&reason, PFRES_SHORT);
	6587	log = 1;
	6588	goto done;
	6589	}
	6590
	6591	pd.src = (struct pf_addr *)&h->ip_src;
	6592	pd.dst = (struct pf_addr *)&h->ip_dst;
	6593	pd.sport = pd.dport = NULL;
	6594	pd.ip_sum = &h->ip_sum;
	6595	pd.proto_sum = NULL;
	6596	pd.proto = h->ip_p;
	6597	pd.dir = dir;
	6598	pd.sidx = (dir == PF_IN) ? 0 : 1;
	6599	pd.didx = (dir == PF_IN) ? 1 : 0;
	6600	pd.af = AF_INET;
	6601	pd.tos = h->ip_tos;
	6602	pd.tot_len = ntohs(h->ip_len);
	6603	pd.eh = eh;
	6604
	6605	/* handle fragments that didn't get reassembled by normalization */
	6606	if (h->ip_off & htons(IP_MF \| IP_OFFMASK)) {
	6607	action = pf_test_fragment(&r, dir, kif, m, h,
	6608	&pd, &a, &ruleset);
	6609	goto done;
	6610	}
	6611
	6612	switch (h->ip_p) {
	6613
	6614	case IPPROTO_TCP: {
	6615	struct tcphdr th;
	6616
	6617	pd.hdr.tcp = &th;
	6618	if (!pf_pull_hdr(m, off, &th, sizeof(th),
	6619	&action, &reason, AF_INET)) {
	6620	log = action != PF_PASS;
	6621	goto done;
	6622	}
	6623	pd.p_len = pd.tot_len - off - (th.th_off << 2);
	6624	#ifdef ALTQ
	6625	if ((th.th_flags & TH_ACK) && pd.p_len == 0)
	6626	pqid = 1;
	6627	#endif
	6628	action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
	6629	if (action == PF_DROP)
	6630	goto done;
	6631	action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
	6632	&reason);
	6633	if (action == PF_PASS) {
	6634	r = s->rule.ptr;
	6635	a = s->anchor.ptr;
	6636	log = s->log;
	6637	} else if (s == NULL) {
	6638	action = pf_test_rule(&r, &s, dir, kif,
	6639	m, off, h, &pd, &a,
	6640	&ruleset, NULL, inp);
	6641	}
	6642	break;
	6643	}
	6644
	6645	case IPPROTO_UDP: {
	6646	struct udphdr uh;
	6647
	6648	pd.hdr.udp = &uh;
	6649	if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
	6650	&action, &reason, AF_INET)) {
	6651	log = action != PF_PASS;
	6652	goto done;
	6653	}
	6654	if (uh.uh_dport == 0 \|\|
	6655	ntohs(uh.uh_ulen) > m->m_pkthdr.len - off \|\|
	6656	ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
	6657	action = PF_DROP;
	6658	REASON_SET(&reason, PFRES_SHORT);
	6659	goto done;
	6660	}
	6661	action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
	6662	if (action == PF_PASS) {
	6663	r = s->rule.ptr;
	6664	a = s->anchor.ptr;
	6665	log = s->log;
	6666	} else if (s == NULL) {
	6667	action = pf_test_rule(&r, &s, dir, kif,
	6668	m, off, h, &pd, &a,
	6669	&ruleset, NULL, inp);
	6670	}
	6671	break;
	6672	}
	6673
	6674	case IPPROTO_ICMP: {
	6675	struct icmp ih;
	6676
	6677	pd.hdr.icmp = &ih;
	6678	if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
	6679	&action, &reason, AF_INET)) {
	6680	log = action != PF_PASS;
	6681	goto done;
	6682	}
	6683	action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
	6684	&reason);
	6685	if (action == PF_PASS) {
	6686	r = s->rule.ptr;
	6687	a = s->anchor.ptr;
	6688	log = s->log;
	6689	} else if (s == NULL) {
	6690	action = pf_test_rule(&r, &s, dir, kif,
	6691	m, off, h, &pd, &a,
	6692	&ruleset, NULL, inp);
	6693	}
	6694	break;
	6695	}
	6696
	6697	default:
	6698	action = pf_test_state_other(&s, dir, kif, m, &pd);
	6699	if (action == PF_PASS) {
	6700	r = s->rule.ptr;
	6701	a = s->anchor.ptr;
	6702	log = s->log;
	6703	} else if (s == NULL) {
	6704	action = pf_test_rule(&r, &s, dir, kif, m, off, h,
	6705	&pd, &a, &ruleset, NULL, inp);
	6706	}
	6707	break;
	6708	}
	6709
	6710	done:
	6711	if (action == PF_PASS && h->ip_hl > 5 &&
	6712	!((s && s->state_flags & PFSTATE_ALLOWOPTS) \|\| r->allow_opts)) {
	6713	action = PF_DROP;
	6714	REASON_SET(&reason, PFRES_IPOPTIONS);
	6715	log = 1;
	6716	DPFPRINTF(PF_DEBUG_MISC,
	6717	("pf: dropping packet with ip options\n"));
	6718	}
	6719
	6720	if ((s && s->tag) \|\| r->rtableid)
	6721	pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
	6722
	6723	#if 0
	6724	if (dir == PF_IN && s && s->key[PF_SK_STACK])
	6725	m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
	6726	#endif
	6727
	6728	#ifdef ALTQ
	6729	/*
	6730	* Generate a hash code and qid request for ALTQ. A qid of 0
	6731	* is allowed and will cause altq to select the default queue.
	6732	*/
	6733	if (action == PF_PASS) {
	6734	m->m_pkthdr.fw_flags \|= PF_MBUF_STRUCTURE;
	6735	if (pqid \|\| (pd.tos & IPTOS_LOWDELAY))
	6736	m->m_pkthdr.pf.qid = r->pqid;
	6737	else
	6738	m->m_pkthdr.pf.qid = r->qid;
	6739	m->m_pkthdr.pf.ecn_af = AF_INET;
	6740	m->m_pkthdr.pf.hdr = h;
	6741	/* add connection hash for fairq */
	6742	if (s) {
	6743	/* for fairq */
	6744	m->m_pkthdr.pf.state_hash = s->hash;
	6745	m->m_pkthdr.pf.flags \|= PF_TAG_STATE_HASHED;
	6746	}
	6747	}
	6748	#endif /* ALTQ */
	6749
	6750	/*
	6751	* connections redirected to loopback should not match sockets
	6752	* bound specifically to loopback due to security implications,
	6753	* see tcp_input() and in_pcblookup_listen().
	6754	*/
	6755	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP \|\|
	6756	pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
	6757	(s->nat_rule.ptr->action == PF_RDR \|\|
	6758	s->nat_rule.ptr->action == PF_BINAT) &&
	6759	(ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
	6760	{
	6761	m->m_pkthdr.pf.flags \|= PF_TAG_TRANSLATE_LOCALHOST;
	6762	}
	6763
	6764	if (dir == PF_IN && action == PF_PASS && r->divert.port) {
	6765	struct pf_divert *divert;
	6766
	6767	if ((divert = pf_get_divert(m))) {
	6768	m->m_pkthdr.pf.flags \|= PF_TAG_DIVERTED;
	6769	divert->port = r->divert.port;
	6770	divert->addr.ipv4 = r->divert.addr.v4;
	6771	}
	6772	}
	6773
	6774	if (log) {
	6775	struct pf_rule *lr;
	6776
	6777	if (s != NULL && s->nat_rule.ptr != NULL &&
	6778	s->nat_rule.ptr->log & PF_LOG_ALL)
	6779	lr = s->nat_rule.ptr;
	6780	else
	6781	lr = r;
	6782	PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset,
	6783	&pd);
	6784	}
	6785
	6786	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
	6787	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
	6788
	6789	if (action == PF_PASS \|\| r->action == PF_DROP) {
	6790	dirndx = (dir == PF_OUT);
	6791	r->packets[dirndx]++;
	6792	r->bytes[dirndx] += pd.tot_len;
	6793	if (a != NULL) {
	6794	a->packets[dirndx]++;
	6795	a->bytes[dirndx] += pd.tot_len;
	6796	}
	6797	if (s != NULL) {
	6798	if (s->nat_rule.ptr != NULL) {
	6799	s->nat_rule.ptr->packets[dirndx]++;
	6800	s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
	6801	}
	6802	if (s->src_node != NULL) {
	6803	s->src_node->packets[dirndx]++;
	6804	s->src_node->bytes[dirndx] += pd.tot_len;
	6805	}
	6806	if (s->nat_src_node != NULL) {
	6807	s->nat_src_node->packets[dirndx]++;
	6808	s->nat_src_node->bytes[dirndx] += pd.tot_len;
	6809	}
	6810	dirndx = (dir == s->direction) ? 0 : 1;
	6811	s->packets[dirndx]++;
	6812	s->bytes[dirndx] += pd.tot_len;
	6813	}
	6814	tr = r;
	6815	nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
	6816	if (nr != NULL && r == &pf_default_rule)
	6817	tr = nr;
	6818	if (tr->src.addr.type == PF_ADDR_TABLE)
	6819	pfr_update_stats(tr->src.addr.p.tbl,
	6820	(s == NULL) ? pd.src :
	6821	&s->key[(s->direction == PF_IN)]->
	6822	addr[(s->direction == PF_OUT)],
	6823	pd.af, pd.tot_len, dir == PF_OUT,
	6824	r->action == PF_PASS, tr->src.neg);
	6825	if (tr->dst.addr.type == PF_ADDR_TABLE)
	6826	pfr_update_stats(tr->dst.addr.p.tbl,
	6827	(s == NULL) ? pd.dst :
	6828	&s->key[(s->direction == PF_IN)]->
	6829	addr[(s->direction == PF_IN)],
	6830	pd.af, pd.tot_len, dir == PF_OUT,
	6831	r->action == PF_PASS, tr->dst.neg);
	6832	}
	6833
	6834
	6835	if (action == PF_SYNPROXY_DROP) {
	6836	m_freem(*m0);
	6837	*m0 = NULL;
	6838	action = PF_PASS;
	6839	} else if (r->rt) {
	6840	/* pf_route can free the mbuf causing m0 to become NULL /
	6841	pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
	6842	}
	6843
	6844	return (action);
	6845	}
	6846	#endif /* INET */
	6847
	6848	#ifdef INET6
	6849
	6850	/*
	6851	* WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
	6852	*/
	6853	int
	6854	pf_test6(int dir, struct ifnet ifp, struct mbuf *m0,
	6855	struct ether_header eh, struct inpcb inp)
	6856	{
	6857	struct pfi_kif *kif;
	6858	u_short action, reason = 0, log = 0;
	6859	struct mbuf m = m0, *n = NULL;
	6860	struct ip6_hdr *h = NULL;
	6861	struct pf_rule a = NULL, r = &pf_default_rule, tr, nr;
	6862	struct pf_state *s = NULL;
	6863	struct pf_ruleset *ruleset = NULL;
	6864	struct pf_pdesc pd;
	6865	int off, terminal = 0, dirndx, rh_cnt = 0;
	6866
	6867	if (!pf_status.running)
	6868	return (PF_PASS);
	6869
	6870	memset(&pd, 0, sizeof(pd));
	6871	#ifdef foo
	6872	if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
	6873	kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
	6874	else
	6875	#endif
	6876	kif = (struct pfi_kif *)ifp->if_pf_kif;
	6877
	6878	if (kif == NULL) {
	6879	DPFPRINTF(PF_DEBUG_URGENT,
	6880	("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
	6881	return (PF_DROP);
	6882	}
	6883	if (kif->pfik_flags & PFI_IFLAG_SKIP)
	6884	return (PF_PASS);
	6885
	6886	#ifdef DIAGNOSTIC
	6887	if ((m->m_flags & M_PKTHDR) == 0)
	6888	panic("non-M_PKTHDR is passed to pf_test6");
	6889	#endif /* DIAGNOSTIC */
	6890
	6891	if (m->m_pkthdr.len < (int)sizeof(*h)) {
	6892	action = PF_DROP;
	6893	REASON_SET(&reason, PFRES_SHORT);
	6894	log = 1;
	6895	goto done;
	6896	}
	6897
	6898	/*
	6899	* DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
	6900	* so make sure pf.flags is clear.
	6901	*/
	6902	if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
	6903	return (PF_PASS);
	6904	m->m_pkthdr.pf.flags = 0;
	6905	/* Re-Check when updating to > 4.4 */
	6906	m->m_pkthdr.pf.statekey = NULL;
	6907
	6908	/* We do IP header normalization and packet reassembly here */
	6909	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
	6910	action = PF_DROP;
	6911	goto done;
	6912	}
	6913	m = m0; / pf_normalize messes with m0 */
	6914	h = mtod(m, struct ip6_hdr *);
	6915
	6916	#if 1
	6917	/*
	6918	* we do not support jumbogram yet. if we keep going, zero ip6_plen
	6919	* will do something bad, so drop the packet for now.
	6920	*/
	6921	if (htons(h->ip6_plen) == 0) {
	6922	action = PF_DROP;
	6923	REASON_SET(&reason, PFRES_NORM); /XXX/
	6924	goto done;
	6925	}
	6926	#endif
	6927
	6928	pd.src = (struct pf_addr *)&h->ip6_src;
	6929	pd.dst = (struct pf_addr *)&h->ip6_dst;
	6930	pd.sport = pd.dport = NULL;
	6931	pd.ip_sum = NULL;
	6932	pd.proto_sum = NULL;
	6933	pd.dir = dir;
	6934	pd.sidx = (dir == PF_IN) ? 0 : 1;
	6935	pd.didx = (dir == PF_IN) ? 1 : 0;
	6936	pd.af = AF_INET6;
	6937	pd.tos = 0;
	6938	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
	6939	pd.eh = eh;
	6940
	6941	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
	6942	pd.proto = h->ip6_nxt;
	6943	do {
	6944	switch (pd.proto) {
	6945	case IPPROTO_FRAGMENT:
	6946	action = pf_test_fragment(&r, dir, kif, m, h,
	6947	&pd, &a, &ruleset);
	6948	if (action == PF_DROP)
	6949	REASON_SET(&reason, PFRES_FRAG);
	6950	goto done;
	6951	case IPPROTO_ROUTING: {
	6952	struct ip6_rthdr rthdr;
	6953
	6954	if (rh_cnt++) {
	6955	DPFPRINTF(PF_DEBUG_MISC,
	6956	("pf: IPv6 more than one rthdr\n"));
	6957	action = PF_DROP;
	6958	REASON_SET(&reason, PFRES_IPOPTIONS);
	6959	log = 1;
	6960	goto done;
	6961	}
	6962	if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
	6963	&reason, pd.af)) {
	6964	DPFPRINTF(PF_DEBUG_MISC,
	6965	("pf: IPv6 short rthdr\n"));
	6966	action = PF_DROP;
	6967	REASON_SET(&reason, PFRES_SHORT);
	6968	log = 1;
	6969	goto done;
	6970	}
	6971	if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
	6972	DPFPRINTF(PF_DEBUG_MISC,
	6973	("pf: IPv6 rthdr0\n"));
	6974	action = PF_DROP;
	6975	REASON_SET(&reason, PFRES_IPOPTIONS);
	6976	log = 1;
	6977	goto done;
	6978	}
	6979	/* FALLTHROUGH */
	6980	}
	6981	case IPPROTO_AH:
	6982	case IPPROTO_HOPOPTS:
	6983	case IPPROTO_DSTOPTS: {
	6984	/* get next header and header length */
	6985	struct ip6_ext opt6;
	6986
	6987	if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
	6988	NULL, &reason, pd.af)) {
	6989	DPFPRINTF(PF_DEBUG_MISC,
	6990	("pf: IPv6 short opt\n"));
	6991	action = PF_DROP;
	6992	log = 1;
	6993	goto done;
	6994	}
	6995	if (pd.proto == IPPROTO_AH)
	6996	off += (opt6.ip6e_len + 2) * 4;
	6997	else
	6998	off += (opt6.ip6e_len + 1) * 8;
	6999	pd.proto = opt6.ip6e_nxt;
	7000	/* goto the next header */
	7001	break;
	7002	}
	7003	default:
	7004	terminal++;
	7005	break;
	7006	}
	7007	} while (!terminal);
	7008
	7009	/* if there's no routing header, use unmodified mbuf for checksumming */
	7010	if (!n)
	7011	n = m;
	7012
	7013	switch (pd.proto) {
	7014
	7015	case IPPROTO_TCP: {
	7016	struct tcphdr th;
	7017
	7018	pd.hdr.tcp = &th;
	7019	if (!pf_pull_hdr(m, off, &th, sizeof(th),
	7020	&action, &reason, AF_INET6)) {
	7021	log = action != PF_PASS;
	7022	goto done;
	7023	}
	7024	pd.p_len = pd.tot_len - off - (th.th_off << 2);
	7025	action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
	7026	if (action == PF_DROP)
	7027	goto done;
	7028	action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
	7029	&reason);
	7030	if (action == PF_PASS) {
	7031	r = s->rule.ptr;
	7032	a = s->anchor.ptr;
	7033	log = s->log;
	7034	} else if (s == NULL) {
	7035	action = pf_test_rule(&r, &s, dir, kif,
	7036	m, off, h, &pd, &a,
	7037	&ruleset, NULL, inp);
	7038	}
	7039	break;
	7040	}
	7041
	7042	case IPPROTO_UDP: {
	7043	struct udphdr uh;
	7044
	7045	pd.hdr.udp = &uh;
	7046	if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
	7047	&action, &reason, AF_INET6)) {
	7048	log = action != PF_PASS;
	7049	goto done;
	7050	}
	7051	if (uh.uh_dport == 0 \|\|
	7052	ntohs(uh.uh_ulen) > m->m_pkthdr.len - off \|\|
	7053	ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
	7054	action = PF_DROP;
	7055	REASON_SET(&reason, PFRES_SHORT);
	7056	goto done;
	7057	}
	7058	action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
	7059	if (action == PF_PASS) {
	7060	r = s->rule.ptr;
	7061	a = s->anchor.ptr;
	7062	log = s->log;
	7063	} else if (s == NULL) {
	7064	action = pf_test_rule(&r, &s, dir, kif,
	7065	m, off, h, &pd, &a,
	7066	&ruleset, NULL, inp);
	7067	}
	7068	break;
	7069	}
	7070
	7071	case IPPROTO_ICMPV6: {
	7072	struct icmp6_hdr ih;
	7073
	7074	pd.hdr.icmp6 = &ih;
	7075	if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
	7076	&action, &reason, AF_INET6)) {
	7077	log = action != PF_PASS;
	7078	goto done;
	7079	}
	7080	action = pf_test_state_icmp(&s, dir, kif,
	7081	m, off, h, &pd, &reason);
	7082	if (action == PF_PASS) {
	7083	r = s->rule.ptr;
	7084	a = s->anchor.ptr;
	7085	log = s->log;
	7086	} else if (s == NULL) {
	7087	action = pf_test_rule(&r, &s, dir, kif,
	7088	m, off, h, &pd, &a,
	7089	&ruleset, NULL, inp);
	7090	}
	7091	break;
	7092	}
	7093
	7094	default:
	7095	action = pf_test_state_other(&s, dir, kif, m, &pd);
	7096	if (action == PF_PASS) {
	7097	r = s->rule.ptr;
	7098	a = s->anchor.ptr;
	7099	log = s->log;
	7100	} else if (s == NULL) {
	7101	action = pf_test_rule(&r, &s, dir, kif, m, off, h,
	7102	&pd, &a, &ruleset, NULL, inp);
	7103	}
	7104	break;
	7105	}
	7106
	7107	done:
	7108	if (n != m) {
	7109	m_freem(n);
	7110	n = NULL;
	7111	}
	7112
	7113	/* handle dangerous IPv6 extension headers. */
	7114	if (action == PF_PASS && rh_cnt &&
	7115	!((s && s->state_flags & PFSTATE_ALLOWOPTS) \|\| r->allow_opts)) {
	7116	action = PF_DROP;
	7117	REASON_SET(&reason, PFRES_IPOPTIONS);
	7118	log = 1;
	7119	DPFPRINTF(PF_DEBUG_MISC,
	7120	("pf: dropping packet with dangerous v6 headers\n"));
	7121	}
	7122
	7123	if ((s && s->tag) \|\| r->rtableid)
	7124	pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
	7125
	7126	#if 0
	7127	if (dir == PF_IN && s && s->key[PF_SK_STACK])
	7128	m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
	7129	#endif
	7130
	7131	#ifdef ALTQ
	7132	/*
	7133	* Generate a hash code and qid request for ALTQ. A qid of 0
	7134	* is allowed and will cause altq to select the default queue.
	7135	*/
	7136	if (action == PF_PASS) {
	7137	m->m_pkthdr.fw_flags \|= PF_MBUF_STRUCTURE;
	7138	if (pd.tos & IPTOS_LOWDELAY)
	7139	m->m_pkthdr.pf.qid = r->pqid;
	7140	else
	7141	m->m_pkthdr.pf.qid = r->qid;
	7142	m->m_pkthdr.pf.ecn_af = AF_INET6;
	7143	m->m_pkthdr.pf.hdr = h;
	7144	if (s) {
	7145	/* for fairq */
	7146	m->m_pkthdr.pf.state_hash = s->hash;
	7147	m->m_pkthdr.pf.flags \|= PF_TAG_STATE_HASHED;
	7148	}
	7149	}
	7150	#endif /* ALTQ */
	7151
	7152	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP \|\|
	7153	pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
	7154	(s->nat_rule.ptr->action == PF_RDR \|\|
	7155	s->nat_rule.ptr->action == PF_BINAT) &&
	7156	IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
	7157	{
	7158	m->m_pkthdr.pf.flags \|= PF_TAG_TRANSLATE_LOCALHOST;
	7159	}
	7160
	7161	if (dir == PF_IN && action == PF_PASS && r->divert.port) {
	7162	struct pf_divert *divert;
	7163
	7164	if ((divert = pf_get_divert(m))) {
	7165	m->m_pkthdr.pf.flags \|= PF_TAG_DIVERTED;
	7166	divert->port = r->divert.port;
	7167	divert->addr.ipv6 = r->divert.addr.v6;
	7168	}
	7169	}
	7170
	7171	if (log) {
	7172	struct pf_rule *lr;
	7173
	7174	if (s != NULL && s->nat_rule.ptr != NULL &&
	7175	s->nat_rule.ptr->log & PF_LOG_ALL)
	7176	lr = s->nat_rule.ptr;
	7177	else
	7178	lr = r;
	7179	PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset,
	7180	&pd);
	7181	}
	7182
	7183	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
	7184	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
	7185
	7186	if (action == PF_PASS \|\| r->action == PF_DROP) {
	7187	dirndx = (dir == PF_OUT);
	7188	r->packets[dirndx]++;
	7189	r->bytes[dirndx] += pd.tot_len;
	7190	if (a != NULL) {
	7191	a->packets[dirndx]++;
	7192	a->bytes[dirndx] += pd.tot_len;
	7193	}
	7194	if (s != NULL) {
	7195	if (s->nat_rule.ptr != NULL) {
	7196	s->nat_rule.ptr->packets[dirndx]++;
	7197	s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
	7198	}
	7199	if (s->src_node != NULL) {
	7200	s->src_node->packets[dirndx]++;
	7201	s->src_node->bytes[dirndx] += pd.tot_len;
	7202	}
	7203	if (s->nat_src_node != NULL) {
	7204	s->nat_src_node->packets[dirndx]++;
	7205	s->nat_src_node->bytes[dirndx] += pd.tot_len;
	7206	}
	7207	dirndx = (dir == s->direction) ? 0 : 1;
	7208	s->packets[dirndx]++;
	7209	s->bytes[dirndx] += pd.tot_len;
	7210	}
	7211	tr = r;
	7212	nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
	7213	if (nr != NULL && r == &pf_default_rule)
	7214	tr = nr;
	7215	if (tr->src.addr.type == PF_ADDR_TABLE)
	7216	pfr_update_stats(tr->src.addr.p.tbl,
	7217	(s == NULL) ? pd.src :
	7218	&s->key[(s->direction == PF_IN)]->addr[0],
	7219	pd.af, pd.tot_len, dir == PF_OUT,
	7220	r->action == PF_PASS, tr->src.neg);
	7221	if (tr->dst.addr.type == PF_ADDR_TABLE)
	7222	pfr_update_stats(tr->dst.addr.p.tbl,
	7223	(s == NULL) ? pd.dst :
	7224	&s->key[(s->direction == PF_IN)]->addr[1],
	7225	pd.af, pd.tot_len, dir == PF_OUT,
	7226	r->action == PF_PASS, tr->dst.neg);
	7227	}
	7228
	7229
	7230	if (action == PF_SYNPROXY_DROP) {
	7231	m_freem(*m0);
	7232	*m0 = NULL;
	7233	action = PF_PASS;
	7234	} else if (r->rt)
	7235	/* pf_route6 can free the mbuf causing m0 to become NULL /
	7236	pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
	7237
	7238	return (action);
	7239	}
	7240	#endif /* INET6 */
	7241
	7242	int
	7243	pf_check_congestion(struct ifqueue *ifq)
	7244	{
	7245	return (0);
	7246	}