Merge branch 'vendor/OPENSSL'
[dragonfly.git] / sys / net / pf / pf.c
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * Copyright (c) 2001 Daniel Hartmeier
5  * Copyright (c) 2002 - 2008 Henning Brauer
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  *    - Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    - Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following
16  *      disclaimer in the documentation and/or other materials provided
17  *      with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * Effort sponsored in part by the Defense Advanced Research Projects
33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35  *
36  */
37
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/filio.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/kernel.h>
49 #include <sys/time.h>
50 #include <sys/sysctl.h>
51 #include <sys/endian.h>
52 #include <sys/proc.h>
53 #include <sys/kthread.h>
54 #include <sys/spinlock.h>
55
56 #include <machine/inttypes.h>
57
58 #include <sys/md5.h>
59
60 #include <net/if.h>
61 #include <net/if_types.h>
62 #include <net/bpf.h>
63 #include <net/netisr2.h>
64 #include <net/route.h>
65
66 #include <netinet/in.h>
67 #include <netinet/in_var.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/ip.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/udp.h>
74 #include <netinet/ip_icmp.h>
75 #include <netinet/in_pcb.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #include <netinet/udp_var.h>
79 #include <netinet/icmp_var.h>
80 #include <netinet/if_ether.h>
81
82 #include <net/pf/pfvar.h>
83 #include <net/pf/if_pflog.h>
84
85 #include <net/pf/if_pfsync.h>
86
87 #ifdef INET6
88 #include <netinet/ip6.h>
89 #include <netinet/icmp6.h>
90 #include <netinet6/nd6.h>
91 #include <netinet6/ip6_var.h>
92 #include <netinet6/in6_pcb.h>
93 #endif /* INET6 */
94
95 #include <sys/in_cksum.h>
96 #include <sys/ucred.h>
97 #include <machine/limits.h>
98 #include <sys/msgport2.h>
99 #include <sys/spinlock2.h>
100 #include <net/netmsg2.h>
101 #include <net/toeplitz2.h>
102
103 extern int ip_optcopy(struct ip *, struct ip *);
104 extern int debug_pfugidhack;
105
106 /*
107  * pf_token - shared lock for cpu-localized operations,
108  *            exclusive lock otherwise.
109  *
110  * pf_gtoken- exclusive lock used for initialization.
111  *
112  * pf_spin  - only used to atomically fetch and increment stateid
113  *            on 32-bit systems.
114  */
115 struct lwkt_token pf_token = LWKT_TOKEN_INITIALIZER(pf_token);
116 struct lwkt_token pf_gtoken = LWKT_TOKEN_INITIALIZER(pf_gtoken);
117 #if __SIZEOF_LONG__ != 8
118 struct spinlock pf_spin = SPINLOCK_INITIALIZER(pf_spin, "pf_spin");
119 #endif
120
/*
 * Debug print helper: passes the parenthesised argument list (x) to
 * kprintf when pf_status.debug is at least (n).
 *
 * Wrapped in do/while(0) so the expansion is a single statement; a bare
 * "if" here would capture an "else" that follows the macro invocation.
 */
#define DPFPRINTF(n, x)							\
	do {								\
		if (pf_status.debug >= (n))				\
			kprintf x;					\
	} while (0)
122
123 #define FAIL(code)      { error = (code); goto done; }
124
125 /*
126  * Global variables
127  */
128
129 /* mask radix tree */
130 struct radix_node_head  *pf_maskhead;
131
132 /* state tables */
133 struct pf_state_tree     pf_statetbl[MAXCPU+1]; /* incls one global table */
134
135 struct pf_altqqueue      pf_altqs[2];
136 struct pf_palist         pf_pabuf;
137 struct pf_altqqueue     *pf_altqs_active;
138 struct pf_altqqueue     *pf_altqs_inactive;
139 struct pf_status         pf_status;
140
141 u_int32_t                ticket_altqs_active;
142 u_int32_t                ticket_altqs_inactive;
143 int                      altqs_inactive_open;
144 u_int32_t                ticket_pabuf;
145
146 MD5_CTX                  pf_tcp_secret_ctx;
147 u_char                   pf_tcp_secret[16];
148 int                      pf_tcp_secret_init;
149 int                      pf_tcp_iss_off;
150
151 struct pf_anchor_stackframe {
152         struct pf_ruleset                       *rs;
153         struct pf_rule                          *r;
154         struct pf_anchor_node                   *parent;
155         struct pf_anchor                        *child;
156 } pf_anchor_stack[64];
157
158 struct malloc_type       *pf_src_tree_pl, *pf_rule_pl, *pf_pooladdr_pl;
159 struct malloc_type       *pf_state_pl, *pf_state_key_pl, *pf_state_item_pl;
160 struct malloc_type       *pf_altq_pl;
161
162 void                     pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
163
164 void                     pf_init_threshold(struct pf_threshold *, u_int32_t,
165                             u_int32_t);
166 void                     pf_add_threshold(struct pf_threshold *);
167 int                      pf_check_threshold(struct pf_threshold *);
168
169 void                     pf_change_ap(struct pf_addr *, u_int16_t *,
170                             u_int16_t *, u_int16_t *, struct pf_addr *,
171                             u_int16_t, u_int8_t, sa_family_t);
172 int                      pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
173                             struct tcphdr *, struct pf_state_peer *);
174 #ifdef INET6
175 void                     pf_change_a6(struct pf_addr *, u_int16_t *,
176                             struct pf_addr *, u_int8_t);
177 #endif /* INET6 */
178 void                     pf_change_icmp(struct pf_addr *, u_int16_t *,
179                             struct pf_addr *, struct pf_addr *, u_int16_t,
180                             u_int16_t *, u_int16_t *, u_int16_t *,
181                             u_int16_t *, u_int8_t, sa_family_t);
182 void                     pf_send_tcp(const struct pf_rule *, sa_family_t,
183                             const struct pf_addr *, const struct pf_addr *,
184                             u_int16_t, u_int16_t, u_int32_t, u_int32_t,
185                             u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
186                             u_int16_t, struct ether_header *, struct ifnet *);
187 void                     pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
188                             sa_family_t, struct pf_rule *);
189 struct pf_rule          *pf_match_translation(struct pf_pdesc *, struct mbuf *,
190                             int, int, struct pfi_kif *,
191                             struct pf_addr *, u_int16_t, struct pf_addr *,
192                             u_int16_t, int);
193 struct pf_rule          *pf_get_translation(struct pf_pdesc *, struct mbuf *,
194                             int, int, struct pfi_kif *, struct pf_src_node **,
195                             struct pf_state_key **, struct pf_state_key **,
196                             struct pf_state_key **, struct pf_state_key **,
197                             struct pf_addr *, struct pf_addr *,
198                             u_int16_t, u_int16_t);
199 void                     pf_detach_state(struct pf_state *);
200 int                      pf_state_key_setup(struct pf_pdesc *, struct pf_rule *,
201                             struct pf_state_key **, struct pf_state_key **,
202                             struct pf_state_key **, struct pf_state_key **,
203                             struct pf_addr *, struct pf_addr *,
204                             u_int16_t, u_int16_t);
205 void                     pf_state_key_detach(struct pf_state *, int);
206 u_int32_t                pf_tcp_iss(struct pf_pdesc *);
207 int                      pf_test_rule(struct pf_rule **, struct pf_state **,
208                             int, struct pfi_kif *, struct mbuf *, int,
209                             void *, struct pf_pdesc *, struct pf_rule **,
210                             struct pf_ruleset **, struct ifqueue *, struct inpcb *);
211 static __inline int      pf_create_state(struct pf_rule *, struct pf_rule *,
212                             struct pf_rule *, struct pf_pdesc *,
213                             struct pf_src_node *, struct pf_state_key *,
214                             struct pf_state_key *, struct pf_state_key *,
215                             struct pf_state_key *, struct mbuf *, int,
216                             u_int16_t, u_int16_t, int *, struct pfi_kif *,
217                             struct pf_state **, int, u_int16_t, u_int16_t,
218                             int);
219 int                      pf_test_fragment(struct pf_rule **, int,
220                             struct pfi_kif *, struct mbuf *, void *,
221                             struct pf_pdesc *, struct pf_rule **,
222                             struct pf_ruleset **);
223 int                      pf_tcp_track_full(struct pf_state_peer *,
224                             struct pf_state_peer *, struct pf_state **,
225                             struct pfi_kif *, struct mbuf *, int,
226                             struct pf_pdesc *, u_short *, int *);
227 int                     pf_tcp_track_sloppy(struct pf_state_peer *,
228                             struct pf_state_peer *, struct pf_state **,
229                             struct pf_pdesc *, u_short *);
230 int                      pf_test_state_tcp(struct pf_state **, int,
231                             struct pfi_kif *, struct mbuf *, int,
232                             void *, struct pf_pdesc *, u_short *);
233 int                      pf_test_state_udp(struct pf_state **, int,
234                             struct pfi_kif *, struct mbuf *, int,
235                             void *, struct pf_pdesc *);
236 int                      pf_test_state_icmp(struct pf_state **, int,
237                             struct pfi_kif *, struct mbuf *, int,
238                             void *, struct pf_pdesc *, u_short *);
239 int                      pf_test_state_other(struct pf_state **, int,
240                             struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
241 void                     pf_step_into_anchor(int *, struct pf_ruleset **, int,
242                             struct pf_rule **, struct pf_rule **, int *);
243 int                      pf_step_out_of_anchor(int *, struct pf_ruleset **,
244                              int, struct pf_rule **, struct pf_rule **,
245                              int *);
246 void                     pf_hash(struct pf_addr *, struct pf_addr *,
247                             struct pf_poolhashkey *, sa_family_t);
248 int                      pf_map_addr(u_int8_t, struct pf_rule *,
249                             struct pf_addr *, struct pf_addr *,
250                             struct pf_addr *, struct pf_src_node **);
251 int                      pf_get_sport(struct pf_pdesc *,
252                             sa_family_t, u_int8_t, struct pf_rule *,
253                             struct pf_addr *, struct pf_addr *,
254                             u_int16_t, u_int16_t,
255                             struct pf_addr *, u_int16_t *,
256                             u_int16_t, u_int16_t,
257                             struct pf_src_node **);
258 void                     pf_route(struct mbuf **, struct pf_rule *, int,
259                             struct ifnet *, struct pf_state *,
260                             struct pf_pdesc *);
261 void                     pf_route6(struct mbuf **, struct pf_rule *, int,
262                             struct ifnet *, struct pf_state *,
263                             struct pf_pdesc *);
264 u_int8_t                 pf_get_wscale(struct mbuf *, int, u_int16_t,
265                             sa_family_t);
266 u_int16_t                pf_get_mss(struct mbuf *, int, u_int16_t,
267                             sa_family_t);
268 u_int16_t                pf_calc_mss(struct pf_addr *, sa_family_t,
269                                 u_int16_t);
270 void                     pf_set_rt_ifp(struct pf_state *,
271                             struct pf_addr *);
272 int                      pf_check_proto_cksum(struct mbuf *, int, int,
273                             u_int8_t, sa_family_t);
274 struct pf_divert        *pf_get_divert(struct mbuf *);
275 void                     pf_print_state_parts(struct pf_state *,
276                             struct pf_state_key *, struct pf_state_key *);
277 int                      pf_addr_wrap_neq(struct pf_addr_wrap *,
278                             struct pf_addr_wrap *);
279 struct pf_state         *pf_find_state(struct pfi_kif *,
280                             struct pf_state_key_cmp *, u_int, struct mbuf *);
281 int                      pf_src_connlimit(struct pf_state *);
282 int                      pf_check_congestion(struct ifqueue *);
283
284 extern int pf_end_threads;
285
286 struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
287         { &pf_state_pl, PFSTATE_HIWAT },
288         { &pf_src_tree_pl, PFSNODE_HIWAT },
289         { &pf_frent_pl, PFFRAG_FRENT_HIWAT },
290         { &pfr_ktable_pl, PFR_KTABLE_HIWAT },
291         { &pfr_kentry_pl, PFR_KENTRY_HIWAT }
292 };
293
/*
 * Look up the state matching key (k) for direction (d) on interface (i),
 * passing mbuf (m) to pf_find_state(), and store the result in (s).
 *
 * The expansion may return from the ENCLOSING function:
 *   - PF_DROP when no state is found or the state is already marked
 *     PFTM_PURGE;
 *   - PF_PASS, with no further processing, when (d) is PF_OUT, the state's
 *     rule is either route-to with rule direction out or reply-to with
 *     rule direction in, and rt_kif is assigned and differs from (i).
 *
 * (s) must be an lvalue.  Every parameter is parenthesised so expression
 * arguments expand safely; (s), (d) and (i) are evaluated more than once.
 */
#define STATE_LOOKUP(i, k, d, s, m)					\
	do {								\
		(s) = pf_find_state((i), (k), (d), (m));		\
		if ((s) == NULL || (s)->timeout == PFTM_PURGE)		\
			return (PF_DROP);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)
314
/*
 * Yields (k) when rule (r) has PFRULE_IFBOUND set in rule_flag, otherwise
 * pfi_all.  The whole conditional is parenthesised so the macro can be
 * used inside a larger expression.
 */
#define BOUND_IFACE(r, k) \
	(((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all)
317
/*
 * Account for a new state (s) on its rule and, when present, on its
 * anchor and nat rule: states_cur is bumped with atomic_add_int() and
 * states_tot with a plain increment.
 *
 * (s) is parenthesised so pointer expressions are accepted; it is
 * evaluated several times, so it must be free of side effects.
 */
#define STATE_INC_COUNTERS(s)						\
	do {								\
		atomic_add_int(&(s)->rule.ptr->states_cur, 1);		\
		(s)->rule.ptr->states_tot++;				\
		if ((s)->anchor.ptr != NULL) {				\
			atomic_add_int(&(s)->anchor.ptr->states_cur, 1); \
			(s)->anchor.ptr->states_tot++;			\
		}							\
		if ((s)->nat_rule.ptr != NULL) {			\
			atomic_add_int(&(s)->nat_rule.ptr->states_cur, 1); \
			(s)->nat_rule.ptr->states_tot++;		\
		}							\
	} while (0)
331
/*
 * Drop the states_cur count for state (s) on its nat rule and anchor
 * (when present) and on its rule, using atomic_add_int() with -1.
 * states_tot is not touched.
 *
 * (s) is parenthesised so pointer expressions are accepted; it is
 * evaluated several times, so it must be free of side effects.
 */
#define STATE_DEC_COUNTERS(s)						\
	do {								\
		if ((s)->nat_rule.ptr != NULL)				\
			atomic_add_int(&(s)->nat_rule.ptr->states_cur, -1); \
		if ((s)->anchor.ptr != NULL)				\
			atomic_add_int(&(s)->anchor.ptr->states_cur, -1); \
		atomic_add_int(&(s)->rule.ptr->states_cur, -1);		\
	} while (0)
340
341 static MALLOC_DEFINE(M_PFSTATEPL, "pfstatepl", "pf state pool list");
342 static MALLOC_DEFINE(M_PFSRCTREEPL, "pfsrctpl", "pf source tree pool list");
343 static MALLOC_DEFINE(M_PFSTATEKEYPL, "pfstatekeypl", "pf state key pool list");
344 static MALLOC_DEFINE(M_PFSTATEITEMPL, "pfstateitempl", "pf state item pool list");
345
346 static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
347 static __inline int pf_state_compare_key(struct pf_state_key *,
348                                 struct pf_state_key *);
349 static __inline int pf_state_compare_rkey(struct pf_state_key *,
350                                 struct pf_state_key *);
351 static __inline int pf_state_compare_id(struct pf_state *,
352                                 struct pf_state *);
353
354 struct pf_src_tree tree_src_tracking[MAXCPU];
355 struct pf_state_tree_id tree_id[MAXCPU];
356 struct pf_state_queue state_list[MAXCPU];
357
358 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
359 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
360 RB_GENERATE(pf_state_rtree, pf_state_key, entry, pf_state_compare_rkey);
361 RB_GENERATE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);
362
363 static __inline int
364 pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
365 {
366         int     diff;
367
368         if (a->rule.ptr > b->rule.ptr)
369                 return (1);
370         if (a->rule.ptr < b->rule.ptr)
371                 return (-1);
372         if ((diff = a->af - b->af) != 0)
373                 return (diff);
374         switch (a->af) {
375 #ifdef INET
376         case AF_INET:
377                 if (a->addr.addr32[0] > b->addr.addr32[0])
378                         return (1);
379                 if (a->addr.addr32[0] < b->addr.addr32[0])
380                         return (-1);
381                 break;
382 #endif /* INET */
383 #ifdef INET6
384         case AF_INET6:
385                 if (a->addr.addr32[3] > b->addr.addr32[3])
386                         return (1);
387                 if (a->addr.addr32[3] < b->addr.addr32[3])
388                         return (-1);
389                 if (a->addr.addr32[2] > b->addr.addr32[2])
390                         return (1);
391                 if (a->addr.addr32[2] < b->addr.addr32[2])
392                         return (-1);
393                 if (a->addr.addr32[1] > b->addr.addr32[1])
394                         return (1);
395                 if (a->addr.addr32[1] < b->addr.addr32[1])
396                         return (-1);
397                 if (a->addr.addr32[0] > b->addr.addr32[0])
398                         return (1);
399                 if (a->addr.addr32[0] < b->addr.addr32[0])
400                         return (-1);
401                 break;
402 #endif /* INET6 */
403         }
404         return (0);
405 }
406
/*
 * Derive a 32-bit hash value from the address of a state key by xoring
 * the pointer shifted right by 6 and by 15 bits.  The pointer is never
 * dereferenced.  A result of 0 is not allowed and is mapped to 1.
 */
u_int32_t
pf_state_hash(struct pf_state_key *sk)
{
	intptr_t	addr = (intptr_t)sk;
	u_int32_t	hash;

	hash = (u_int32_t)((addr >> 6) ^ (addr >> 15));
	return ((hash != 0) ? hash : 1);
}
415
#ifdef INET6
/*
 * Copy an address from src to dst according to the address family:
 * all four 32-bit words for AF_INET6, only word 0 for AF_INET (when
 * INET is configured).  Any other family leaves dst untouched.
 */
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	int	w;

	if (af == AF_INET6) {
		for (w = 0; w < 4; w++)
			dst->addr32[w] = src->addr32[w];
		return;
	}
#ifdef INET
	if (af == AF_INET)
		dst->addr32[0] = src->addr32[0];
#endif /* INET */
}
#endif /* INET6 */
435
436 void
437 pf_init_threshold(struct pf_threshold *threshold,
438     u_int32_t limit, u_int32_t seconds)
439 {
440         threshold->limit = limit * PF_THRESHOLD_MULT;
441         threshold->seconds = seconds;
442         threshold->count = 0;
443         threshold->last = time_second;
444 }
445
446 void
447 pf_add_threshold(struct pf_threshold *threshold)
448 {
449         u_int32_t t = time_second, diff = t - threshold->last;
450
451         if (diff >= threshold->seconds)
452                 threshold->count = 0;
453         else
454                 threshold->count -= threshold->count * diff /
455                     threshold->seconds;
456         threshold->count += PF_THRESHOLD_MULT;
457         threshold->last = t;
458 }
459
460 int
461 pf_check_threshold(struct pf_threshold *threshold)
462 {
463         return (threshold->count > threshold->limit);
464 }
465
466 int
467 pf_src_connlimit(struct pf_state *state)
468 {
469         int bad = 0;
470         int cpu = mycpu->gd_cpuid;
471
472         state->src_node->conn++;
473         state->src.tcp_est = 1;
474         pf_add_threshold(&state->src_node->conn_rate);
475
476         if (state->rule.ptr->max_src_conn &&
477             state->rule.ptr->max_src_conn <
478             state->src_node->conn) {
479                 pf_status.lcounters[LCNT_SRCCONN]++;
480                 bad++;
481         }
482
483         if (state->rule.ptr->max_src_conn_rate.limit &&
484             pf_check_threshold(&state->src_node->conn_rate)) {
485                 pf_status.lcounters[LCNT_SRCCONNRATE]++;
486                 bad++;
487         }
488
489         if (!bad)
490                 return 0;
491
492         if (state->rule.ptr->overload_tbl) {
493                 struct pfr_addr p;
494                 u_int32_t       killed = 0;
495
496                 pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
497                 if (pf_status.debug >= PF_DEBUG_MISC) {
498                         kprintf("pf_src_connlimit: blocking address ");
499                         pf_print_host(&state->src_node->addr, 0,
500                             state->key[PF_SK_WIRE]->af);
501                 }
502
503                 bzero(&p, sizeof(p));
504                 p.pfra_af = state->key[PF_SK_WIRE]->af;
505                 switch (state->key[PF_SK_WIRE]->af) {
506 #ifdef INET
507                 case AF_INET:
508                         p.pfra_net = 32;
509                         p.pfra_ip4addr = state->src_node->addr.v4;
510                         break;
511 #endif /* INET */
512 #ifdef INET6
513                 case AF_INET6:
514                         p.pfra_net = 128;
515                         p.pfra_ip6addr = state->src_node->addr.v6;
516                         break;
517 #endif /* INET6 */
518                 }
519
520                 pfr_insert_kentry(state->rule.ptr->overload_tbl,
521                     &p, time_second);
522
523                 /* kill existing states if that's required. */
524                 if (state->rule.ptr->flush) {
525                         struct pf_state_key *sk;
526                         struct pf_state *st;
527
528                         pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
529                         RB_FOREACH(st, pf_state_tree_id, &tree_id[cpu]) {
530                                 sk = st->key[PF_SK_WIRE];
531                                 /*
532                                  * Kill states from this source.  (Only those
533                                  * from the same rule if PF_FLUSH_GLOBAL is not
534                                  * set).  (Only on current cpu).
535                                  */
536                                 if (sk->af ==
537                                     state->key[PF_SK_WIRE]->af &&
538                                     ((state->direction == PF_OUT &&
539                                     PF_AEQ(&state->src_node->addr,
540                                         &sk->addr[0], sk->af)) ||
541                                     (state->direction == PF_IN &&
542                                     PF_AEQ(&state->src_node->addr,
543                                         &sk->addr[1], sk->af))) &&
544                                     (state->rule.ptr->flush &
545                                     PF_FLUSH_GLOBAL ||
546                                     state->rule.ptr == st->rule.ptr)) {
547                                         st->timeout = PFTM_PURGE;
548                                         st->src.state = st->dst.state =
549                                             TCPS_CLOSED;
550                                         killed++;
551                                 }
552                         }
553                         if (pf_status.debug >= PF_DEBUG_MISC)
554                                 kprintf(", %u states killed", killed);
555                 }
556                 if (pf_status.debug >= PF_DEBUG_MISC)
557                         kprintf("\n");
558         }
559
560         /* kill this state */
561         state->timeout = PFTM_PURGE;
562         state->src.state = state->dst.state = TCPS_CLOSED;
563
564         return 1;
565 }
566
567 int
568 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
569     struct pf_addr *src, sa_family_t af)
570 {
571         struct pf_src_node      k;
572         int cpu = mycpu->gd_cpuid;
573
574         if (*sn == NULL) {
575                 k.af = af;
576                 PF_ACPY(&k.addr, src, af);
577                 if (rule->rule_flag & PFRULE_RULESRCTRACK ||
578                     rule->rpool.opts & PF_POOL_STICKYADDR)
579                         k.rule.ptr = rule;
580                 else
581                         k.rule.ptr = NULL;
582                 pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
583                 *sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
584         }
585         if (*sn == NULL) {
586                 if (!rule->max_src_nodes ||
587                     rule->src_nodes < rule->max_src_nodes)
588                         (*sn) = kmalloc(sizeof(struct pf_src_node),
589                                         M_PFSRCTREEPL, M_NOWAIT|M_ZERO);
590                 else
591                         pf_status.lcounters[LCNT_SRCNODES]++;
592                 if ((*sn) == NULL)
593                         return (-1);
594
595                 pf_init_threshold(&(*sn)->conn_rate,
596                     rule->max_src_conn_rate.limit,
597                     rule->max_src_conn_rate.seconds);
598
599                 (*sn)->af = af;
600                 if (rule->rule_flag & PFRULE_RULESRCTRACK ||
601                     rule->rpool.opts & PF_POOL_STICKYADDR)
602                         (*sn)->rule.ptr = rule;
603                 else
604                         (*sn)->rule.ptr = NULL;
605                 PF_ACPY(&(*sn)->addr, src, af);
606                 if (RB_INSERT(pf_src_tree,
607                     &tree_src_tracking[cpu], *sn) != NULL) {
608                         if (pf_status.debug >= PF_DEBUG_MISC) {
609                                 kprintf("pf: src_tree insert failed: ");
610                                 pf_print_host(&(*sn)->addr, 0, af);
611                                 kprintf("\n");
612                         }
613                         kfree(*sn, M_PFSRCTREEPL);
614                         return (-1);
615                 }
616
617                 /*
618                  * Atomic op required to increment src_nodes in the rule
619                  * because we hold a shared token here (decrements will use
620                  * an exclusive token).
621                  */
622                 (*sn)->creation = time_second;
623                 (*sn)->ruletype = rule->action;
624                 if ((*sn)->rule.ptr != NULL)
625                         atomic_add_int(&(*sn)->rule.ptr->src_nodes, 1);
626                 pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
627                 atomic_add_int(&pf_status.src_nodes, 1);
628         } else {
629                 if (rule->max_src_states &&
630                     (*sn)->states >= rule->max_src_states) {
631                         pf_status.lcounters[LCNT_SRCSTATES]++;
632                         return (-1);
633                 }
634         }
635         return (0);
636 }
637
638 /*
639  * state table (indexed by the pf_state_key structure), normal RBTREE
640  * comparison.
641  */
642 static __inline int
643 pf_state_compare_key(struct pf_state_key *a, struct pf_state_key *b)
644 {
645         int     diff;
646
647         if ((diff = a->proto - b->proto) != 0)
648                 return (diff);
649         if ((diff = a->af - b->af) != 0)
650                 return (diff);
651         switch (a->af) {
652 #ifdef INET
653         case AF_INET:
654                 if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
655                         return (1);
656                 if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
657                         return (-1);
658                 if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
659                         return (1);
660                 if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
661                         return (-1);
662                 break;
663 #endif /* INET */
664 #ifdef INET6
665         case AF_INET6:
666                 if (a->addr[0].addr32[3] > b->addr[0].addr32[3])
667                         return (1);
668                 if (a->addr[0].addr32[3] < b->addr[0].addr32[3])
669                         return (-1);
670                 if (a->addr[1].addr32[3] > b->addr[1].addr32[3])
671                         return (1);
672                 if (a->addr[1].addr32[3] < b->addr[1].addr32[3])
673                         return (-1);
674                 if (a->addr[0].addr32[2] > b->addr[0].addr32[2])
675                         return (1);
676                 if (a->addr[0].addr32[2] < b->addr[0].addr32[2])
677                         return (-1);
678                 if (a->addr[1].addr32[2] > b->addr[1].addr32[2])
679                         return (1);
680                 if (a->addr[1].addr32[2] < b->addr[1].addr32[2])
681                         return (-1);
682                 if (a->addr[0].addr32[1] > b->addr[0].addr32[1])
683                         return (1);
684                 if (a->addr[0].addr32[1] < b->addr[0].addr32[1])
685                         return (-1);
686                 if (a->addr[1].addr32[1] > b->addr[1].addr32[1])
687                         return (1);
688                 if (a->addr[1].addr32[1] < b->addr[1].addr32[1])
689                         return (-1);
690                 if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
691                         return (1);
692                 if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
693                         return (-1);
694                 if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
695                         return (1);
696                 if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
697                         return (-1);
698                 break;
699 #endif /* INET6 */
700         }
701
702         if ((diff = a->port[0] - b->port[0]) != 0)
703                 return (diff);
704         if ((diff = a->port[1] - b->port[1]) != 0)
705                 return (diff);
706
707         return (0);
708 }
709
710 /*
711  * Used for RB_FIND only, compare in the reverse direction.  The
712  * element to be reversed is always (a), since we obviously can't
713  * reverse the state tree depicted by (b).
714  */
715 static __inline int
716 pf_state_compare_rkey(struct pf_state_key *a, struct pf_state_key *b)
717 {
718         int     diff;
719
720         if ((diff = a->proto - b->proto) != 0)
721                 return (diff);
722         if ((diff = a->af - b->af) != 0)
723                 return (diff);
724         switch (a->af) {
725 #ifdef INET
726         case AF_INET:
727                 if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
728                         return (1);
729                 if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
730                         return (-1);
731                 if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
732                         return (1);
733                 if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
734                         return (-1);
735                 break;
736 #endif /* INET */
737 #ifdef INET6
738         case AF_INET6:
739                 if (a->addr[1].addr32[3] > b->addr[0].addr32[3])
740                         return (1);
741                 if (a->addr[1].addr32[3] < b->addr[0].addr32[3])
742                         return (-1);
743                 if (a->addr[0].addr32[3] > b->addr[1].addr32[3])
744                         return (1);
745                 if (a->addr[0].addr32[3] < b->addr[1].addr32[3])
746                         return (-1);
747                 if (a->addr[1].addr32[2] > b->addr[0].addr32[2])
748                         return (1);
749                 if (a->addr[1].addr32[2] < b->addr[0].addr32[2])
750                         return (-1);
751                 if (a->addr[0].addr32[2] > b->addr[1].addr32[2])
752                         return (1);
753                 if (a->addr[0].addr32[2] < b->addr[1].addr32[2])
754                         return (-1);
755                 if (a->addr[1].addr32[1] > b->addr[0].addr32[1])
756                         return (1);
757                 if (a->addr[1].addr32[1] < b->addr[0].addr32[1])
758                         return (-1);
759                 if (a->addr[0].addr32[1] > b->addr[1].addr32[1])
760                         return (1);
761                 if (a->addr[0].addr32[1] < b->addr[1].addr32[1])
762                         return (-1);
763                 if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
764                         return (1);
765                 if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
766                         return (-1);
767                 if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
768                         return (1);
769                 if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
770                         return (-1);
771                 break;
772 #endif /* INET6 */
773         }
774
775         if ((diff = a->port[1] - b->port[0]) != 0)
776                 return (diff);
777         if ((diff = a->port[0] - b->port[1]) != 0)
778                 return (diff);
779
780         return (0);
781 }
782
783 static __inline int
784 pf_state_compare_id(struct pf_state *a, struct pf_state *b)
785 {
786         if (a->id > b->id)
787                 return (1);
788         if (a->id < b->id)
789                 return (-1);
790         if (a->creatorid > b->creatorid)
791                 return (1);
792         if (a->creatorid < b->creatorid)
793                 return (-1);
794
795         return (0);
796 }
797
798 int
799 pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
800 {
801         struct pf_state_item    *si;
802         struct pf_state_key     *cur;
803         int cpu;
804         int error;
805
806         /*
807          * PFSTATE_STACK_GLOBAL is set when the state might not hash to the
808          * current cpu.  The keys are managed on the global statetbl tree
809          * for this case.  Only translations (RDR, NAT) can cause this.
810          *
811          * When this flag is not set we must still check the global statetbl
812          * for a collision, and if we find one we set the HALF_DUPLEX flag
813          * in the state.
814          */
815         if (s->state_flags & PFSTATE_STACK_GLOBAL) {
816                 cpu = MAXCPU;
817                 lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
818         } else {
819                 cpu = mycpu->gd_cpuid;
820                 lockmgr(&pf_global_statetbl_lock, LK_SHARED);
821         }
822         KKASSERT(s->key[idx] == NULL);  /* XXX handle this? */
823
824         if (pf_status.debug >= PF_DEBUG_MISC) {
825                 kprintf("state_key attach cpu %d (%08x:%d) %s (%08x:%d)\n",
826                         cpu,
827                         ntohl(sk->addr[0].addr32[0]), ntohs(sk->port[0]),
828                         (idx == PF_SK_WIRE ? "->" : "<-"),
829                         ntohl(sk->addr[1].addr32[0]), ntohs(sk->port[1]));
830         }
831
832         /*
833          * Check whether (e.g.) a PASS rule being put on a per-cpu tree
834          * collides with a translation rule on the global tree.  This is
835          * NOT an error.  We *WANT* to establish state for this case so the
836          * packet path is short-cutted and doesn't need to scan the ruleset
837          * on every packet.  But the established state will only see one
838          * side of a two-way packet conversation.  To prevent this from
839          * causing problems (e.g. generating a RST), we force PFSTATE_SLOPPY
840          * to be set on the established state.
841          *
842          * A collision against RDR state can only occur with a PASS IN in the
843          * opposite direction or a PASS OUT in the forwards direction.  This
844          * is because RDRs are processed on the input side.
845          *
846          * A collision against NAT state can only occur with a PASS IN in the
847          * forwards direction or a PASS OUT in the opposite direction.  This
848          * is because NATs are processed on the output side.
849          *
850          * In both situations we need to do a reverse addr/port test because
851          * the PASS IN or PASS OUT only establishes if it doesn't match the
852          * established RDR state in the forwards direction.  The direction
853          * flag has to be ignored (it will be one way for a PASS IN and the
854          * other way for a PASS OUT).
855          *
856          * pf_global_statetbl_lock will be locked shared when testing and
857          * not entering into the global state table.
858          */
859         if (cpu != MAXCPU &&
860             (cur = RB_FIND(pf_state_rtree,
861                            (struct pf_state_rtree *)&pf_statetbl[MAXCPU],
862                            sk)) != NULL) {
863                 TAILQ_FOREACH(si, &cur->states, entry) {
864                         /*
865                          * NOTE: We must ignore direction mismatches.
866                          */
867                         if (si->s->kif == s->kif) {
868                                 s->state_flags |= PFSTATE_HALF_DUPLEX |
869                                                   PFSTATE_SLOPPY;
870                                 if (pf_status.debug >= PF_DEBUG_MISC) {
871                                         kprintf(
872                                             "pf: %s key attach collision "
873                                             "on %s: ",
874                                             (idx == PF_SK_WIRE) ?
875                                             "wire" : "stack",
876                                             s->kif->pfik_name);
877                                         pf_print_state_parts(s,
878                                             (idx == PF_SK_WIRE) ? sk : NULL,
879                                             (idx == PF_SK_STACK) ? sk : NULL);
880                                         kprintf("\n");
881                                 }
882                                 break;
883                         }
884                 }
885         }
886
887         /*
888          * Enter into either the per-cpu or the global state table.
889          *
890          * pf_global_statetbl_lock will be locked exclusively when entering
891          * into the global state table.
892          */
893         if ((cur = RB_INSERT(pf_state_tree, &pf_statetbl[cpu], sk)) != NULL) {
894                 /* key exists. check for same kif, if none, add to key */
895                 TAILQ_FOREACH(si, &cur->states, entry) {
896                         if (si->s->kif == s->kif &&
897                             si->s->direction == s->direction) {
898                                 if (pf_status.debug >= PF_DEBUG_MISC) {
899                                         kprintf(
900                                             "pf: %s key attach failed on %s: ",
901                                             (idx == PF_SK_WIRE) ?
902                                             "wire" : "stack",
903                                             s->kif->pfik_name);
904                                         pf_print_state_parts(s,
905                                             (idx == PF_SK_WIRE) ? sk : NULL,
906                                             (idx == PF_SK_STACK) ? sk : NULL);
907                                         kprintf("\n");
908                                 }
909                                 kfree(sk, M_PFSTATEKEYPL);
910                                 error = -1;
911                                 goto failed;    /* collision! */
912                         }
913                 }
914                 kfree(sk, M_PFSTATEKEYPL);
915
916                 s->key[idx] = cur;
917         } else {
918                 s->key[idx] = sk;
919         }
920
921         if ((si = kmalloc(sizeof(struct pf_state_item),
922                           M_PFSTATEITEMPL, M_NOWAIT)) == NULL) {
923                 pf_state_key_detach(s, idx);
924                 error = -1;
925                 goto failed;    /* collision! */
926         }
927         si->s = s;
928
929         /* list is sorted, if-bound states before floating */
930         if (s->kif == pfi_all)
931                 TAILQ_INSERT_TAIL(&s->key[idx]->states, si, entry);
932         else
933                 TAILQ_INSERT_HEAD(&s->key[idx]->states, si, entry);
934
935         error = 0;
936 failed:
937         lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
938         return error;
939 }
940
941 /*
942  * NOTE: Can only be called indirectly via the purge thread with pf_token
943  *       exclusively locked.
944  */
945 void
946 pf_detach_state(struct pf_state *s)
947 {
948         if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
949                 s->key[PF_SK_WIRE] = NULL;
950
951         if (s->key[PF_SK_STACK] != NULL)
952                 pf_state_key_detach(s, PF_SK_STACK);
953
954         if (s->key[PF_SK_WIRE] != NULL)
955                 pf_state_key_detach(s, PF_SK_WIRE);
956 }
957
958 /*
959  * NOTE: Can only be called indirectly via the purge thread with pf_token
960  *       exclusively locked.
961  */
962 void
963 pf_state_key_detach(struct pf_state *s, int idx)
964 {
965         struct pf_state_item    *si;
966         int cpu;
967
968         /*
969          * PFSTATE_STACK_GLOBAL is set for translations when the translated
970          * address/port is not localized to the same cpu that the untranslated
971          * address/port is on.  The wire pf_state_key is managed on the global
972          * statetbl tree for this case.
973          */
974         if (s->state_flags & PFSTATE_STACK_GLOBAL) {
975                 cpu = MAXCPU;
976                 lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
977         } else {
978                 cpu = mycpu->gd_cpuid;
979         }
980
981         si = TAILQ_FIRST(&s->key[idx]->states);
982         while (si && si->s != s)
983                 si = TAILQ_NEXT(si, entry);
984
985         if (si) {
986                 TAILQ_REMOVE(&s->key[idx]->states, si, entry);
987                 kfree(si, M_PFSTATEITEMPL);
988         }
989
990         if (TAILQ_EMPTY(&s->key[idx]->states)) {
991                 RB_REMOVE(pf_state_tree, &pf_statetbl[cpu], s->key[idx]);
992                 if (s->key[idx]->reverse)
993                         s->key[idx]->reverse->reverse = NULL;
994                 if (s->key[idx]->inp)
995                         s->key[idx]->inp->inp_pf_sk = NULL;
996                 kfree(s->key[idx], M_PFSTATEKEYPL);
997         }
998         s->key[idx] = NULL;
999
1000         if (s->state_flags & PFSTATE_STACK_GLOBAL)
1001                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1002 }
1003
1004 struct pf_state_key *
1005 pf_alloc_state_key(int pool_flags)
1006 {
1007         struct pf_state_key     *sk;
1008
1009         sk = kmalloc(sizeof(struct pf_state_key), M_PFSTATEKEYPL, pool_flags);
1010         if (sk) {
1011                 TAILQ_INIT(&sk->states);
1012         }
1013         return (sk);
1014 }
1015
1016 int
1017 pf_state_key_setup(struct pf_pdesc *pd, struct pf_rule *nr,
1018         struct pf_state_key **skw, struct pf_state_key **sks,
1019         struct pf_state_key **skp, struct pf_state_key **nkp,
1020         struct pf_addr *saddr, struct pf_addr *daddr,
1021         u_int16_t sport, u_int16_t dport)
1022 {
1023         KKASSERT((*skp == NULL && *nkp == NULL));
1024
1025         if ((*skp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
1026                 return (ENOMEM);
1027
1028         PF_ACPY(&(*skp)->addr[pd->sidx], saddr, pd->af);
1029         PF_ACPY(&(*skp)->addr[pd->didx], daddr, pd->af);
1030         (*skp)->port[pd->sidx] = sport;
1031         (*skp)->port[pd->didx] = dport;
1032         (*skp)->proto = pd->proto;
1033         (*skp)->af = pd->af;
1034
1035         if (nr != NULL) {
1036                 if ((*nkp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
1037                         return (ENOMEM); /* caller must handle cleanup */
1038
1039                 /* XXX maybe just bcopy and TAILQ_INIT(&(*nkp)->states) */
1040                 PF_ACPY(&(*nkp)->addr[0], &(*skp)->addr[0], pd->af);
1041                 PF_ACPY(&(*nkp)->addr[1], &(*skp)->addr[1], pd->af);
1042                 (*nkp)->port[0] = (*skp)->port[0];
1043                 (*nkp)->port[1] = (*skp)->port[1];
1044                 (*nkp)->proto = pd->proto;
1045                 (*nkp)->af = pd->af;
1046         } else {
1047                 *nkp = *skp;
1048         }
1049
1050         if (pd->dir == PF_IN) {
1051                 *skw = *skp;
1052                 *sks = *nkp;
1053         } else {
1054                 *sks = *skp;
1055                 *skw = *nkp;
1056         }
1057         return (0);
1058 }
1059
1060 /*
1061  * Insert pf_state with one or two state keys (allowing a reverse path lookup
1062  * which is used by NAT).  In the NAT case skw is the initiator (?) and
1063  * sks is the target.
1064  */
1065 int
1066 pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
1067                 struct pf_state_key *sks, struct pf_state *s)
1068 {
1069         int cpu = mycpu->gd_cpuid;
1070
1071         s->kif = kif;
1072         s->cpuid = cpu;
1073
1074         if (skw == sks) {
1075                 if (pf_state_key_attach(skw, s, PF_SK_WIRE))
1076                         return (-1);
1077                 s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
1078         } else {
1079                 /*
1080                 skw->reverse = sks;
1081                 sks->reverse = skw;
1082                 */
1083                 if (pf_state_key_attach(skw, s, PF_SK_WIRE)) {
1084                         kfree(sks, M_PFSTATEKEYPL);
1085                         return (-1);
1086                 }
1087                 if (pf_state_key_attach(sks, s, PF_SK_STACK)) {
1088                         pf_state_key_detach(s, PF_SK_WIRE);
1089                         return (-1);
1090                 }
1091         }
1092
1093         if (s->id == 0 && s->creatorid == 0) {
1094                 u_int64_t sid;
1095
1096 #if __SIZEOF_LONG__ == 8
1097                 sid = atomic_fetchadd_long(&pf_status.stateid, 1);
1098 #else
1099                 spin_lock(&pf_spin);
1100                 sid = pf_status.stateid++;
1101                 spin_unlock(&pf_spin);
1102 #endif
1103                 s->id = htobe64(sid);
1104                 s->creatorid = pf_status.hostid;
1105         }
1106
1107         /*
1108          * Calculate hash code for altq
1109          */
1110         s->hash = crc32(s->key[PF_SK_WIRE], PF_STATE_KEY_HASH_LENGTH);
1111
1112         if (RB_INSERT(pf_state_tree_id, &tree_id[cpu], s) != NULL) {
1113                 if (pf_status.debug >= PF_DEBUG_MISC) {
1114                         kprintf("pf: state insert failed: "
1115                             "id: %016jx creatorid: %08x",
1116                               (uintmax_t)be64toh(s->id), ntohl(s->creatorid));
1117                         if (s->sync_flags & PFSTATE_FROMSYNC)
1118                                 kprintf(" (from sync)");
1119                         kprintf("\n");
1120                 }
1121                 pf_detach_state(s);
1122                 return (-1);
1123         }
1124         TAILQ_INSERT_TAIL(&state_list[cpu], s, entry_list);
1125         pf_status.fcounters[FCNT_STATE_INSERT]++;
1126         atomic_add_int(&pf_status.states, 1);
1127         pfi_kif_ref(kif, PFI_KIF_REF_STATE);
1128         pfsync_insert_state(s);
1129         return (0);
1130 }
1131
1132 struct pf_state *
1133 pf_find_state_byid(struct pf_state_cmp *key)
1134 {
1135         int cpu = mycpu->gd_cpuid;
1136
1137         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1138
1139         return (RB_FIND(pf_state_tree_id, &tree_id[cpu],
1140                         (struct pf_state *)key));
1141 }
1142
1143 /*
1144  * WARNING! May return a state structure that was localized to another cpu,
1145  *          destruction is typically protected by the callers pf_token.
1146  *          The element can only be destroyed
1147  */
1148 struct pf_state *
1149 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir,
1150               struct mbuf *m)
1151 {
1152         struct pf_state_key     *skey = (void *)key;
1153         struct pf_state_key     *sk;
1154         struct pf_state_item    *si;
1155         struct pf_state *s;
1156         int cpu = mycpu->gd_cpuid;
1157         int globalstl = 0;
1158
1159         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1160
1161         if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
1162             ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse) {
1163                 sk = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse;
1164         } else {
1165                 sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
1166                 if (sk == NULL) {
1167                         lockmgr(&pf_global_statetbl_lock, LK_SHARED);
1168                         sk = RB_FIND(pf_state_tree, &pf_statetbl[MAXCPU], skey);
1169                         if (sk == NULL) {
1170                                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1171                                 return (NULL);
1172                         }
1173                         globalstl = 1;
1174                 }
1175                 if (dir == PF_OUT && m->m_pkthdr.pf.statekey) {
1176                         ((struct pf_state_key *)
1177                             m->m_pkthdr.pf.statekey)->reverse = sk;
1178                         sk->reverse = m->m_pkthdr.pf.statekey;
1179                 }
1180         }
1181         if (dir == PF_OUT)
1182                 m->m_pkthdr.pf.statekey = NULL;
1183
1184         /* list is sorted, if-bound states before floating ones */
1185         TAILQ_FOREACH(si, &sk->states, entry) {
1186                 if ((si->s->kif == pfi_all || si->s->kif == kif) &&
1187                     sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
1188                                           si->s->key[PF_SK_STACK])) {
1189                         break;
1190                 }
1191         }
1192
1193         /*
1194          * Extract state before potentially releasing the global statetbl
1195          * lock.  Ignore the state if the create is still in-progress as
1196          * it can be deleted out from under us by the owning localized cpu.
1197          * However, if CREATEINPROG is not set, state can only be deleted
1198          * by the purge thread which we are protected from via our shared
1199          * pf_token.
1200          */
1201         if (si) {
1202                 s = si->s;
1203                 if (s && (s->state_flags & PFSTATE_CREATEINPROG))
1204                         s = NULL;
1205         } else {
1206                 s = NULL;
1207         }
1208         if (globalstl)
1209                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1210         return s;
1211 }
1212
1213 /*
1214  * WARNING! May return a state structure that was localized to another cpu,
1215  *          destruction is typically protected by the callers pf_token.
1216  */
1217 struct pf_state *
1218 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
1219 {
1220         struct pf_state_key     *skey = (void *)key;
1221         struct pf_state_key     *sk;
1222         struct pf_state_item    *si, *ret = NULL;
1223         struct pf_state         *s;
1224         int cpu = mycpu->gd_cpuid;
1225         int globalstl = 0;
1226
1227         pf_status.fcounters[FCNT_STATE_SEARCH]++;
1228
1229         sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
1230         if (sk == NULL) {
1231                 lockmgr(&pf_global_statetbl_lock, LK_SHARED);
1232                 sk = RB_FIND(pf_state_tree, &pf_statetbl[MAXCPU], skey);
1233                 globalstl = 1;
1234         }
1235         if (sk != NULL) {
1236                 TAILQ_FOREACH(si, &sk->states, entry)
1237                         if (dir == PF_INOUT ||
1238                             (sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
1239                             si->s->key[PF_SK_STACK]))) {
1240                                 if (more == NULL) {
1241                                         ret = si;
1242                                         break;
1243                                 }
1244                                 if (ret)
1245                                         (*more)++;
1246                                 else
1247                                         ret = si;
1248                         }
1249         }
1250
1251         /*
1252          * Extract state before potentially releasing the global statetbl
1253          * lock.  Ignore the state if the create is still in-progress as
1254          * it can be deleted out from under us by the owning localized cpu.
1255          * However, if CREATEINPROG is not set, state can only be deleted
1256          * by the purge thread which we are protected from via our shared
1257          * pf_token.
1258          */
1259         if (ret) {
1260                 s = ret->s;
1261                 if (s && (s->state_flags & PFSTATE_CREATEINPROG))
1262                         s = NULL;
1263         } else {
1264                 s = NULL;
1265         }
1266         if (globalstl)
1267                 lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
1268         return s;
1269 }
1270
1271 /* END state table stuff */
1272
1273 void
1274 pf_purge_thread(void *v)
1275 {
1276         globaldata_t save_gd = mycpu;
1277         int nloops = 0;
1278         int locked = 0;
1279         int nn;
1280         int endingit;
1281
1282         for (;;) {
1283                 tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
1284
1285                 endingit = pf_end_threads;
1286
1287                 for (nn = 0; nn < ncpus; ++nn) {
1288                         lwkt_setcpu_self(globaldata_find(nn));
1289
1290                         lwkt_gettoken(&pf_token);
1291                         lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
1292                         crit_enter();
1293
1294                         /*
1295                          * process a fraction of the state table every second
1296                          */
1297                         if(!pf_purge_expired_states(
1298                                 1 + (pf_status.states /
1299                                      pf_default_rule.timeout[
1300                                         PFTM_INTERVAL]), 0)) {
1301                                 pf_purge_expired_states(
1302                                         1 + (pf_status.states /
1303                                              pf_default_rule.timeout[
1304                                                 PFTM_INTERVAL]), 1);
1305                         }
1306
1307                         /*
1308                          * purge other expired types every PFTM_INTERVAL
1309                          * seconds
1310                          */
1311                         if (++nloops >=
1312                             pf_default_rule.timeout[PFTM_INTERVAL]) {
1313                                 pf_purge_expired_fragments();
1314                                 if (!pf_purge_expired_src_nodes(locked)) {
1315                                         pf_purge_expired_src_nodes(1);
1316                                 }
1317                                 nloops = 0;
1318                         }
1319
1320                         /*
1321                          * If terminating the thread, clean everything out
1322                          * (on all cpus).
1323                          */
1324                         if (endingit) {
1325                                 pf_purge_expired_states(pf_status.states, 0);
1326                                 pf_purge_expired_fragments();
1327                                 pf_purge_expired_src_nodes(1);
1328                         }
1329
1330                         crit_exit();
1331                         lockmgr(&pf_consistency_lock, LK_RELEASE);
1332                         lwkt_reltoken(&pf_token);
1333                 }
1334                 lwkt_setcpu_self(save_gd);
1335                 if (endingit)
1336                         break;
1337         }
1338
1339         /*
1340          * Thread termination
1341          */
1342         pf_end_threads++;
1343         wakeup(pf_purge_thread);
1344         kthread_exit();
1345 }
1346
1347 u_int32_t
1348 pf_state_expires(const struct pf_state *state)
1349 {
1350         u_int32_t       timeout;
1351         u_int32_t       start;
1352         u_int32_t       end;
1353         u_int32_t       states;
1354
1355         /* handle all PFTM_* > PFTM_MAX here */
1356         if (state->timeout == PFTM_PURGE)
1357                 return (time_second);
1358         if (state->timeout == PFTM_UNTIL_PACKET)
1359                 return (0);
1360         KKASSERT(state->timeout != PFTM_UNLINKED);
1361         KKASSERT(state->timeout < PFTM_MAX);
1362         timeout = state->rule.ptr->timeout[state->timeout];
1363         if (!timeout)
1364                 timeout = pf_default_rule.timeout[state->timeout];
1365         start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
1366         if (start) {
1367                 end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
1368                 states = state->rule.ptr->states_cur;
1369         } else {
1370                 start = pf_default_rule.timeout[PFTM_ADAPTIVE_START];
1371                 end = pf_default_rule.timeout[PFTM_ADAPTIVE_END];
1372                 states = pf_status.states;
1373         }
1374         if (end && states > start && start < end) {
1375                 if (states < end)
1376                         return (state->expire + timeout * (end - states) /
1377                             (end - start));
1378                 else
1379                         return (time_second);
1380         }
1381         return (state->expire + timeout);
1382 }
1383
1384 /*
1385  * (called with exclusive pf_token)
1386  */
1387 int
1388 pf_purge_expired_src_nodes(int waslocked)
1389 {
1390         struct pf_src_node *cur, *next;
1391         int locked = waslocked;
1392         int cpu = mycpu->gd_cpuid;
1393
1394         for (cur = RB_MIN(pf_src_tree, &tree_src_tracking[cpu]);
1395              cur;
1396              cur = next) {
1397                 next = RB_NEXT(pf_src_tree, &tree_src_tracking[cpu], cur);
1398
1399                 if (cur->states <= 0 && cur->expire <= time_second) {
1400                          if (!locked) {
1401                                  lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
1402                                  next = RB_NEXT(pf_src_tree,
1403                                      &tree_src_tracking[cpu], cur);
1404                                  locked = 1;
1405                          }
1406                          if (cur->rule.ptr != NULL) {
1407                                 /*
1408                                  * decrements in rule should be ok, token is
1409                                  * held exclusively in this code path.
1410                                  */
1411                                  cur->rule.ptr->src_nodes--;
1412                                  if (cur->rule.ptr->states_cur <= 0 &&
1413                                      cur->rule.ptr->max_src_nodes <= 0)
1414                                          pf_rm_rule(NULL, cur->rule.ptr);
1415                          }
1416                          RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], cur);
1417                          pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
1418                          atomic_add_int(&pf_status.src_nodes, -1);
1419                          kfree(cur, M_PFSRCTREEPL);
1420                 }
1421         }
1422         if (locked && !waslocked)
1423                 lockmgr(&pf_consistency_lock, LK_RELEASE);
1424         return(1);
1425 }
1426
1427 void
1428 pf_src_tree_remove_state(struct pf_state *s)
1429 {
1430         u_int32_t timeout;
1431
1432         if (s->src_node != NULL) {
1433                 if (s->src.tcp_est)
1434                         --s->src_node->conn;
1435                 if (--s->src_node->states <= 0) {
1436                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1437                         if (!timeout) {
1438                                 timeout =
1439                                     pf_default_rule.timeout[PFTM_SRC_NODE];
1440                         }
1441                         s->src_node->expire = time_second + timeout;
1442                 }
1443         }
1444         if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
1445                 if (--s->nat_src_node->states <= 0) {
1446                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1447                         if (!timeout)
1448                                 timeout =
1449                                     pf_default_rule.timeout[PFTM_SRC_NODE];
1450                         s->nat_src_node->expire = time_second + timeout;
1451                 }
1452         }
1453         s->src_node = s->nat_src_node = NULL;
1454 }
1455
1456 /* callers should be at crit_enter() */
1457 void
1458 pf_unlink_state(struct pf_state *cur)
1459 {
1460         int cpu = mycpu->gd_cpuid;
1461
1462         if (cur->src.state == PF_TCPS_PROXY_DST) {
1463                 /* XXX wire key the right one? */
1464                 pf_send_tcp(cur->rule.ptr, cur->key[PF_SK_WIRE]->af,
1465                     &cur->key[PF_SK_WIRE]->addr[1],
1466                     &cur->key[PF_SK_WIRE]->addr[0],
1467                     cur->key[PF_SK_WIRE]->port[1],
1468                     cur->key[PF_SK_WIRE]->port[0],
1469                     cur->src.seqhi, cur->src.seqlo + 1,
1470                     TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
1471         }
1472         RB_REMOVE(pf_state_tree_id, &tree_id[cpu], cur);
1473         if (cur->creatorid == pf_status.hostid)
1474                 pfsync_delete_state(cur);
1475         cur->timeout = PFTM_UNLINKED;
1476         pf_src_tree_remove_state(cur);
1477         pf_detach_state(cur);
1478 }
1479
/*
 * Saved scan position of pf_purge_expired_states(), indexed by cpu id.
 * pf_free_state() advances the entry when the state it points at is
 * the one being freed.
 */
static struct pf_state  *purge_cur[MAXCPU];
1481
/*
 * Release an already-unlinked state and the references it holds.
 *
 * callers should be at crit_enter() and hold pf_consistency_lock exclusively.
 * pf_token must also be held exclusively.
 *
 * The state must belong to the current cpu and must have been marked
 * PFTM_UNLINKED (both are asserted).  Nothing is freed while pfsync still
 * references the state as its bulk-send cursor or terminator; the function
 * simply returns in that case.
 */
void
pf_free_state(struct pf_state *cur)
{
        int cpu = mycpu->gd_cpuid;

        KKASSERT(cur->cpuid == cpu);

        /* Still referenced by an in-progress pfsync bulk update: keep it. */
        if (pfsyncif != NULL &&
            (pfsyncif->sc_bulk_send_next == cur ||
            pfsyncif->sc_bulk_terminator == cur))
                return;
        KKASSERT(cur->timeout == PFTM_UNLINKED);
        /*
         * decrements in rule should be ok, token is
         * held exclusively in this code path.
         *
         * A rule is handed to pf_rm_rule() once neither states nor
         * source nodes reference it any more.
         */
        if (--cur->rule.ptr->states_cur <= 0 &&
            cur->rule.ptr->src_nodes <= 0)
                pf_rm_rule(NULL, cur->rule.ptr);
        if (cur->nat_rule.ptr != NULL) {
                if (--cur->nat_rule.ptr->states_cur <= 0 &&
                        cur->nat_rule.ptr->src_nodes <= 0) {
                        pf_rm_rule(NULL, cur->nat_rule.ptr);
                }
        }
        /* The anchor rule is checked against its state count only. */
        if (cur->anchor.ptr != NULL) {
                if (--cur->anchor.ptr->states_cur <= 0)
                        pf_rm_rule(NULL, cur->anchor.ptr);
        }
        pf_normalize_tcp_cleanup(cur);
        pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);

        /*
         * We may be freeing pf_purge_expired_states()'s saved scan entry,
         * adjust it if necessary.
         */
        if (purge_cur[cpu] == cur) {
                kprintf("PURGE CONFLICT\n");
                purge_cur[cpu] = TAILQ_NEXT(purge_cur[cpu], entry_list);
        }
        TAILQ_REMOVE(&state_list[cpu], cur, entry_list);
        if (cur->tag)
                pf_tag_unref(cur->tag);
        kfree(cur, M_PFSTATEPL);
        /* Account for the removal in the global pf statistics. */
        pf_status.fcounters[FCNT_STATE_REMOVALS]++;
        atomic_add_int(&pf_status.states, -1);
}
1533
/*
 * Scan up to `maxcheck' entries of the current cpu's state list, resuming
 * at the position saved in purge_cur[cpu] and wrapping to the head of the
 * list when that position is NULL.  States already marked PFTM_UNLINKED
 * are freed; states whose pf_state_expires() time is <= time_second are
 * unlinked and then freed.
 *
 * `waslocked' seeds the local `locked' flag.  When it is zero,
 * pf_consistency_lock is taken exclusively the first time a state has to
 * be freed.
 *
 * Returns 1 once the scan loop ends.  Returns 0 early from the
 * expired-state path when the lock was not yet held and lockmgr()
 * returned 0.
 *
 * NOTE(review): the two lockmgr(LK_EXCLUSIVE) call sites treat the return
 * value differently (ignored vs. "0 means bail out").  If lockmgr()
 * returns 0 on success, the second site returns 0 with the lock held and
 * the state unlinked but not yet freed -- confirm against the callers
 * before touching this.
 * NOTE(review): the final release happens whenever `locked' is set, which
 * includes the waslocked != 0 case -- verify that callers expect the lock
 * to be dropped for them.
 */
int
pf_purge_expired_states(u_int32_t maxcheck, int waslocked)
{
        struct pf_state         *cur;
        int locked = waslocked;
        int cpu = mycpu->gd_cpuid;

        while (maxcheck--) {
                /*
                 * Wrap to start of list when we hit the end
                 */
                cur = purge_cur[cpu];
                if (cur == NULL) {
                        cur = TAILQ_FIRST(&state_list[cpu]);
                        if (cur == NULL)
                                break;  /* list empty */
                }

                /*
                 * Setup next (purge_cur) while we process this one.  If
                 * we block and something else deletes purge_cur,
                 * pf_free_state() will adjust it further ahead.
                 */
                purge_cur[cpu] = TAILQ_NEXT(cur, entry_list);

                if (cur->timeout == PFTM_UNLINKED) {
                        /* free unlinked state */
                        if (! locked) {
                                lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
                                locked = 1;
                        }
                        pf_free_state(cur);
                } else if (pf_state_expires(cur) <= time_second) {
                        /* unlink and free expired state */
                        pf_unlink_state(cur);
                        if (! locked) {
                                /* see NOTE(review) in the header comment */
                                if (!lockmgr(&pf_consistency_lock, LK_EXCLUSIVE))
                                        return (0);
                                locked = 1;
                        }
                        pf_free_state(cur);
                }
        }

        if (locked)
                lockmgr(&pf_consistency_lock, LK_RELEASE);
        return (1);
}
1582
1583 int
1584 pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw)
1585 {
1586         if (aw->type != PF_ADDR_TABLE)
1587                 return (0);
1588         if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL)
1589                 return (1);
1590         return (0);
1591 }
1592
1593 void
1594 pf_tbladdr_remove(struct pf_addr_wrap *aw)
1595 {
1596         if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL)
1597                 return;
1598         pfr_detach_table(aw->p.tbl);
1599         aw->p.tbl = NULL;
1600 }
1601
1602 void
1603 pf_tbladdr_copyout(struct pf_addr_wrap *aw)
1604 {
1605         struct pfr_ktable *kt = aw->p.tbl;
1606
1607         if (aw->type != PF_ADDR_TABLE || kt == NULL)
1608                 return;
1609         if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
1610                 kt = kt->pfrkt_root;
1611         aw->p.tbl = NULL;
1612         aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
1613                 kt->pfrkt_cnt : -1;
1614 }
1615
1616 void
1617 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
1618 {
1619         switch (af) {
1620 #ifdef INET
1621         case AF_INET: {
1622                 u_int32_t a = ntohl(addr->addr32[0]);
1623                 kprintf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
1624                     (a>>8)&255, a&255);
1625                 if (p) {
1626                         p = ntohs(p);
1627                         kprintf(":%u", p);
1628                 }
1629                 break;
1630         }
1631 #endif /* INET */
1632 #ifdef INET6
1633         case AF_INET6: {
1634                 u_int16_t b;
1635                 u_int8_t i, curstart = 255, curend = 0,
1636                     maxstart = 0, maxend = 0;
1637                 for (i = 0; i < 8; i++) {
1638                         if (!addr->addr16[i]) {
1639                                 if (curstart == 255)
1640                                         curstart = i;
1641                                 else
1642                                         curend = i;
1643                         } else {
1644                                 if (curstart) {
1645                                         if ((curend - curstart) >
1646                                             (maxend - maxstart)) {
1647                                                 maxstart = curstart;
1648                                                 maxend = curend;
1649                                                 curstart = 255;
1650                                         }
1651                                 }
1652                         }
1653                 }
1654                 for (i = 0; i < 8; i++) {
1655                         if (i >= maxstart && i <= maxend) {
1656                                 if (maxend != 7) {
1657                                         if (i == maxstart)
1658                                                 kprintf(":");
1659                                 } else {
1660                                         if (i == maxend)
1661                                                 kprintf(":");
1662                                 }
1663                         } else {
1664                                 b = ntohs(addr->addr16[i]);
1665                                 kprintf("%x", b);
1666                                 if (i < 7)
1667                                         kprintf(":");
1668                         }
1669                 }
1670                 if (p) {
1671                         p = ntohs(p);
1672                         kprintf("[%u]", p);
1673                 }
1674                 break;
1675         }
1676 #endif /* INET6 */
1677         }
1678 }
1679
/*
 * Print state `s' using the state's own keys: forwards to
 * pf_print_state_parts() with no key overrides.
 */
void
pf_print_state(struct pf_state *s)
{
        pf_print_state_parts(s, NULL, NULL);
}
1685
1686 void
1687 pf_print_state_parts(struct pf_state *s,
1688     struct pf_state_key *skwp, struct pf_state_key *sksp)
1689 {
1690         struct pf_state_key *skw, *sks;
1691         u_int8_t proto, dir;
1692
1693         /* Do our best to fill these, but they're skipped if NULL */
1694         skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
1695         sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
1696         proto = skw ? skw->proto : (sks ? sks->proto : 0);
1697         dir = s ? s->direction : 0;
1698
1699         switch (proto) {
1700         case IPPROTO_TCP:
1701                 kprintf("TCP ");
1702                 break;
1703         case IPPROTO_UDP:
1704                 kprintf("UDP ");
1705                 break;
1706         case IPPROTO_ICMP:
1707                 kprintf("ICMP ");
1708                 break;
1709         case IPPROTO_ICMPV6:
1710                 kprintf("ICMPV6 ");
1711                 break;
1712         default:
1713                 kprintf("%u ", skw->proto);
1714                 break;
1715         }
1716         switch (dir) {
1717         case PF_IN:
1718                 kprintf(" in");
1719                 break;
1720         case PF_OUT:
1721                 kprintf(" out");
1722                 break;
1723         }
1724         if (skw) {
1725                 kprintf(" wire: ");
1726                 pf_print_host(&skw->addr[0], skw->port[0], skw->af);
1727                 kprintf(" ");
1728                 pf_print_host(&skw->addr[1], skw->port[1], skw->af);
1729         }
1730         if (sks) {
1731                 kprintf(" stack: ");
1732                 if (sks != skw) {
1733                         pf_print_host(&sks->addr[0], sks->port[0], sks->af);
1734                         kprintf(" ");
1735                         pf_print_host(&sks->addr[1], sks->port[1], sks->af);
1736                 } else
1737                         kprintf("-");
1738         }
1739         if (s) {
1740                 if (proto == IPPROTO_TCP) {
1741                         kprintf(" [lo=%u high=%u win=%u modulator=%u",
1742                             s->src.seqlo, s->src.seqhi,
1743                             s->src.max_win, s->src.seqdiff);
1744                         if (s->src.wscale && s->dst.wscale)
1745                                 kprintf(" wscale=%u",
1746                                     s->src.wscale & PF_WSCALE_MASK);
1747                         kprintf("]");
1748                         kprintf(" [lo=%u high=%u win=%u modulator=%u",
1749                             s->dst.seqlo, s->dst.seqhi,
1750                             s->dst.max_win, s->dst.seqdiff);
1751                         if (s->src.wscale && s->dst.wscale)
1752                                 kprintf(" wscale=%u",
1753                                 s->dst.wscale & PF_WSCALE_MASK);
1754                         kprintf("]");
1755                 }
1756                 kprintf(" %u:%u", s->src.state, s->dst.state);
1757         }
1758 }
1759
1760 void
1761 pf_print_flags(u_int8_t f)
1762 {
1763         if (f)
1764                 kprintf(" ");
1765         if (f & TH_FIN)
1766                 kprintf("F");
1767         if (f & TH_SYN)
1768                 kprintf("S");
1769         if (f & TH_RST)
1770                 kprintf("R");
1771         if (f & TH_PUSH)
1772                 kprintf("P");
1773         if (f & TH_ACK)
1774                 kprintf("A");
1775         if (f & TH_URG)
1776                 kprintf("U");
1777         if (f & TH_ECE)
1778                 kprintf("E");
1779         if (f & TH_CWR)
1780                 kprintf("W");
1781 }
1782
/*
 * Helper for pf_calc_skip_steps().  Relies on the caller's local
 * variables `head' and `cur': every rule from head[i] up to (but not
 * including) cur gets its skip[i] pointer set to cur, and head[i] is
 * left equal to cur.
 */
#define PF_SET_SKIP_STEPS(i)                                    \
        do {                                                    \
                while (head[i] != cur) {                        \
                        head[i]->skip[i].ptr = cur;             \
                        head[i] = TAILQ_NEXT(head[i], entries); \
                }                                               \
        } while (0)
1790
1791 void
1792 pf_calc_skip_steps(struct pf_rulequeue *rules)
1793 {
1794         struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
1795         int i;
1796
1797         cur = TAILQ_FIRST(rules);
1798         prev = cur;
1799         for (i = 0; i < PF_SKIP_COUNT; ++i)
1800                 head[i] = cur;
1801         while (cur != NULL) {
1802
1803                 if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
1804                         PF_SET_SKIP_STEPS(PF_SKIP_IFP);
1805                 if (cur->direction != prev->direction)
1806                         PF_SET_SKIP_STEPS(PF_SKIP_DIR);
1807                 if (cur->af != prev->af)
1808                         PF_SET_SKIP_STEPS(PF_SKIP_AF);
1809                 if (cur->proto != prev->proto)
1810                         PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
1811                 if (cur->src.neg != prev->src.neg ||
1812                     pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
1813                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
1814                 if (cur->src.port[0] != prev->src.port[0] ||
1815                     cur->src.port[1] != prev->src.port[1] ||
1816                     cur->src.port_op != prev->src.port_op)
1817                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
1818                 if (cur->dst.neg != prev->dst.neg ||
1819                     pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
1820                         PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
1821                 if (cur->dst.port[0] != prev->dst.port[0] ||
1822                     cur->dst.port[1] != prev->dst.port[1] ||
1823                     cur->dst.port_op != prev->dst.port_op)
1824                         PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
1825
1826                 prev = cur;
1827                 cur = TAILQ_NEXT(cur, entries);
1828         }
1829         for (i = 0; i < PF_SKIP_COUNT; ++i)
1830                 PF_SET_SKIP_STEPS(i);
1831 }
1832
1833 int
1834 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
1835 {
1836         if (aw1->type != aw2->type)
1837                 return (1);
1838         switch (aw1->type) {
1839         case PF_ADDR_ADDRMASK:
1840         case PF_ADDR_RANGE:
1841                 if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
1842                         return (1);
1843                 if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
1844                         return (1);
1845                 return (0);
1846         case PF_ADDR_DYNIFTL:
1847                 return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
1848         case PF_ADDR_NOROUTE:
1849         case PF_ADDR_URPFFAILED:
1850                 return (0);
1851         case PF_ADDR_TABLE:
1852                 return (aw1->p.tbl != aw2->p.tbl);
1853         case PF_ADDR_RTLABEL:
1854                 return (aw1->v.rtlabel != aw2->v.rtlabel);
1855         default:
1856                 kprintf("invalid address type: %d\n", aw1->type);
1857                 return (1);
1858         }
1859 }
1860
/*
 * Incrementally adjust a 16-bit checksum after one 16-bit word of the
 * covered data changed from `old' to `new'.
 *
 *   cksum  current checksum value
 *   old    previous value of the changed word
 *   new    replacement value of the changed word
 *   udp    non-zero selects the UDP special cases: an input checksum of
 *          0 is returned unchanged as 0, and a computed result of 0 is
 *          returned as 0xFFFF instead
 *
 * Returns the adjusted checksum (carry folded once, truncated to 16 bits).
 */
u_int16_t
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
{
        u_int32_t       sum;

        if (udp && cksum == 0)
                return (0x0000);

        sum = cksum + old - new;
        sum = ((sum >> 16) + (sum & 0xffff)) & 0xffff;

        if (udp && sum == 0)
                return (0xFFFF);
        return (sum);
}
1875
1876 void
1877 pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1878     struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1879 {
1880         struct pf_addr  ao;
1881         u_int16_t       po = *p;
1882
1883         PF_ACPY(&ao, a, af);
1884         PF_ACPY(a, an, af);
1885
1886         *p = pn;
1887
1888         switch (af) {
1889 #ifdef INET
1890         case AF_INET:
1891                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1892                     ao.addr16[0], an->addr16[0], 0),
1893                     ao.addr16[1], an->addr16[1], 0);
1894                 *p = pn;
1895                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1896                     ao.addr16[0], an->addr16[0], u),
1897                     ao.addr16[1], an->addr16[1], u),
1898                     po, pn, u);
1899                 break;
1900 #endif /* INET */
1901 #ifdef INET6
1902         case AF_INET6:
1903                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1904                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1905                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1906                     ao.addr16[0], an->addr16[0], u),
1907                     ao.addr16[1], an->addr16[1], u),
1908                     ao.addr16[2], an->addr16[2], u),
1909                     ao.addr16[3], an->addr16[3], u),
1910                     ao.addr16[4], an->addr16[4], u),
1911                     ao.addr16[5], an->addr16[5], u),
1912                     ao.addr16[6], an->addr16[6], u),
1913                     ao.addr16[7], an->addr16[7], u),
1914                     po, pn, u);
1915                 break;
1916 #endif /* INET6 */
1917         }
1918 }
1919
1920
/*
 * Replace the 32-bit value stored at `a' by `an' and adjust checksum `*c'
 * for the change.  `a' is a void pointer accessed through memcpy(), so it
 * carries no alignment requirement.  `u' is handed to pf_cksum_fixup() as
 * its `udp' argument.
 */
void
pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
{
        u_int32_t       prev;
        u_int16_t       sum;

        memcpy(&prev, a, sizeof(prev));
        memcpy(a, &an, sizeof(an));

        /* Fix up the upper 16 bits first, then the lower 16 bits. */
        sum = pf_cksum_fixup(*c, prev >> 16, an >> 16, u);
        *c = pf_cksum_fixup(sum, prev & 0xffff, an & 0xffff, u);
}
1932
#ifdef INET6
/*
 * Replace the IPv6 address at `a' by `an' and adjust checksum `*c' for
 * each of the eight 16-bit address words.  `u' is handed to
 * pf_cksum_fixup() as its `udp' argument.
 */
void
pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
{
        struct pf_addr  prev;
        u_int16_t       sum;
        int             w;

        PF_ACPY(&prev, a, AF_INET6);
        PF_ACPY(a, an, AF_INET6);

        sum = *c;
        for (w = 0; w < 8; w++) {
                sum = pf_cksum_fixup(sum, prev.addr16[w], an->addr16[w], u);
        }
        *c = sum;
}
#endif /* INET6 */
1955
/*
 * Rewrite the addresses (and optionally the port) of a packet quoted
 * inside an ICMP message and patch every checksum affected by the change.
 *
 *   ia   inner address to overwrite with `na'
 *   ip   inner protocol port to overwrite with `np'; NULL skips the
 *        port step
 *   oa   outer address to overwrite with `na'; NULL skips the outer step
 *   na   new address
 *   np   new port (only used when ip != NULL)
 *   pc   inner protocol checksum; may be NULL
 *   h2c  inner IP header checksum (AF_INET only)
 *   ic   ICMP checksum
 *   hc   outer IP header checksum (AF_INET only)
 *   u    passed to pf_cksum_fixup() as its `udp' argument where noted
 *   af   address family
 *
 * The ICMP checksum covers the quoted packet, so it is also adjusted for
 * every change made to the inner checksums themselves.
 */
void
pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
{
        struct pf_addr  oia, ooa;

        /* Remember the old inner/outer addresses for the fixups below. */
        PF_ACPY(&oia, ia, af);
        if (oa)
                PF_ACPY(&ooa, oa, af);

        /* Change inner protocol port, fix inner protocol checksum. */
        if (ip != NULL) {
                u_int16_t       oip = *ip;
                u_int32_t       opc = 0;

                if (pc != NULL)
                        opc = *pc;
                *ip = np;
                if (pc != NULL)
                        *pc = pf_cksum_fixup(*pc, oip, *ip, u);
                *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
                /* The inner checksum field itself changed as well. */
                if (pc != NULL)
                        *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
        }
        /* Change inner ip address, fix inner ip and icmp checksums. */
        PF_ACPY(ia, na, af);
        switch (af) {
#ifdef INET
        case AF_INET: {
                u_int32_t        oh2c = *h2c;

                *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
                    oia.addr16[0], ia->addr16[0], 0),
                    oia.addr16[1], ia->addr16[1], 0);
                *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
                    oia.addr16[0], ia->addr16[0], 0),
                    oia.addr16[1], ia->addr16[1], 0);
                /* Account for the modified inner header checksum field. */
                *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
                break;
        }
#endif /* INET */
#ifdef INET6
        case AF_INET6:
                /* Only the ICMP checksum is touched for the inner address. */
                *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
                    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
                    pf_cksum_fixup(pf_cksum_fixup(*ic,
                    oia.addr16[0], ia->addr16[0], u),
                    oia.addr16[1], ia->addr16[1], u),
                    oia.addr16[2], ia->addr16[2], u),
                    oia.addr16[3], ia->addr16[3], u),
                    oia.addr16[4], ia->addr16[4], u),
                    oia.addr16[5], ia->addr16[5], u),
                    oia.addr16[6], ia->addr16[6], u),
                    oia.addr16[7], ia->addr16[7], u);
                break;
#endif /* INET6 */
        }
        /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
        if (oa) {
                PF_ACPY(oa, na, af);
                switch (af) {
#ifdef INET
                case AF_INET:
                        *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
                            ooa.addr16[0], oa->addr16[0], 0),
                            ooa.addr16[1], oa->addr16[1], 0);
                        break;
#endif /* INET */
#ifdef INET6
                case AF_INET6:
                        *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
                            pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
                            pf_cksum_fixup(pf_cksum_fixup(*ic,
                            ooa.addr16[0], oa->addr16[0], u),
                            ooa.addr16[1], oa->addr16[1], u),
                            ooa.addr16[2], oa->addr16[2], u),
                            ooa.addr16[3], oa->addr16[3], u),
                            ooa.addr16[4], oa->addr16[4], u),
                            ooa.addr16[5], oa->addr16[5], u),
                            ooa.addr16[6], oa->addr16[6], u),
                            ooa.addr16[7], oa->addr16[7], u);
                        break;
#endif /* INET6 */
                }
        }
}
2043
2044
/*
 * Need to modulate the sequence numbers in the TCP SACK option
 * (credits to Krzysztof Pfaff for report and patch)
 *
 * Copies the TCP options that follow header `th' (located at offset `off'
 * in mbuf `m') into a local buffer, subtracts dst->seqdiff from both
 * edges of every SACK block found, and keeps th->th_sum in step via
 * pf_change_a().  If a SACK option was processed, the modified options
 * are written back into the mbuf.
 *
 * Returns 1 when the options were written back, 0 otherwise (including
 * when the options are too short to hold a SACK block or cannot be
 * pulled from the mbuf).
 */
int
pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
    struct tcphdr *th, struct pf_state_peer *dst)
{
        /* hlen: option bytes left to parse; thoptlen: total option bytes */
        int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
        u_int8_t opts[TCP_MAXOLEN], *opt = opts;
        int copyback = 0, i, olen;
        struct raw_sackblock sack;

/* Smallest useful SACK option: 2 header bytes plus one block. */
#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
        if (hlen < TCPOLEN_SACKLEN ||
            !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
                return 0;

        /* Stop as soon as the remainder cannot hold a SACK option. */
        while (hlen >= TCPOLEN_SACKLEN) {
                olen = opt[1];
                switch (*opt) {
                case TCPOPT_EOL:        /* FALLTHROUGH */
                case TCPOPT_NOP:
                        /* single-byte options: no length field to honour */
                        opt++;
                        hlen--;
                        break;
                case TCPOPT_SACK:
                        /* Never trust a length that overruns the buffer. */
                        if (olen > hlen)
                                olen = hlen;
                        if (olen >= TCPOLEN_SACKLEN) {
                                for (i = 2; i + TCPOLEN_SACK <= olen;
                                    i += TCPOLEN_SACK) {
                                        /* work on a copy of the block */
                                        memcpy(&sack, &opt[i], sizeof(sack));
                                        pf_change_a(&sack.rblk_start, &th->th_sum,
                                            htonl(ntohl(sack.rblk_start) -
                                            dst->seqdiff), 0);
                                        pf_change_a(&sack.rblk_end, &th->th_sum,
                                            htonl(ntohl(sack.rblk_end) -
                                            dst->seqdiff), 0);
                                        memcpy(&opt[i], &sack, sizeof(sack));
                                }
                                copyback = 1;
                        }
                        /* FALLTHROUGH */
                default:
                        /* A length below 2 would stall the loop. */
                        if (olen < 2)
                                olen = 2;
                        hlen -= olen;
                        opt += olen;
                }
        }

        if (copyback)
                m_copyback(m, off + sizeof(*th), thoptlen, opts);
        return (copyback);
}
2101
2102 void
2103 pf_send_tcp(const struct pf_rule *r, sa_family_t af,
2104     const struct pf_addr *saddr, const struct pf_addr *daddr,
2105     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2106     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2107     u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp)
2108 {
2109         struct mbuf     *m;
2110         int              len = 0, tlen;
2111 #ifdef INET
2112         struct ip       *h = NULL;
2113 #endif /* INET */
2114 #ifdef INET6
2115         struct ip6_hdr  *h6 = NULL;
2116 #endif /* INET6 */
2117         struct tcphdr   *th = NULL;
2118         char            *opt;
2119
2120         ASSERT_LWKT_TOKEN_HELD(&pf_token);
2121
2122         /* maximum segment size tcp option */
2123         tlen = sizeof(struct tcphdr);
2124         if (mss)
2125                 tlen += 4;
2126
2127         switch (af) {
2128 #ifdef INET
2129         case AF_INET:
2130                 len = sizeof(struct ip) + tlen;
2131                 break;
2132 #endif /* INET */
2133 #ifdef INET6
2134         case AF_INET6:
2135                 len = sizeof(struct ip6_hdr) + tlen;
2136                 break;
2137 #endif /* INET6 */
2138         }
2139
2140         /*
2141          * Create outgoing mbuf.
2142          *
2143          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
2144          * so make sure pf.flags is clear.
2145          */
2146         m = m_gethdr(M_NOWAIT, MT_HEADER);
2147         if (m == NULL) {
2148                 return;
2149         }
2150         if (tag)
2151                 m->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2152         m->m_pkthdr.pf.flags = 0;
2153         m->m_pkthdr.pf.tag = rtag;
2154         /* XXX Recheck when upgrading to > 4.4 */
2155         m->m_pkthdr.pf.statekey = NULL;
2156         if (r != NULL && r->rtableid >= 0)
2157                 m->m_pkthdr.pf.rtableid = r->rtableid;
2158
2159 #ifdef ALTQ
2160         if (r != NULL && r->qid) {
2161                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
2162                 m->m_pkthdr.pf.qid = r->qid;
2163                 m->m_pkthdr.pf.ecn_af = af;
2164                 m->m_pkthdr.pf.hdr = mtod(m, struct ip *);
2165         }
2166 #endif /* ALTQ */
2167         m->m_data += max_linkhdr;
2168         m->m_pkthdr.len = m->m_len = len;
2169         m->m_pkthdr.rcvif = NULL;
2170         bzero(m->m_data, len);
2171         switch (af) {
2172 #ifdef INET
2173         case AF_INET:
2174                 h = mtod(m, struct ip *);
2175
2176                 /* IP header fields included in the TCP checksum */
2177                 h->ip_p = IPPROTO_TCP;
2178                 h->ip_len = tlen;
2179                 h->ip_src.s_addr = saddr->v4.s_addr;
2180                 h->ip_dst.s_addr = daddr->v4.s_addr;
2181
2182                 th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2183                 break;
2184 #endif /* INET */
2185 #ifdef INET6
2186         case AF_INET6:
2187                 h6 = mtod(m, struct ip6_hdr *);
2188
2189                 /* IP header fields included in the TCP checksum */
2190                 h6->ip6_nxt = IPPROTO_TCP;
2191                 h6->ip6_plen = htons(tlen);
2192                 memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2193                 memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2194
2195                 th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2196                 break;
2197 #endif /* INET6 */
2198         }
2199
2200         /* TCP header */
2201         th->th_sport = sport;
2202         th->th_dport = dport;
2203         th->th_seq = htonl(seq);
2204         th->th_ack = htonl(ack);
2205         th->th_off = tlen >> 2;
2206         th->th_flags = flags;
2207         th->th_win = htons(win);
2208
2209         if (mss) {
2210                 opt = (char *)(th + 1);
2211                 opt[0] = TCPOPT_MAXSEG;
2212                 opt[1] = 4;
2213                 mss = htons(mss);
2214                 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2215         }
2216
2217         switch (af) {
2218 #ifdef INET
2219         case AF_INET:
2220                 /* TCP checksum */
2221                 th->th_sum = in_cksum(m, len);
2222
2223                 /* Finish the IP header */
2224                 h->ip_v = 4;
2225                 h->ip_hl = sizeof(*h) >> 2;
2226                 h->ip_tos = IPTOS_LOWDELAY;
2227                 h->ip_len = len;
2228                 h->ip_off = path_mtu_discovery ? IP_DF : 0;
2229                 h->ip_ttl = ttl ? ttl : ip_defttl;
2230                 h->ip_sum = 0;
2231                 if (eh == NULL) {
2232                         lwkt_reltoken(&pf_token);
2233                         ip_output(m, NULL, NULL, 0, NULL, NULL);
2234                         lwkt_gettoken(&pf_token);
2235                 } else {
2236                         struct route             ro;
2237                         struct rtentry           rt;
2238                         struct ether_header     *e = (void *)ro.ro_dst.sa_data;
2239
2240                         if (ifp == NULL) {
2241                                 m_freem(m);
2242                                 return;
2243                         }
2244                         rt.rt_ifp = ifp;
2245                         ro.ro_rt = &rt;
2246                         ro.ro_dst.sa_len = sizeof(ro.ro_dst);
2247                         ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
2248                         bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
2249                         bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
2250                         e->ether_type = eh->ether_type;
2251                         /* XXX_IMPORT: later */
2252                         lwkt_reltoken(&pf_token);
2253                         ip_output(m, NULL, &ro, 0, NULL, NULL);
2254                         lwkt_gettoken(&pf_token);
2255                 }
2256                 break;
2257 #endif /* INET */
2258 #ifdef INET6
2259         case AF_INET6:
2260                 /* TCP checksum */
2261                 th->th_sum = in6_cksum(m, IPPROTO_TCP,
2262                     sizeof(struct ip6_hdr), tlen);
2263
2264                 h6->ip6_vfc |= IPV6_VERSION;
2265                 h6->ip6_hlim = IPV6_DEFHLIM;
2266
2267                 lwkt_reltoken(&pf_token);
2268                 ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
2269                 lwkt_gettoken(&pf_token);
2270                 break;
2271 #endif /* INET6 */
2272         }
2273 }
2274
2275 void
2276 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2277     struct pf_rule *r)
2278 {
2279         struct mbuf     *m0;
2280
2281         /*
2282          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
2283          * so make sure pf.flags is clear.
2284          */
2285         if ((m0 = m_copy(m, 0, M_COPYALL)) == NULL)
2286                 return;
2287
2288         m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2289         m0->m_pkthdr.pf.flags = 0;
2290         /* XXX Re-Check when Upgrading to > 4.4 */
2291         m0->m_pkthdr.pf.statekey = NULL;
2292
2293         if (r->rtableid >= 0)
2294                 m0->m_pkthdr.pf.rtableid = r->rtableid;
2295
2296 #ifdef ALTQ
2297         if (r->qid) {
2298                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
2299                 m0->m_pkthdr.pf.qid = r->qid;
2300                 m0->m_pkthdr.pf.ecn_af = af;
2301                 m0->m_pkthdr.pf.hdr = mtod(m0, struct ip *);
2302         }
2303 #endif /* ALTQ */
2304
2305         switch (af) {
2306 #ifdef INET
2307         case AF_INET:
2308                 icmp_error(m0, type, code, 0, 0);
2309                 break;
2310 #endif /* INET */
2311 #ifdef INET6
2312         case AF_INET6:
2313                 icmp6_error(m0, type, code, 0);
2314                 break;
2315 #endif /* INET6 */
2316         }
2317 }
2318
/*
 * Compare addresses a and b under mask m for address family af.
 *
 * With n == 0 the result is 1 when the masked addresses are equal and 0
 * otherwise; with n != 0 the sense is inverted (1 when they differ).
 * An address family that is not compiled in never compares equal.
 */
int
pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
    struct pf_addr *b, sa_family_t af)
{
	int	equal = 0;

	switch (af) {
#ifdef INET
	case AF_INET:
		equal = ((a->addr32[0] & m->addr32[0]) ==
		    (b->addr32[0] & m->addr32[0]));
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		int	w;

		/* all four masked 32-bit words have to agree */
		equal = 1;
		for (w = 0; w < 4; w++) {
			if ((a->addr32[w] & m->addr32[w]) !=
			    (b->addr32[w] & m->addr32[w])) {
				equal = 0;
				break;
			}
		}
		break;
	}
#endif /* INET6 */
	}

	return (n ? !equal : equal);
}
2364
/*
 * Return 1 if b <= a <= e, otherwise return 0.
 *
 * The address words are kept in network byte order (see pf_addr_inc()),
 * so each word is converted with ntohl() before it is compared;
 * comparing the raw words gives wrong answers on little-endian hosts.
 * An address family that is not compiled in is reported as in range.
 */
int
pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
    struct pf_addr *a, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
		    (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
			return (0);
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		int	i;

		/*
		 * check a >= b, most significant word first; the first
		 * word that differs decides
		 */
		for (i = 0; i < 4; ++i) {
			if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
				break;
			else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
				return (0);
		}
		/* check a <= e, same scheme */
		for (i = 0; i < 4; ++i) {
			if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
				break;
			else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
				return (0);
		}
		break;
	}
#endif /* INET6 */
	}
	return (1);
}
2402
2403 int
2404 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2405 {
2406         switch (op) {
2407         case PF_OP_IRG:
2408                 return ((p > a1) && (p < a2));
2409         case PF_OP_XRG:
2410                 return ((p < a1) || (p > a2));
2411         case PF_OP_RRG:
2412                 return ((p >= a1) && (p <= a2));
2413         case PF_OP_EQ:
2414                 return (p == a1);
2415         case PF_OP_NE:
2416                 return (p != a1);
2417         case PF_OP_LT:
2418                 return (p < a1);
2419         case PF_OP_LE:
2420                 return (p <= a1);
2421         case PF_OP_GT:
2422                 return (p > a1);
2423         case PF_OP_GE:
2424                 return (p >= a1);
2425         }
2426         return (0); /* never reached */
2427 }
2428
/*
 * Port flavour of pf_match(): a1, a2 and p are given in network byte
 * order and are converted to host order before being compared.
 */
int
pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
{
	u_int16_t	first = ntohs(a1);
	u_int16_t	second = ntohs(a2);
	u_int16_t	port = ntohs(p);

	return (pf_match(op, first, second, port));
}
2437
2438 int
2439 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2440 {
2441         if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2442                 return (0);
2443         return (pf_match(op, a1, a2, u));
2444 }
2445
2446 int
2447 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2448 {
2449         if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2450                 return (0);
2451         return (pf_match(op, a1, a2, g));
2452 }
2453
2454 int
2455 pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag)
2456 {
2457         if (*tag == -1)
2458                 *tag = m->m_pkthdr.pf.tag;
2459
2460         return ((!r->match_tag_not && r->match_tag == *tag) ||
2461             (r->match_tag_not && r->match_tag != *tag));
2462 }
2463
2464 int
2465 pf_tag_packet(struct mbuf *m, int tag, int rtableid)
2466 {
2467         if (tag <= 0 && rtableid < 0)
2468                 return (0);
2469
2470         if (tag > 0)
2471                 m->m_pkthdr.pf.tag = tag;
2472         if (rtableid >= 0)
2473                 m->m_pkthdr.pf.rtableid = rtableid;
2474
2475         return (0);
2476 }
2477
2478 void
2479 pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
2480     struct pf_rule **r, struct pf_rule **a, int *match)
2481 {
2482         struct pf_anchor_stackframe     *f;
2483
2484         (*r)->anchor->match = 0;
2485         if (match)
2486                 *match = 0;
2487         if (*depth >= NELEM(pf_anchor_stack)) {
2488                 kprintf("pf_step_into_anchor: stack overflow\n");
2489                 *r = TAILQ_NEXT(*r, entries);
2490                 return;
2491         } else if (*depth == 0 && a != NULL)
2492                 *a = *r;
2493         f = pf_anchor_stack + (*depth)++;
2494         f->rs = *rs;
2495         f->r = *r;
2496         if ((*r)->anchor_wildcard) {
2497                 f->parent = &(*r)->anchor->children;
2498                 if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
2499                     NULL) {
2500                         *r = NULL;
2501                         return;
2502                 }
2503                 *rs = &f->child->ruleset;
2504         } else {
2505                 f->parent = NULL;
2506                 f->child = NULL;
2507                 *rs = &(*r)->anchor->ruleset;
2508         }
2509         *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2510 }
2511
2512 int
2513 pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
2514     struct pf_rule **r, struct pf_rule **a, int *match)
2515 {
2516         struct pf_anchor_stackframe     *f;
2517         int quick = 0;
2518
2519         do {
2520                 if (*depth <= 0)
2521                         break;
2522                 f = pf_anchor_stack + *depth - 1;
2523                 if (f->parent != NULL && f->child != NULL) {
2524                         if (f->child->match ||
2525                             (match != NULL && *match)) {
2526                                 f->r->anchor->match = 1;
2527                                 *match = 0;
2528                         }
2529                         f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
2530                         if (f->child != NULL) {
2531                                 *rs = &f->child->ruleset;
2532                                 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2533                                 if (*r == NULL)
2534                                         continue;
2535                                 else
2536                                         break;
2537                         }
2538                 }
2539                 (*depth)--;
2540                 if (*depth == 0 && a != NULL)
2541                         *a = NULL;
2542                 *rs = f->rs;
2543                 if (f->r->anchor->match || (match != NULL && *match))
2544                         quick = f->r->quick;
2545                 *r = TAILQ_NEXT(f->r, entries);
2546         } while (*r == NULL);
2547
2548         return (quick);
2549 }
2550
2551 #ifdef INET6
2552 void
2553 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2554     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2555 {
2556         switch (af) {
2557 #ifdef INET
2558         case AF_INET:
2559                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2560                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2561                 break;
2562 #endif /* INET */
2563         case AF_INET6:
2564                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2565                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2566                 naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2567                 ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2568                 naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2569                 ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2570                 naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2571                 ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2572                 break;
2573         }
2574 }
2575
2576 void
2577 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2578 {
2579         switch (af) {
2580 #ifdef INET
2581         case AF_INET:
2582                 addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2583                 break;
2584 #endif /* INET */
2585         case AF_INET6:
2586                 if (addr->addr32[3] == 0xffffffff) {
2587                         addr->addr32[3] = 0;
2588                         if (addr->addr32[2] == 0xffffffff) {
2589                                 addr->addr32[2] = 0;
2590                                 if (addr->addr32[1] == 0xffffffff) {
2591                                         addr->addr32[1] = 0;
2592                                         addr->addr32[0] =
2593                                             htonl(ntohl(addr->addr32[0]) + 1);
2594                                 } else
2595                                         addr->addr32[1] =
2596                                             htonl(ntohl(addr->addr32[1]) + 1);
2597                         } else
2598                                 addr->addr32[2] =
2599                                     htonl(ntohl(addr->addr32[2]) + 1);
2600                 } else
2601                         addr->addr32[3] =
2602                             htonl(ntohl(addr->addr32[3]) + 1);
2603                 break;
2604         }
2605 }
2606 #endif /* INET6 */
2607
/*
 * Mix three 32-bit lvalues in place (nine subtract/xor/shift rounds).
 * Every argument is evaluated several times, so pass plain variables.
 */
#define mix(x, y, z) \
	do {							\
		(x) -= (y); (x) -= (z); (x) ^= ((z) >> 13);	\
		(y) -= (z); (y) -= (x); (y) ^= ((x) << 8);	\
		(z) -= (x); (z) -= (y); (z) ^= ((y) >> 13);	\
		(x) -= (y); (x) -= (z); (x) ^= ((z) >> 12);	\
		(y) -= (z); (y) -= (x); (y) ^= ((x) << 16);	\
		(z) -= (x); (z) -= (y); (z) ^= ((y) >> 5);	\
		(x) -= (y); (x) -= (z); (x) ^= ((z) >> 3);	\
		(y) -= (z); (y) -= (x); (y) ^= ((x) << 10);	\
		(z) -= (x); (z) -= (y); (z) ^= ((y) >> 15);	\
	} while (0)
2620
2621 /*
2622  * hash function based on bridge_hash in if_bridge.c
2623  */
2624 void
2625 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
2626     struct pf_poolhashkey *key, sa_family_t af)
2627 {
2628         u_int32_t       a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
2629
2630         switch (af) {
2631 #ifdef INET
2632         case AF_INET:
2633                 a += inaddr->addr32[0];
2634                 b += key->key32[1];
2635                 mix(a, b, c);
2636                 hash->addr32[0] = c + key->key32[2];
2637                 break;
2638 #endif /* INET */
2639 #ifdef INET6
2640         case AF_INET6:
2641                 a += inaddr->addr32[0];
2642                 b += inaddr->addr32[2];
2643                 mix(a, b, c);
2644                 hash->addr32[0] = c;
2645                 a += inaddr->addr32[1];
2646                 b += inaddr->addr32[3];
2647                 c += key->key32[1];
2648                 mix(a, b, c);
2649                 hash->addr32[1] = c;
2650                 a += inaddr->addr32[2];
2651                 b += inaddr->addr32[1];
2652                 c += key->key32[2];
2653                 mix(a, b, c);
2654                 hash->addr32[2] = c;
2655                 a += inaddr->addr32[3];
2656                 b += inaddr->addr32[0];
2657                 c += key->key32[3];
2658                 mix(a, b, c);
2659                 hash->addr32[3] = c;
2660                 break;
2661 #endif /* INET6 */
2662         }
2663 }
2664
2665 int
2666 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
2667     struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
2668 {
2669         unsigned char            hash[16];
2670         struct pf_pool          *rpool = &r->rpool;
2671         struct pf_pooladdr      *acur = rpool->cur;
2672         struct pf_pooladdr      *cur;
2673         struct pf_addr          *raddr;
2674         struct pf_addr          *rmask;
2675         struct pf_addr          counter;
2676         struct pf_src_node       k;
2677         int cpu = mycpu->gd_cpuid;
2678         int tblidx;
2679
2680         /*
2681          * NOTE! rpool->cur and rpool->tblidx can be iterators and thus
2682          *       may represent a SMP race due to the shared nature of the
2683          *       rpool structure.  We allow the race and ensure that updates
2684          *       do not create a fatal condition.
2685          */
2686         cpu_ccfence();
2687         cur = acur;
2688         raddr = &cur->addr.v.a.addr;
2689         rmask = &cur->addr.v.a.mask;
2690
2691         if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
2692             (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2693                 k.af = af;
2694                 PF_ACPY(&k.addr, saddr, af);
2695                 if (r->rule_flag & PFRULE_RULESRCTRACK ||
2696                     r->rpool.opts & PF_POOL_STICKYADDR)
2697                         k.rule.ptr = r;
2698                 else
2699                         k.rule.ptr = NULL;
2700                 pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
2701                 *sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
2702                 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
2703                         PF_ACPY(naddr, &(*sn)->raddr, af);
2704                         if (pf_status.debug >= PF_DEBUG_MISC) {
2705                                 kprintf("pf_map_addr: src tracking maps ");
2706                                 pf_print_host(&k.addr, 0, af);
2707                                 kprintf(" to ");
2708                                 pf_print_host(naddr, 0, af);
2709                                 kprintf("\n");
2710                         }
2711                         return (0);
2712                 }
2713         }
2714
2715         if (cur->addr.type == PF_ADDR_NOROUTE)
2716                 return (1);
2717         if (cur->addr.type == PF_ADDR_DYNIFTL) {
2718                 switch (af) {
2719 #ifdef INET
2720                 case AF_INET:
2721                         if (cur->addr.p.dyn->pfid_acnt4 < 1 &&
2722                             (rpool->opts & PF_POOL_TYPEMASK) !=
2723                             PF_POOL_ROUNDROBIN)
2724                                 return (1);
2725                         raddr = &cur->addr.p.dyn->pfid_addr4;
2726                         rmask = &cur->addr.p.dyn->pfid_mask4;
2727                         break;
2728 #endif /* INET */
2729 #ifdef INET6
2730                 case AF_INET6:
2731                         if (cur->addr.p.dyn->pfid_acnt6 < 1 &&
2732                             (rpool->opts & PF_POOL_TYPEMASK) !=
2733                             PF_POOL_ROUNDROBIN)
2734                                 return (1);
2735                         raddr = &cur->addr.p.dyn->pfid_addr6;
2736                         rmask = &cur->addr.p.dyn->pfid_mask6;
2737                         break;
2738 #endif /* INET6 */
2739                 }
2740         } else if (cur->addr.type == PF_ADDR_TABLE) {
2741                 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
2742                         return (1); /* unsupported */
2743         } else {
2744                 raddr = &cur->addr.v.a.addr;
2745                 rmask = &cur->addr.v.a.mask;
2746         }
2747
2748         switch (rpool->opts & PF_POOL_TYPEMASK) {
2749         case PF_POOL_NONE:
2750                 PF_ACPY(naddr, raddr, af);
2751                 break;
2752         case PF_POOL_BITMASK:
2753                 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
2754                 break;
2755         case PF_POOL_RANDOM:
2756                 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
2757                         switch (af) {
2758 #ifdef INET
2759                         case AF_INET:
2760                                 counter.addr32[0] = htonl(karc4random());
2761                                 break;
2762 #endif /* INET */
2763 #ifdef INET6
2764                         case AF_INET6:
2765                                 if (rmask->addr32[3] != 0xffffffff)
2766                                         counter.addr32[3] =
2767                                                 htonl(karc4random());
2768                                 else
2769                                         break;
2770                                 if (rmask->addr32[2] != 0xffffffff)
2771                                         counter.addr32[2] =
2772                                                 htonl(karc4random());
2773                                 else
2774                                         break;
2775                                 if (rmask->addr32[1] != 0xffffffff)
2776                                         counter.addr32[1] =
2777                                                 htonl(karc4random());
2778                                 else
2779                                         break;
2780                                 if (rmask->addr32[0] != 0xffffffff)
2781                                         counter.addr32[0] =
2782                                                 htonl(karc4random());
2783                                 break;
2784 #endif /* INET6 */
2785                         }
2786                         PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2787                         PF_ACPY(init_addr, naddr, af);
2788
2789                 } else {
2790                         counter = rpool->counter;
2791                         cpu_ccfence();
2792                         PF_AINC(&counter, af);
2793                         PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2794                         rpool->counter = counter;
2795                 }
2796                 break;
2797         case PF_POOL_SRCHASH:
2798                 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
2799                 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
2800                 break;
2801         case PF_POOL_ROUNDROBIN:
2802                 tblidx = rpool->tblidx;
2803                 counter = rpool->counter;
2804                 if (cur->addr.type == PF_ADDR_TABLE) {
2805                         if (!pfr_pool_get(cur->addr.p.tbl,
2806                             &tblidx, &counter,
2807                             &raddr, &rmask, af)) {
2808                                 goto get_addr;
2809                         }
2810                 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2811                         if (!pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2812                             &tblidx, &counter,
2813                             &raddr, &rmask, af)) {
2814                                 goto get_addr;
2815                         }
2816                 } else if (pf_match_addr(0, raddr, rmask,
2817                                          &counter, af)) {
2818                         goto get_addr;
2819                 }
2820
2821         try_next:
2822                 if ((cur = TAILQ_NEXT(cur, entries)) == NULL)
2823                         cur = TAILQ_FIRST(&rpool->list);
2824                 if (cur->addr.type == PF_ADDR_TABLE) {
2825                         tblidx = -1;
2826                         if (pfr_pool_get(cur->addr.p.tbl,
2827                             &tblidx, &counter,
2828                             &raddr, &rmask, af)) {
2829                                 /* table contains no address of type 'af' */
2830                                 if (cur != acur)
2831                                         goto try_next;
2832                                 return (1);
2833                         }
2834                 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2835                         tblidx = -1;
2836                         if (pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2837                             &tblidx, &counter,
2838                             &raddr, &rmask, af)) {
2839                                 /* table contains no address of type 'af' */
2840                                 if (cur != acur)
2841                                         goto try_next;
2842                                 return (1);
2843                         }
2844                 } else {
2845                         raddr = &cur->addr.v.a.addr;
2846                         rmask = &cur->addr.v.a.mask;
2847                         PF_ACPY(&counter, raddr, af);
2848                 }
2849
2850         get_addr:
2851                 rpool->cur = cur;
2852                 rpool->tblidx = tblidx;
2853                 PF_ACPY(naddr, &counter, af);
2854                 if (init_addr != NULL && PF_AZERO(init_addr, af))
2855                         PF_ACPY(init_addr, naddr, af);
2856                 PF_AINC(&counter, af);
2857                 rpool->counter = counter;
2858                 break;
2859         }
2860         if (*sn != NULL)
2861                 PF_ACPY(&(*sn)->raddr, naddr, af);
2862
2863         if (pf_status.debug >= PF_DEBUG_MISC &&
2864             (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2865                 kprintf("pf_map_addr: selected address ");
2866                 pf_print_host(naddr, 0, af);
2867                 kprintf("\n");
2868         }
2869
2870         return (0);
2871 }
2872
2873 int
2874 pf_get_sport(struct pf_pdesc *pd, sa_family_t af,
2875              u_int8_t proto, struct pf_rule *r,
2876              struct pf_addr *saddr, struct pf_addr *daddr,
2877              u_int16_t sport, u_int16_t dport,
2878              struct pf_addr *naddr, u_int16_t *nport,
2879              u_int16_t low, u_int16_t high, struct pf_src_node **sn)
2880 {
2881         struct pf_state_key_cmp key;
2882         struct pf_addr          init_addr;
2883         u_int16_t               cut;
2884         u_int32_t               toeplitz_sport;
2885
2886         bzero(&init_addr, sizeof(init_addr));
2887         if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
2888                 return (1);
2889
2890         if (proto == IPPROTO_ICMP) {
2891                 low = 1;
2892                 high = 65535;
2893         }
2894
2895         bzero(&key, sizeof(key));
2896         key.af = af;
2897         key.proto = proto;
2898         key.port[0] = dport;
2899         PF_ACPY(&key.addr[0], daddr, key.af);
2900
2901         do {
2902                 PF_ACPY(&key.addr[1], naddr, key.af);
2903
2904                 /*
2905                  * We want to select a port that calculates to a toeplitz hash
2906                  * that masks to the same cpu, otherwise the response may
2907                  * not see the new state.
2908                  *
2909                  * We can still do this even if the kernel is disregarding
2910                  * the hash and vectoring the packets to a specific cpu,
2911                  * but it will reduce the number of ports we can use.
2912                  */
2913                 switch(af) {
2914                 case AF_INET:
2915                         toeplitz_sport =
2916                                 toeplitz_piecemeal_port(sport) ^
2917                                 toeplitz_piecemeal_addr(saddr->v4.s_addr) ^
2918                                 toeplitz_piecemeal_addr(naddr->v4.s_addr);
2919                         break;
2920                 case AF_INET6:
2921                         /* XXX TODO XXX */
2922                 default:
2923                         /* XXX TODO XXX */
2924                         toeplitz_sport = 0;
2925                         break;
2926                 }
2927
2928                 /*
2929                  * port search; start random, step;
2930                  * similar 2 portloop in in_pcbbind
2931                  *
2932                  * WARNING! We try to match such that the kernel will
2933                  *          dispatch the translated host/port to the same
2934                  *          cpu, but this might not be possible.
2935                  *
2936                  *          In the case where the port is fixed, or for the
2937                  *          UDP case (whos toeplitz does not incorporate the
2938                  *          port), we set not_cpu_localized which ultimately
2939                  *          causes the pf_state_tree element
2940                  *
2941                  * XXX fixed ports present a problem for cpu localization.
2942                  */
2943                 if (!(proto == IPPROTO_TCP ||
2944                       proto == IPPROTO_UDP ||
2945                       proto == IPPROTO_ICMP)) {
2946                         /*
2947                          * non-specific protocol, leave port intact.
2948                          */
2949                         key.port[1] = sport;
2950                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2951                                 *nport = sport;
2952                                 pd->not_cpu_localized = 1;
2953                                 return (0);
2954                         }
2955                 } else if (low == 0 && high == 0) {
2956                         /*
2957                          * static-port same as originator.
2958                          */
2959                         key.port[1] = sport;
2960                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2961                                 *nport = sport;
2962                                 pd->not_cpu_localized = 1;
2963                                 return (0);
2964                         }
2965                 } else if (low == high) {
2966                         /*
2967                          * specific port as specified.
2968                          */
2969                         key.port[1] = htons(low);
2970                         if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2971                                 *nport = htons(low);
2972                                 pd->not_cpu_localized = 1;
2973                                 return (0);
2974                         }
2975                 } else {
2976                         /*
2977                          * normal dynamic port
2978                          */
2979                         u_int16_t tmp;
2980
2981                         if (low > high) {
2982                                 tmp = low;
2983                                 low = high;
2984                                 high = tmp;
2985                         }
2986                         /* low < high */
2987                         cut = htonl(karc4random()) % (1 + high - low) + low;
2988                         /* low <= cut <= high */
2989                         for (tmp = cut; tmp <= high; ++(tmp)) {
2990                                 key.port[1] = htons(tmp);
2991                                 if ((toeplitz_piecemeal_port(key.port[1]) ^
2992                                      toeplitz_sport) & ncpus2_mask) {
2993                                         continue;
2994                                 }
2995                                 if (pf_find_state_all(&key, PF_IN, NULL) ==
2996                                     NULL && !in_baddynamic(tmp, proto)) {
2997                                         if (proto == IPPROTO_UDP)
2998                                                 pd->not_cpu_localized = 1;
2999                                         *nport = htons(tmp);
3000                                         return (0);
3001                                 }
3002                         }
3003                         for (tmp = cut - 1; tmp >= low; --(tmp)) {
3004                                 key.port[1] = htons(tmp);
3005                                 if ((toeplitz_piecemeal_port(key.port[1]) ^
3006                                      toeplitz_sport) & ncpus2_mask) {
3007                                         continue;
3008                                 }
3009                                 if (pf_find_state_all(&key, PF_IN, NULL) ==
3010                                     NULL && !in_baddynamic(tmp, proto)) {
3011                                         if (proto == IPPROTO_UDP)
3012                                                 pd->not_cpu_localized = 1;
3013                                         *nport = htons(tmp);
3014                                         return (0);
3015                                 }
3016                         }
3017                 }
3018
3019                 /*
3020                  * Next address
3021                  */
3022                 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
3023                 case PF_POOL_RANDOM:
3024                 case PF_POOL_ROUNDROBIN:
3025                         if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
3026                                 return (1);
3027                         break;
3028                 case PF_POOL_NONE:
3029                 case PF_POOL_SRCHASH:
3030                 case PF_POOL_BITMASK:
3031                 default:
3032                         return (1);
3033                 }
3034         } while (! PF_AEQ(&init_addr, naddr, af) );
3035         return (1);                                     /* none available */
3036 }
3037
3038 struct pf_rule *
3039 pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
3040     int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
3041     struct pf_addr *daddr, u_int16_t dport, int rs_num)
3042 {
3043         struct pf_rule          *r, *rm = NULL;
3044         struct pf_ruleset       *ruleset = NULL;
3045         int                      tag = -1;
3046         int                      rtableid = -1;
3047         int                      asd = 0;
3048
3049         r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
3050         while (r && rm == NULL) {
3051                 struct pf_rule_addr     *src = NULL, *dst = NULL;
3052                 struct pf_addr_wrap     *xdst = NULL;
3053                 struct pf_pooladdr      *cur;
3054
3055                 if (r->action == PF_BINAT && direction == PF_IN) {
3056                         src = &r->dst;
3057                         cur = r->rpool.cur;     /* SMP race possible */
3058                         cpu_ccfence();
3059                         if (cur)
3060                                 xdst = &cur->addr;
3061                 } else {
3062                         src = &r->src;
3063                         dst = &r->dst;
3064                 }
3065
3066                 r->evaluations++;
3067                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3068                         r = r->skip[PF_SKIP_IFP].ptr;
3069                 else if (r->direction && r->direction != direction)
3070                         r = r->skip[PF_SKIP_DIR].ptr;
3071                 else if (r->af && r->af != pd->af)
3072                         r = r->skip[PF_SKIP_AF].ptr;
3073                 else if (r->proto && r->proto != pd->proto)
3074                         r = r->skip[PF_SKIP_PROTO].ptr;
3075                 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
3076                     src->neg, kif))
3077                         r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
3078                             PF_SKIP_DST_ADDR].ptr;
3079                 else if (src->port_op && !pf_match_port(src->port_op,
3080                     src->port[0], src->port[1], sport))
3081                         r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
3082                             PF_SKIP_DST_PORT].ptr;
3083                 else if (dst != NULL &&
3084                     PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
3085                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3086                 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
3087                     0, NULL))
3088                         r = TAILQ_NEXT(r, entries);
3089                 else if (dst != NULL && dst->port_op &&
3090                     !pf_match_port(dst->port_op, dst->port[0],
3091                     dst->port[1], dport))
3092                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3093                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3094                         r = TAILQ_NEXT(r, entries);
3095                 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
3096                     IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
3097                     off, pd->hdr.tcp), r->os_fingerprint)))
3098                         r = TAILQ_NEXT(r, entries);
3099                 else {
3100                         if (r->tag)
3101                                 tag = r->tag;
3102                         if (r->rtableid >= 0)
3103                                 rtableid = r->rtableid;
3104                         if (r->anchor == NULL) {
3105                                 rm = r;
3106                         } else
3107                                 pf_step_into_anchor(&asd, &ruleset, rs_num,
3108                                     &r, NULL, NULL);
3109                 }
3110                 if (r == NULL)
3111                         pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
3112                             NULL, NULL);
3113         }
3114         if (pf_tag_packet(m, tag, rtableid))
3115                 return (NULL);
3116         if (rm != NULL && (rm->action == PF_NONAT ||
3117             rm->action == PF_NORDR || rm->action == PF_NOBINAT))
3118                 return (NULL);
3119         return (rm);
3120 }
3121
3122 struct pf_rule *
3123 pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
3124     struct pfi_kif *kif, struct pf_src_node **sn,
3125     struct pf_state_key **skw, struct pf_state_key **sks,
3126     struct pf_state_key **skp, struct pf_state_key **nkp,
3127     struct pf_addr *saddr, struct pf_addr *daddr,
3128     u_int16_t sport, u_int16_t dport)
3129 {
3130         struct pf_rule  *r = NULL;
3131
3132         if (direction == PF_OUT) {
3133                 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3134                     sport, daddr, dport, PF_RULESET_BINAT);
3135                 if (r == NULL)
3136                         r = pf_match_translation(pd, m, off, direction, kif,
3137                             saddr, sport, daddr, dport, PF_RULESET_NAT);
3138         } else {
3139                 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3140                     sport, daddr, dport, PF_RULESET_RDR);
3141                 if (r == NULL)
3142                         r = pf_match_translation(pd, m, off, direction, kif,
3143                             saddr, sport, daddr, dport, PF_RULESET_BINAT);
3144         }
3145
3146         if (r != NULL) {
3147                 struct pf_addr  *naddr;
3148                 u_int16_t       *nport;
3149
3150                 if (pf_state_key_setup(pd, r, skw, sks, skp, nkp,
3151                     saddr, daddr, sport, dport))
3152                         return r;
3153
3154                 /* XXX We only modify one side for now. */
3155                 naddr = &(*nkp)->addr[1];
3156                 nport = &(*nkp)->port[1];
3157
3158                 /*
3159                  * NOTE: Currently all translations will clear
3160                  *       BRIDGE_MBUF_TAGGED, telling the bridge to
3161                  *       ignore the original input encapsulation.
3162                  */
3163                 switch (r->action) {
3164                 case PF_NONAT:
3165                 case PF_NOBINAT:
3166                 case PF_NORDR:
3167                         return (NULL);
3168                 case PF_NAT:
3169                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3170                         if (pf_get_sport(pd, pd->af, pd->proto, r,
3171                             saddr, daddr, sport, dport,
3172                             naddr, nport, r->rpool.proxy_port[0],
3173                             r->rpool.proxy_port[1], sn)) {
3174                                 DPFPRINTF(PF_DEBUG_MISC,
3175                                     ("pf: NAT proxy port allocation "
3176                                     "(%u-%u) failed\n",
3177                                     r->rpool.proxy_port[0],
3178                                     r->rpool.proxy_port[1]));
3179                                 return (NULL);
3180                         }
3181                         break;
3182                 case PF_BINAT:
3183                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3184                         switch (direction) {
3185                         case PF_OUT:
3186                                 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
3187                                         switch (pd->af) {
3188 #ifdef INET
3189                                         case AF_INET:
3190                                                 if (r->rpool.cur->addr.p.dyn->
3191                                                     pfid_acnt4 < 1)
3192                                                         return (NULL);
3193                                                 PF_POOLMASK(naddr,
3194                                                     &r->rpool.cur->addr.p.dyn->
3195                                                     pfid_addr4,
3196                                                     &r->rpool.cur->addr.p.dyn->
3197                                                     pfid_mask4,
3198                                                     saddr, AF_INET);
3199                                                 break;
3200 #endif /* INET */
3201 #ifdef INET6
3202                                         case AF_INET6:
3203                                                 if (r->rpool.cur->addr.p.dyn->
3204                                                     pfid_acnt6 < 1)
3205                                                         return (NULL);
3206                                                 PF_POOLMASK(naddr,
3207                                                     &r->rpool.cur->addr.p.dyn->
3208                                                     pfid_addr6,
3209                                                     &r->rpool.cur->addr.p.dyn->
3210                                                     pfid_mask6,
3211                                                     saddr, AF_INET6);
3212                                                 break;
3213 #endif /* INET6 */
3214                                         }
3215                                 } else
3216                                         PF_POOLMASK(naddr,
3217                                             &r->rpool.cur->addr.v.a.addr,
3218                                             &r->rpool.cur->addr.v.a.mask,
3219                                             saddr, pd->af);
3220                                 break;
3221                         case PF_IN:
3222                                 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
3223                                         switch (pd->af) {
3224 #ifdef INET
3225                                         case AF_INET:
3226                                                 if (r->src.addr.p.dyn->
3227                                                     pfid_acnt4 < 1)
3228                                                         return (NULL);
3229                                                 PF_POOLMASK(naddr,
3230                                                     &r->src.addr.p.dyn->
3231                                                     pfid_addr4,
3232                                                     &r->src.addr.p.dyn->
3233                                                     pfid_mask4,
3234                                                     daddr, AF_INET);
3235                                                 break;
3236 #endif /* INET */
3237 #ifdef INET6
3238                                         case AF_INET6:
3239                                                 if (r->src.addr.p.dyn->
3240                                                     pfid_acnt6 < 1)
3241                                                         return (NULL);
3242                                                 PF_POOLMASK(naddr,
3243                                                     &r->src.addr.p.dyn->
3244                                                     pfid_addr6,
3245                                                     &r->src.addr.p.dyn->
3246                                                     pfid_mask6,
3247                                                     daddr, AF_INET6);
3248                                                 break;
3249 #endif /* INET6 */
3250                                         }
3251                                 } else
3252                                         PF_POOLMASK(naddr,
3253                                             &r->src.addr.v.a.addr,
3254                                             &r->src.addr.v.a.mask, daddr,
3255                                             pd->af);
3256                                 break;
3257                         }
3258                         break;
3259                 case PF_RDR: {
3260                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3261                         if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
3262                                 return (NULL);
3263                         if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
3264                             PF_POOL_BITMASK)
3265                                 PF_POOLMASK(naddr, naddr,
3266                                     &r->rpool.cur->addr.v.a.mask, daddr,
3267                                     pd->af);
3268
3269                         if (r->rpool.proxy_port[1]) {
3270                                 u_int32_t       tmp_nport;
3271
3272                                 tmp_nport = ((ntohs(dport) -
3273                                     ntohs(r->dst.port[0])) %
3274                                     (r->rpool.proxy_port[1] -
3275                                     r->rpool.proxy_port[0] + 1)) +
3276                                     r->rpool.proxy_port[0];
3277
3278                                 /* wrap around if necessary */
3279                                 if (tmp_nport > 65535)
3280                                         tmp_nport -= 65535;
3281                                 *nport = htons((u_int16_t)tmp_nport);
3282                         } else if (r->rpool.proxy_port[0]) {
3283                                 *nport = htons(r->rpool.proxy_port[0]);
3284                         }
3285                         pd->not_cpu_localized = 1;
3286                         break;
3287                 }
3288                 default:
3289                         return (NULL);
3290                 }
3291         }
3292
3293         return (r);
3294 }
3295
3296 struct netmsg_hashlookup {
3297         struct netmsg_base      base;
3298         struct inpcb            **nm_pinp;
3299         struct inpcbinfo        *nm_pcbinfo;
3300         struct pf_addr          *nm_saddr;
3301         struct pf_addr          *nm_daddr;
3302         uint16_t                nm_sport;
3303         uint16_t                nm_dport;
3304         sa_family_t             nm_af;
3305 };
3306
#ifdef PF_SOCKET_LOOKUP_DOMSG
/*
 * Message handler: run the pcb hash lookup described by a
 * struct netmsg_hashlookup, store the result through nm_pinp and reply
 * to the sender.
 *
 * AF_INET requests use in_pcblookup_hash(); with INET6 compiled in every
 * other family uses in6_pcblookup_hash().  Without INET6 a non-AF_INET
 * request leaves *nm_pinp untouched.
 */
static void
in_pcblookup_hash_handler(netmsg_t msg)
{
	struct netmsg_hashlookup *lk = (struct netmsg_hashlookup *)msg;

	if (lk->nm_af == AF_INET) {
		*lk->nm_pinp = in_pcblookup_hash(lk->nm_pcbinfo,
		    lk->nm_saddr->v4, lk->nm_sport,
		    lk->nm_daddr->v4, lk->nm_dport,
		    INPLOOKUP_WILDCARD, NULL);
	}
#ifdef INET6
	else {
		*lk->nm_pinp = in6_pcblookup_hash(lk->nm_pcbinfo,
		    &lk->nm_saddr->v6, lk->nm_sport,
		    &lk->nm_daddr->v6, lk->nm_dport,
		    INPLOOKUP_WILDCARD, NULL);
	}
#endif /* INET6 */

	lwkt_replymsg(&lk->base.lmsg, 0);
}
#endif	/* PF_SOCKET_LOOKUP_DOMSG */
3326
3327 int
3328 pf_socket_lookup(int direction, struct pf_pdesc *pd)
3329 {
3330         struct pf_addr          *saddr, *daddr;
3331         u_int16_t                sport, dport;
3332         struct inpcbinfo        *pi;
3333         struct inpcb            *inp;
3334         struct netmsg_hashlookup *msg = NULL;
3335 #ifdef PF_SOCKET_LOOKUP_DOMSG
3336         struct netmsg_hashlookup msg0;
3337 #endif
3338         int                      pi_cpu = 0;
3339
3340         if (pd == NULL)
3341                 return (-1);
3342         pd->lookup.uid = UID_MAX;
3343         pd->lookup.gid = GID_MAX;
3344         pd->lookup.pid = NO_PID;
3345         if (direction == PF_IN) {
3346                 saddr = pd->src;
3347                 daddr = pd->dst;
3348         } else {
3349                 saddr = pd->dst;
3350                 daddr = pd->src;
3351         }
3352         switch (pd->proto) {
3353         case IPPROTO_TCP:
3354                 if (pd->hdr.tcp == NULL)
3355                         return (-1);
3356                 sport = pd->hdr.tcp->th_sport;
3357                 dport = pd->hdr.tcp->th_dport;
3358
3359                 pi_cpu = tcp_addrcpu(saddr->v4.s_addr, sport, daddr->v4.s_addr, dport);
3360                 pi = &tcbinfo[pi_cpu];
3361                 /*
3362                  * Our netstack runs lockless on MP systems
3363                  * (only for TCP connections at the moment).
3364                  * 
3365                  * As we are not allowed to read another CPU's tcbinfo,
3366                  * we have to ask that CPU via remote call to search the
3367                  * table for us.
3368                  * 
3369                  * Prepare a msg iff data belongs to another CPU.
3370                  */
3371                 if (pi_cpu != mycpu->gd_cpuid) {
3372 #ifdef PF_SOCKET_LOOKUP_DOMSG
3373                         /*
3374                          * NOTE:
3375                          *
3376                          * Following lwkt_domsg() is dangerous and could
3377                          * lockup the network system, e.g.
3378                          *
3379                          * On 2 CPU system:
3380                          * netisr0 domsg to netisr1 (due to lookup)
3381                          * netisr1 domsg to netisr0 (due to lookup)
3382                          *
3383                          * We simply return -1 here, since we are probably
3384                          * called before NAT, so the TCP packet should
3385                          * already be on the correct CPU.
3386                          */
3387                         msg = &msg0;
3388                         netmsg_init(&msg->base, NULL, &curthread->td_msgport,
3389                                     0, in_pcblookup_hash_handler);
3390                         msg->nm_pinp = &inp;
3391                         msg->nm_pcbinfo = pi;
3392                         msg->nm_saddr = saddr;
3393                         msg->nm_sport = sport;
3394                         msg->nm_daddr = daddr;
3395                         msg->nm_dport = dport;
3396                         msg->nm_af = pd->af;
3397 #else   /* !PF_SOCKET_LOOKUP_DOMSG */
3398                         kprintf("pf_socket_lookup: tcp packet not on the "
3399                                 "correct cpu %d, cur cpu %d\n",
3400                                 pi_cpu, mycpuid);
3401                         print_backtrace(-1);
3402                         return -1;
3403 #endif  /* PF_SOCKET_LOOKUP_DOMSG */
3404                 }
3405                 break;
3406         case IPPROTO_UDP:
3407                 if (pd->hdr.udp == NULL)
3408                         return (-1);
3409                 sport = pd->hdr.udp->uh_sport;
3410                 dport = pd->hdr.udp->uh_dport;
3411                 pi = &udbinfo[mycpuid];
3412                 break;
3413         default:
3414                 return (-1);
3415         }
3416         if (direction != PF_IN) {
3417                 u_int16_t       p;
3418
3419                 p = sport;
3420                 sport = dport;
3421                 dport = p;
3422         }
3423         switch (pd->af) {
3424 #ifdef INET6
3425         case AF_INET6:
3426                 /*
3427                  * Query other CPU, second part
3428                  * 
3429                  * msg only gets initialized when:
3430                  * 1) packet is TCP
3431                  * 2) the info belongs to another CPU
3432                  *
3433                  * Use some switch/case magic to avoid code duplication.
3434                  */
3435                 if (msg == NULL) {
3436                         inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
3437                             &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
3438
3439                         if (inp == NULL)
3440                                 return (-1);
3441                         break;
3442                 }
3443                 /* FALLTHROUGH if SMP and on other CPU */
3444 #endif /* INET6 */
3445         case AF_INET:
3446                 if (msg != NULL) {
3447                         lwkt_domsg(netisr_cpuport(pi_cpu),
3448                                      &msg->base.lmsg, 0);
3449                 } else
3450                 {
3451                         inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
3452                             dport, INPLOOKUP_WILDCARD, NULL);
3453                 }
3454                 if (inp == NULL)
3455                         return (-1);
3456                 break;
3457
3458         default:
3459                 return (-1);
3460         }
3461         pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
3462         pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
3463         return (1);
3464 }
3465
3466 u_int8_t
3467 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3468 {
3469         int              hlen;
3470         u_int8_t         hdr[60];
3471         u_int8_t        *opt, optlen;
3472         u_int8_t         wscale = 0;
3473
3474         hlen = th_off << 2;             /* hlen <= sizeof(hdr) */
3475         if (hlen <= sizeof(struct tcphdr))
3476                 return (0);
3477         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3478                 return (0);
3479         opt = hdr + sizeof(struct tcphdr);
3480         hlen -= sizeof(struct tcphdr);
3481         while (hlen >= 3) {
3482                 switch (*opt) {
3483                 case TCPOPT_EOL:
3484                 case TCPOPT_NOP:
3485                         ++opt;
3486                         --hlen;
3487                         break;
3488                 case TCPOPT_WINDOW:
3489                         wscale = opt[2];
3490                         if (wscale > TCP_MAX_WINSHIFT)
3491                                 wscale = TCP_MAX_WINSHIFT;
3492                         wscale |= PF_WSCALE_FLAG;
3493                         /* FALLTHROUGH */
3494                 default:
3495                         optlen = opt[1];
3496                         if (optlen < 2)
3497                                 optlen = 2;
3498                         hlen -= optlen;
3499                         opt += optlen;
3500                         break;
3501                 }
3502         }
3503         return (wscale);
3504 }
3505
3506 u_int16_t
3507 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3508 {
3509         int              hlen;
3510         u_int8_t         hdr[60];
3511         u_int8_t        *opt, optlen;
3512         u_int16_t        mss = tcp_mssdflt;
3513
3514         hlen = th_off << 2;     /* hlen <= sizeof(hdr) */
3515         if (hlen <= sizeof(struct tcphdr))
3516                 return (0);
3517         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3518                 return (0);
3519         opt = hdr + sizeof(struct tcphdr);
3520         hlen -= sizeof(struct tcphdr);
3521         while (hlen >= TCPOLEN_MAXSEG) {
3522                 switch (*opt) {
3523                 case TCPOPT_EOL:
3524                 case TCPOPT_NOP:
3525                         ++opt;
3526                         --hlen;
3527                         break;
3528                 case TCPOPT_MAXSEG:
3529                         bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
3530                         /* FALLTHROUGH */
3531                 default:
3532                         optlen = opt[1];
3533                         if (optlen < 2)
3534                                 optlen = 2;
3535                         hlen -= optlen;
3536                         opt += optlen;
3537                         break;
3538                 }
3539         }
3540         return (mss);
3541 }
3542
3543 u_int16_t
3544 pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
3545 {
3546 #ifdef INET
3547         struct sockaddr_in      *dst;
3548         struct route             ro;
3549 #endif /* INET */
3550 #ifdef INET6
3551         struct sockaddr_in6     *dst6;
3552         struct route_in6         ro6;
3553 #endif /* INET6 */
3554         struct rtentry          *rt = NULL;
3555         int                      hlen = 0;
3556         u_int16_t                mss = tcp_mssdflt;
3557
3558         switch (af) {
3559 #ifdef INET
3560         case AF_INET:
3561                 hlen = sizeof(struct ip);
3562                 bzero(&ro, sizeof(ro));
3563                 dst = (struct sockaddr_in *)&ro.ro_dst;
3564                 dst->sin_family = AF_INET;
3565                 dst->sin_len = sizeof(*dst);
3566                 dst->sin_addr = addr->v4;
3567                 rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING));
3568                 rt = ro.ro_rt;
3569                 break;
3570 #endif /* INET */
3571 #ifdef INET6
3572         case AF_INET6:
3573                 hlen = sizeof(struct ip6_hdr);
3574                 bzero(&ro6, sizeof(ro6));
3575                 dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
3576                 dst6->sin6_family = AF_INET6;
3577                 dst6->sin6_len = sizeof(*dst6);
3578                 dst6->sin6_addr = addr->v6;
3579                 rtalloc_ign((struct route *)&ro6, (RTF_CLONING | RTF_PRCLONING));
3580                 rt = ro6.ro_rt;
3581                 break;
3582 #endif /* INET6 */
3583         }
3584
3585         if (rt && rt->rt_ifp) {
3586                 mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
3587                 mss = max(tcp_mssdflt, mss);
3588                 RTFREE(rt);
3589         }
3590         mss = min(mss, offer);
3591         mss = max(mss, 64);             /* sanity - at least max opt space */
3592         return (mss);
3593 }
3594
3595 void
3596 pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
3597 {
3598         struct pf_rule *r = s->rule.ptr;
3599
3600         s->rt_kif = NULL;
3601         if (!r->rt || r->rt == PF_FASTROUTE)
3602                 return;
3603         switch (s->key[PF_SK_WIRE]->af) {
3604 #ifdef INET
3605         case AF_INET:
3606                 pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL,
3607                     &s->nat_src_node);
3608                 s->rt_kif = r->rpool.cur->kif;
3609                 break;
3610 #endif /* INET */
3611 #ifdef INET6
3612         case AF_INET6:
3613                 pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL,
3614                     &s->nat_src_node);
3615                 s->rt_kif = r->rpool.cur->kif;
3616                 break;
3617 #endif /* INET6 */
3618         }
3619 }
3620
3621 u_int32_t
3622 pf_tcp_iss(struct pf_pdesc *pd)
3623 {
3624         MD5_CTX ctx;
3625         u_int32_t digest[4];
3626
3627         if (pf_tcp_secret_init == 0) {
3628                 lwkt_gettoken(&pf_gtoken);
3629                 if (pf_tcp_secret_init == 0) {
3630                         karc4rand(pf_tcp_secret, sizeof(pf_tcp_secret));
3631                         MD5Init(&pf_tcp_secret_ctx);
3632                         MD5Update(&pf_tcp_secret_ctx, pf_tcp_secret,
3633                             sizeof(pf_tcp_secret));
3634                         pf_tcp_secret_init = 1;
3635                 }
3636                 lwkt_reltoken(&pf_gtoken);
3637         }
3638         ctx = pf_tcp_secret_ctx;
3639
3640         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3641         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3642         if (pd->af == AF_INET6) {
3643                 MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3644                 MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3645         } else {
3646                 MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3647                 MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3648         }
3649         MD5Final((u_char *)digest, &ctx);
3650         pf_tcp_iss_off += 4096;
3651
3652         return (digest[0] + pd->hdr.tcp->th_seq + pf_tcp_iss_off);
3653 }
3654
3655 int
3656 pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3657     struct pfi_kif *kif, struct mbuf *m, int off, void *h,
3658     struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
3659     struct ifqueue *ifq, struct inpcb *inp)
3660 {
3661         struct pf_rule          *nr = NULL;
3662         struct pf_addr          *saddr = pd->src, *daddr = pd->dst;
3663         sa_family_t              af = pd->af;
3664         struct pf_rule          *r, *a = NULL;
3665         struct pf_ruleset       *ruleset = NULL;
3666         struct pf_src_node      *nsn = NULL;
3667         struct tcphdr           *th = pd->hdr.tcp;
3668         struct pf_state_key     *skw = NULL, *sks = NULL;
3669         struct pf_state_key     *sk = NULL, *nk = NULL;
3670         u_short                  reason;
3671         int                      rewrite = 0, hdrlen = 0;
3672         int                      tag = -1, rtableid = -1;
3673         int                      asd = 0;
3674         int                      match = 0;
3675         int                      state_icmp = 0;
3676         u_int16_t                sport = 0, dport = 0;
3677         u_int16_t                bproto_sum = 0, bip_sum = 0;
3678         u_int8_t                 icmptype = 0, icmpcode = 0;
3679
3680
3681         if (direction == PF_IN && pf_check_congestion(ifq)) {
3682                 REASON_SET(&reason, PFRES_CONGEST);
3683                 return (PF_DROP);
3684         }
3685
3686         if (inp != NULL)
3687                 pd->lookup.done = pf_socket_lookup(direction, pd);
3688         else if (debug_pfugidhack) { 
3689                 DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
3690                 pd->lookup.done = pf_socket_lookup(direction, pd);
3691         }
3692
3693         switch (pd->proto) {
3694         case IPPROTO_TCP:
3695                 sport = th->th_sport;
3696                 dport = th->th_dport;
3697                 hdrlen = sizeof(*th);
3698                 break;
3699         case IPPROTO_UDP:
3700                 sport = pd->hdr.udp->uh_sport;
3701                 dport = pd->hdr.udp->uh_dport;
3702                 hdrlen = sizeof(*pd->hdr.udp);
3703                 break;
3704 #ifdef INET
3705         case IPPROTO_ICMP:
3706                 if (pd->af != AF_INET)
3707                         break;
3708                 sport = dport = pd->hdr.icmp->icmp_id;
3709                 hdrlen = sizeof(*pd->hdr.icmp);
3710                 icmptype = pd->hdr.icmp->icmp_type;
3711                 icmpcode = pd->hdr.icmp->icmp_code;
3712
3713                 if (icmptype == ICMP_UNREACH ||
3714                     icmptype == ICMP_SOURCEQUENCH ||
3715                     icmptype == ICMP_REDIRECT ||
3716                     icmptype == ICMP_TIMXCEED ||
3717                     icmptype == ICMP_PARAMPROB)
3718                         state_icmp++;
3719                 break;
3720 #endif /* INET */
3721 #ifdef INET6
3722         case IPPROTO_ICMPV6:
3723                 if (af != AF_INET6)
3724                         break;
3725                 sport = dport = pd->hdr.icmp6->icmp6_id;
3726                 hdrlen = sizeof(*pd->hdr.icmp6);
3727                 icmptype = pd->hdr.icmp6->icmp6_type;
3728                 icmpcode = pd->hdr.icmp6->icmp6_code;
3729
3730                 if (icmptype == ICMP6_DST_UNREACH ||
3731                     icmptype == ICMP6_PACKET_TOO_BIG ||
3732                     icmptype == ICMP6_TIME_EXCEEDED ||
3733                     icmptype == ICMP6_PARAM_PROB)
3734                         state_icmp++;
3735                 break;
3736 #endif /* INET6 */
3737         default:
3738                 sport = dport = hdrlen = 0;
3739                 break;
3740         }
3741
3742         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3743
3744         /* check packet for BINAT/NAT/RDR */
3745         if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn,
3746             &skw, &sks, &sk, &nk, saddr, daddr, sport, dport)) != NULL) {
3747                 if (nk == NULL || sk == NULL) {
3748                         REASON_SET(&reason, PFRES_MEMORY);
3749                         goto cleanup;
3750                 }
3751
3752                 if (pd->ip_sum)
3753                         bip_sum = *pd->ip_sum;
3754
3755                 m->m_flags &= ~M_HASH;
3756                 switch (pd->proto) {
3757                 case IPPROTO_TCP:
3758                         bproto_sum = th->th_sum;
3759                         pd->proto_sum = &th->th_sum;
3760
3761                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3762                             nk->port[pd->sidx] != sport) {
3763                                 pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3764                                     &th->th_sum, &nk->addr[pd->sidx],
3765                                     nk->port[pd->sidx], 0, af);
3766                                 pd->sport = &th->th_sport;
3767                                 sport = th->th_sport;
3768                         }
3769
3770                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3771                             nk->port[pd->didx] != dport) {
3772                                 pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3773                                     &th->th_sum, &nk->addr[pd->didx],
3774                                     nk->port[pd->didx], 0, af);
3775                                 dport = th->th_dport;
3776                                 pd->dport = &th->th_dport;
3777                         }
3778                         rewrite++;
3779                         break;
3780                 case IPPROTO_UDP:
3781                         bproto_sum = pd->hdr.udp->uh_sum;
3782                         pd->proto_sum = &pd->hdr.udp->uh_sum;
3783
3784                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3785                             nk->port[pd->sidx] != sport) {
3786                                 pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3787                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3788                                     &nk->addr[pd->sidx],
3789                                     nk->port[pd->sidx], 1, af);
3790                                 sport = pd->hdr.udp->uh_sport;
3791                                 pd->sport = &pd->hdr.udp->uh_sport;
3792                         }
3793
3794                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3795                             nk->port[pd->didx] != dport) {
3796                                 pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3797                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3798                                     &nk->addr[pd->didx],
3799                                     nk->port[pd->didx], 1, af);
3800                                 dport = pd->hdr.udp->uh_dport;
3801                                 pd->dport = &pd->hdr.udp->uh_dport;
3802                         }
3803                         rewrite++;
3804                         break;
3805 #ifdef INET
3806                 case IPPROTO_ICMP:
3807                         nk->port[0] = nk->port[1];
3808                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3809                                 pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3810                                     nk->addr[pd->sidx].v4.s_addr, 0);
3811
3812                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3813                                 pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3814                                     nk->addr[pd->didx].v4.s_addr, 0);
3815
3816                         if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3817                                 pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3818                                     pd->hdr.icmp->icmp_cksum, sport,
3819                                     nk->port[1], 0);
3820                                 pd->hdr.icmp->icmp_id = nk->port[1];
3821                                 pd->sport = &pd->hdr.icmp->icmp_id;
3822                         }
3823                         m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3824                         break;
3825 #endif /* INET */
3826 #ifdef INET6
3827                 case IPPROTO_ICMPV6:
3828                         nk->port[0] = nk->port[1];
3829                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3830                                 pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3831                                     &nk->addr[pd->sidx], 0);
3832
3833                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3834                                 pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3835                                     &nk->addr[pd->didx], 0);
3836                         rewrite++;
3837                         break;
3838 #endif /* INET */
3839                 default:
3840                         switch (af) {
3841 #ifdef INET
3842                         case AF_INET:
3843                                 if (PF_ANEQ(saddr,
3844                                     &nk->addr[pd->sidx], AF_INET))
3845                                         pf_change_a(&saddr->v4.s_addr,
3846                                             pd->ip_sum,
3847                                             nk->addr[pd->sidx].v4.s_addr, 0);
3848
3849                                 if (PF_ANEQ(daddr,
3850                                     &nk->addr[pd->didx], AF_INET))
3851                                         pf_change_a(&daddr->v4.s_addr,
3852                                             pd->ip_sum,
3853                                             nk->addr[pd->didx].v4.s_addr, 0);
3854                                 break;
3855 #endif /* INET */
3856 #ifdef INET6
3857                         case AF_INET6:
3858                                 if (PF_ANEQ(saddr,
3859                                     &nk->addr[pd->sidx], AF_INET6))
3860                                         PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3861
3862                                 if (PF_ANEQ(daddr,
3863                                     &nk->addr[pd->didx], AF_INET6))
3864                                         PF_ACPY(saddr, &nk->addr[pd->didx], af);
3865                                 break;
3866 #endif /* INET */
3867                         }
3868                         break;
3869                 }
3870                 if (nr->natpass)
3871                         r = NULL;
3872                 pd->nat_rule = nr;
3873         }
3874
3875         while (r != NULL) {
3876                 r->evaluations++;
3877                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3878                         r = r->skip[PF_SKIP_IFP].ptr;
3879                 else if (r->direction && r->direction != direction)
3880                         r = r->skip[PF_SKIP_DIR].ptr;
3881                 else if (r->af && r->af != af)
3882                         r = r->skip[PF_SKIP_AF].ptr;
3883                 else if (r->proto && r->proto != pd->proto)
3884                         r = r->skip[PF_SKIP_PROTO].ptr;
3885                 else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3886                     r->src.neg, kif))
3887                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3888                 /* tcp/udp only. port_op always 0 in other cases */
3889                 else if (r->src.port_op && !pf_match_port(r->src.port_op,
3890                     r->src.port[0], r->src.port[1], sport))
3891                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
3892                 else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3893                     r->dst.neg, NULL))
3894                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3895                 /* tcp/udp only. port_op always 0 in other cases */
3896                 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3897                     r->dst.port[0], r->dst.port[1], dport))
3898                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3899                 /* icmp only. type always 0 in other cases */
3900                 else if (r->type && r->type != icmptype + 1)
3901                         r = TAILQ_NEXT(r, entries);
3902                 /* icmp only. type always 0 in other cases */
3903                 else if (r->code && r->code != icmpcode + 1)
3904                         r = TAILQ_NEXT(r, entries);
3905                 else if (r->tos && !(r->tos == pd->tos))
3906                         r = TAILQ_NEXT(r, entries);
3907                 else if (r->rule_flag & PFRULE_FRAGMENT)
3908                         r = TAILQ_NEXT(r, entries);
3909                 else if (pd->proto == IPPROTO_TCP &&
3910                     (r->flagset & th->th_flags) != r->flags)
3911                         r = TAILQ_NEXT(r, entries);
3912                 /* tcp/udp only. uid.op always 0 in other cases */
3913                 else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3914                     pf_socket_lookup(direction, pd), 1)) &&
3915                     !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3916                     pd->lookup.uid))
3917                         r = TAILQ_NEXT(r, entries);
3918                 /* tcp/udp only. gid.op always 0 in other cases */
3919                 else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3920                     pf_socket_lookup(direction, pd), 1)) &&
3921                     !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3922                     pd->lookup.gid))
3923                         r = TAILQ_NEXT(r, entries);
3924                 else if (r->prob &&
3925                   r->prob <= karc4random())
3926                         r = TAILQ_NEXT(r, entries);
3927                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3928                         r = TAILQ_NEXT(r, entries);
3929                 else if (r->os_fingerprint != PF_OSFP_ANY &&
3930                     (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3931                     pf_osfp_fingerprint(pd, m, off, th),
3932                     r->os_fingerprint)))
3933                         r = TAILQ_NEXT(r, entries);
3934                 else {
3935                         if (r->tag)
3936                                 tag = r->tag;
3937                         if (r->rtableid >= 0)
3938                                 rtableid = r->rtableid;
3939                         if (r->anchor == NULL) {
3940                                 match = 1;
3941                                 *rm = r;
3942                                 *am = a;
3943                                 *rsm = ruleset;
3944                                 if ((*rm)->quick)
3945                                         break;
3946                                 r = TAILQ_NEXT(r, entries);
3947                         } else
3948                                 pf_step_into_anchor(&asd, &ruleset,
3949                                     PF_RULESET_FILTER, &r, &a, &match);
3950                 }
3951                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3952                     PF_RULESET_FILTER, &r, &a, &match))
3953                         break;
3954         }
3955         r = *rm;
3956         a = *am;
3957         ruleset = *rsm;
3958
3959         REASON_SET(&reason, PFRES_MATCH);
3960
3961         if (r->log || (nr != NULL && nr->log)) {
3962                 if (rewrite)
3963                         m_copyback(m, off, hdrlen, pd->hdr.any);
3964                 PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
3965                     a, ruleset, pd);
3966         }
3967
3968         if ((r->action == PF_DROP) &&
3969             ((r->rule_flag & PFRULE_RETURNRST) ||
3970             (r->rule_flag & PFRULE_RETURNICMP) ||
3971             (r->rule_flag & PFRULE_RETURN))) {
3972                 /* undo NAT changes, if they have taken place */
3973                 if (nr != NULL) {
3974                         PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3975                         PF_ACPY(daddr, &sk->addr[pd->didx], af);
3976                         if (pd->sport)
3977                                 *pd->sport = sk->port[pd->sidx];
3978                         if (pd->dport)
3979                                 *pd->dport = sk->port[pd->didx];
3980                         if (pd->proto_sum)
3981                                 *pd->proto_sum = bproto_sum;
3982                         if (pd->ip_sum)
3983                                 *pd->ip_sum = bip_sum;
3984                         m_copyback(m, off, hdrlen, pd->hdr.any);
3985                 }
3986                 if (pd->proto == IPPROTO_TCP &&
3987                     ((r->rule_flag & PFRULE_RETURNRST) ||
3988                     (r->rule_flag & PFRULE_RETURN)) &&
3989                     !(th->th_flags & TH_RST)) {
3990                         u_int32_t        ack = ntohl(th->th_seq) + pd->p_len;
3991                         int              len = 0;
3992                         struct ip       *h4;
3993 #ifdef INET6
3994                         struct ip6_hdr  *h6;
3995 #endif
3996                         switch (af) {
3997                         case AF_INET:
3998                                 h4 = mtod(m, struct ip *);
3999                                 len = h4->ip_len - off;
4000                                 break;
4001 #ifdef INET6
4002                         case AF_INET6:
4003                                 h6 = mtod(m, struct ip6_hdr *);
4004                                 len = h6->ip6_plen - (off - sizeof(*h6));
4005                                 break;
4006 #endif
4007                         }
4008
4009                         if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
4010                                 REASON_SET(&reason, PFRES_PROTCKSUM);
4011                         else {
4012                                 if (th->th_flags & TH_SYN)
4013                                         ack++;
4014                                 if (th->th_flags & TH_FIN)
4015                                         ack++;
4016                                 pf_send_tcp(r, af, pd->dst,
4017                                     pd->src, th->th_dport, th->th_sport,
4018                                     ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
4019                                     r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
4020                         }
4021                 } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
4022                     r->return_icmp)
4023                         pf_send_icmp(m, r->return_icmp >> 8,
4024                             r->return_icmp & 255, af, r);
4025                 else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
4026                     r->return_icmp6)
4027                         pf_send_icmp(m, r->return_icmp6 >> 8,
4028                             r->return_icmp6 & 255, af, r);
4029         }
4030
4031         if (r->action == PF_DROP)
4032                 goto cleanup;
4033
4034         if (pf_tag_packet(m, tag, rtableid)) {
4035                 REASON_SET(&reason, PFRES_MEMORY);
4036                 goto cleanup;
4037         }
4038
4039         if (!state_icmp && (r->keep_state || nr != NULL ||
4040             (pd->flags & PFDESC_TCP_NORM))) {
4041                 int action;
4042                 action = pf_create_state(r, nr, a, pd, nsn, skw, sks, nk, sk, m,
4043                     off, sport, dport, &rewrite, kif, sm, tag, bproto_sum,
4044                     bip_sum, hdrlen);
4045                 if (action != PF_PASS)
4046                         return (action);
4047         }
4048
4049         /* copy back packet headers if we performed NAT operations */
4050         if (rewrite)
4051                 m_copyback(m, off, hdrlen, pd->hdr.any);
4052
4053         return (PF_PASS);
4054
4055 cleanup:
4056         if (sk != NULL)
4057                 kfree(sk, M_PFSTATEKEYPL);
4058         if (nk != NULL)
4059                 kfree(nk, M_PFSTATEKEYPL);
4060         return (PF_DROP);
4061 }
4062
4063 static __inline int
4064 pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
4065     struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *skw,
4066     struct pf_state_key *sks, struct pf_state_key *nk, struct pf_state_key *sk,
4067     struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite,
4068     struct pfi_kif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum,
4069     u_int16_t bip_sum, int hdrlen)
4070 {
4071         struct pf_state         *s = NULL;
4072         struct pf_src_node      *sn = NULL;
4073         struct tcphdr           *th = pd->hdr.tcp;
4074         u_int16_t                mss = tcp_mssdflt;
4075         u_short                  reason;
4076         int cpu = mycpu->gd_cpuid;
4077
4078         /* check maximums */
4079         if (r->max_states && (r->states_cur >= r->max_states)) {
4080                 pf_status.lcounters[LCNT_STATES]++;
4081                 REASON_SET(&reason, PFRES_MAXSTATES);
4082                 return (PF_DROP);
4083         }
4084         /* src node for filter rule */
4085         if ((r->rule_flag & PFRULE_SRCTRACK ||
4086             r->rpool.opts & PF_POOL_STICKYADDR) &&
4087             pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
4088                 REASON_SET(&reason, PFRES_SRCLIMIT);
4089                 goto csfailed;
4090         }
4091         /* src node for translation rule */
4092         if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
4093             pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
4094                 REASON_SET(&reason, PFRES_SRCLIMIT);
4095                 goto csfailed;
4096         }
4097         s = kmalloc(sizeof(struct pf_state), M_PFSTATEPL, M_NOWAIT|M_ZERO);
4098         if (s == NULL) {
4099                 REASON_SET(&reason, PFRES_MEMORY);
4100                 goto csfailed;
4101         }
4102         lockinit(&s->lk, "pfstlk", 0, 0);
4103         s->id = 0; /* XXX Do we really need that? not in OpenBSD */
4104         s->creatorid = 0;
4105         s->rule.ptr = r;
4106         s->nat_rule.ptr = nr;
4107         s->anchor.ptr = a;
4108         s->state_flags = PFSTATE_CREATEINPROG;
4109         STATE_INC_COUNTERS(s);
4110         if (r->allow_opts)
4111                 s->state_flags |= PFSTATE_ALLOWOPTS;
4112         if (r->rule_flag & PFRULE_STATESLOPPY)
4113                 s->state_flags |= PFSTATE_SLOPPY;
4114         if (pd->not_cpu_localized)
4115                 s->state_flags |= PFSTATE_STACK_GLOBAL;
4116
4117         s->log = r->log & PF_LOG_ALL;
4118         if (nr != NULL)
4119                 s->log |= nr->log & PF_LOG_ALL;
4120         switch (pd->proto) {
4121         case IPPROTO_TCP:
4122                 s->src.seqlo = ntohl(th->th_seq);
4123                 s->src.seqhi = s->src.seqlo + pd->p_len + 1;
4124                 if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
4125                     r->keep_state == PF_STATE_MODULATE) {
4126                         /* Generate sequence number modulator */
4127                         if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
4128                             0)
4129                                 s->src.seqdiff = 1;
4130                         pf_change_a(&th->th_seq, &th->th_sum,
4131                             htonl(s->src.seqlo + s->src.seqdiff), 0);
4132                         *rewrite = 1;
4133                 } else
4134                         s->src.seqdiff = 0;
4135                 if (th->th_flags & TH_SYN) {
4136                         s->src.seqhi++;
4137                         s->src.wscale = pf_get_wscale(m, off,
4138                             th->th_off, pd->af);
4139                 }
4140                 s->src.max_win = MAX(ntohs(th->th_win), 1);
4141                 if (s->src.wscale & PF_WSCALE_MASK) {
4142                         /* Remove scale factor from initial window */
4143                         int win = s->src.max_win;
4144                         win += 1 << (s->src.wscale & PF_WSCALE_MASK);
4145                         s->src.max_win = (win - 1) >>
4146                             (s->src.wscale & PF_WSCALE_MASK);
4147                 }
4148                 if (th->th_flags & TH_FIN)
4149                         s->src.seqhi++;
4150                 s->dst.seqhi = 1;
4151                 s->dst.max_win = 1;
4152                 s->src.state = TCPS_SYN_SENT;
4153                 s->dst.state = TCPS_CLOSED;
4154                 s->timeout = PFTM_TCP_FIRST_PACKET;
4155                 break;
4156         case IPPROTO_UDP:
4157                 s->src.state = PFUDPS_SINGLE;
4158                 s->dst.state = PFUDPS_NO_TRAFFIC;
4159                 s->timeout = PFTM_UDP_FIRST_PACKET;
4160                 break;
4161         case IPPROTO_ICMP:
4162 #ifdef INET6
4163         case IPPROTO_ICMPV6:
4164 #endif
4165                 s->timeout = PFTM_ICMP_FIRST_PACKET;
4166                 break;
4167         default:
4168                 s->src.state = PFOTHERS_SINGLE;
4169                 s->dst.state = PFOTHERS_NO_TRAFFIC;
4170                 s->timeout = PFTM_OTHER_FIRST_PACKET;
4171         }
4172
4173         s->creation = time_second;
4174         s->expire = time_second;
4175
4176         if (sn != NULL) {
4177                 s->src_node = sn;
4178                 s->src_node->states++;
4179         }
4180         if (nsn != NULL) {
4181                 /* XXX We only modify one side for now. */
4182                 PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
4183                 s->nat_src_node = nsn;
4184                 s->nat_src_node->states++;
4185         }
4186         if (pd->proto == IPPROTO_TCP) {
4187                 if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
4188                     off, pd, th, &s->src, &s->dst)) {
4189                         REASON_SET(&reason, PFRES_MEMORY);
4190                         pf_src_tree_remove_state(s);
4191                         STATE_DEC_COUNTERS(s);
4192                         kfree(s, M_PFSTATEPL);
4193                         return (PF_DROP);
4194                 }
4195                 if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
4196                     pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
4197                     &s->src, &s->dst, rewrite)) {
4198                         /* This really shouldn't happen!!! */
4199                         DPFPRINTF(PF_DEBUG_URGENT,
4200                             ("pf_normalize_tcp_stateful failed on first pkt"));
4201                         pf_normalize_tcp_cleanup(s);
4202                         pf_src_tree_remove_state(s);
4203                         STATE_DEC_COUNTERS(s);
4204                         kfree(s, M_PFSTATEPL);
4205                         return (PF_DROP);
4206                 }
4207         }
4208         s->direction = pd->dir;
4209
4210         if (sk == NULL && pf_state_key_setup(pd, nr, &skw, &sks, &sk, &nk,
4211                                              pd->src, pd->dst, sport, dport)) {
4212                 REASON_SET(&reason, PFRES_MEMORY);
4213                 goto csfailed;
4214         }
4215
4216         if (pf_state_insert(BOUND_IFACE(r, kif), skw, sks, s)) {
4217                 if (pd->proto == IPPROTO_TCP)
4218                         pf_normalize_tcp_cleanup(s);
4219                 REASON_SET(&reason, PFRES_STATEINS);
4220                 pf_src_tree_remove_state(s);
4221                 STATE_DEC_COUNTERS(s);
4222                 kfree(s, M_PFSTATEPL);
4223                 return (PF_DROP);
4224         } else
4225                 *sm = s;
4226
4227         pf_set_rt_ifp(s, pd->src);      /* needs s->state_key set */
4228         if (tag > 0) {
4229                 pf_tag_ref(tag);
4230                 s->tag = tag;
4231         }
4232         if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
4233             TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
4234                 s->src.state = PF_TCPS_PROXY_SRC;
4235                 /* undo NAT changes, if they have taken place */
4236                 if (nr != NULL) {
4237                         struct pf_state_key *skt = s->key[PF_SK_WIRE];
4238                         if (pd->dir == PF_OUT)
4239                                 skt = s->key[PF_SK_STACK];
4240                         PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
4241                         PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
4242                         if (pd->sport)
4243                                 *pd->sport = skt->port[pd->sidx];
4244                         if (pd->dport)
4245                                 *pd->dport = skt->port[pd->didx];
4246                         if (pd->proto_sum)
4247                                 *pd->proto_sum = bproto_sum;
4248                         if (pd->ip_sum)
4249                                 *pd->ip_sum = bip_sum;
4250                         m->m_flags &= ~M_HASH;
4251                         m_copyback(m, off, hdrlen, pd->hdr.any);
4252                 }
4253                 s->src.seqhi = htonl(karc4random());
4254                 /* Find mss option */
4255                 mss = pf_get_mss(m, off, th->th_off, pd->af);
4256                 mss = pf_calc_mss(pd->src, pd->af, mss);
4257                 mss = pf_calc_mss(pd->dst, pd->af, mss);
4258                 s->src.mss = mss;
4259                 s->state_flags &= ~PFSTATE_CREATEINPROG;
4260                 pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
4261                             th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
4262                             TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
4263                 REASON_SET(&reason, PFRES_SYNPROXY);
4264                 return (PF_SYNPROXY_DROP);
4265         }
4266
4267         s->state_flags &= ~PFSTATE_CREATEINPROG;
4268         return (PF_PASS);
4269
4270 csfailed:
4271         if (sk != NULL)
4272                 kfree(sk, M_PFSTATEKEYPL);
4273         if (nk != NULL)
4274                 kfree(nk, M_PFSTATEKEYPL);
4275
4276         if (sn != NULL && sn->states == 0 && sn->expire == 0) {
4277                 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], sn);
4278                 pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
4279                 atomic_add_int(&pf_status.src_nodes, -1);
4280                 kfree(sn, M_PFSRCTREEPL);
4281         }
4282         if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
4283                 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], nsn);
4284                 pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
4285                 atomic_add_int(&pf_status.src_nodes, -1);
4286                 kfree(nsn, M_PFSRCTREEPL);
4287         }
4288         if (s) {
4289                 pf_src_tree_remove_state(s);
4290                 STATE_DEC_COUNTERS(s);
4291                 kfree(s, M_PFSTATEPL);
4292         }
4293
4294         return (PF_DROP);
4295 }
4296
4297 int
4298 pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
4299     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
4300     struct pf_ruleset **rsm)
4301 {
4302         struct pf_rule          *r, *a = NULL;
4303         struct pf_ruleset       *ruleset = NULL;
4304         sa_family_t              af = pd->af;
4305         u_short                  reason;
4306         int                      tag = -1;
4307         int                      asd = 0;
4308         int                      match = 0;
4309
4310         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
4311         while (r != NULL) {
4312                 r->evaluations++;
4313                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
4314                         r = r->skip[PF_SKIP_IFP].ptr;
4315                 else if (r->direction && r->direction != direction)
4316                         r = r->skip[PF_SKIP_DIR].ptr;
4317                 else if (r->af && r->af != af)
4318                         r = r->skip[PF_SKIP_AF].ptr;
4319                 else if (r->proto && r->proto != pd->proto)
4320                         r = r->skip[PF_SKIP_PROTO].ptr;
4321                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
4322                     r->src.neg, kif))
4323                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
4324                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
4325                     r->dst.neg, NULL))
4326                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
4327                 else if (r->tos && !(r->tos == pd->tos))
4328                         r = TAILQ_NEXT(r, entries);
4329                 else if (r->os_fingerprint != PF_OSFP_ANY)
4330                         r = TAILQ_NEXT(r, entries);
4331                 else if (pd->proto == IPPROTO_UDP &&
4332                     (r->src.port_op || r->dst.port_op))
4333                         r = TAILQ_NEXT(r, entries);
4334                 else if (pd->proto == IPPROTO_TCP &&
4335                     (r->src.port_op || r->dst.port_op || r->flagset))
4336                         r = TAILQ_NEXT(r, entries);
4337                 else if ((pd->proto == IPPROTO_ICMP ||
4338                     pd->proto == IPPROTO_ICMPV6) &&
4339                     (r->type || r->code))
4340                         r = TAILQ_NEXT(r, entries);
4341                 else if (r->prob && r->prob <= karc4random())
4342                         r = TAILQ_NEXT(r, entries);
4343                 else if (r->match_tag && !pf_match_tag(m, r, &tag))
4344                         r = TAILQ_NEXT(r, entries);
4345                 else {
4346                         if (r->anchor == NULL) {
4347                                 match = 1;
4348                                 *rm = r;
4349                                 *am = a;
4350                                 *rsm = ruleset;
4351                                 if ((*rm)->quick)
4352                                         break;
4353                                 r = TAILQ_NEXT(r, entries);
4354                         } else
4355                                 pf_step_into_anchor(&asd, &ruleset,
4356                                     PF_RULESET_FILTER, &r, &a, &match);
4357                 }
4358                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
4359                     PF_RULESET_FILTER, &r, &a, &match))
4360                         break;
4361         }
4362         r = *rm;
4363         a = *am;
4364         ruleset = *rsm;
4365
4366         REASON_SET(&reason, PFRES_MATCH);
4367
4368         if (r->log)
4369                 PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset,
4370                     pd);
4371
4372         if (r->action != PF_PASS)
4373                 return (PF_DROP);
4374
4375         if (pf_tag_packet(m, tag, -1)) {
4376                 REASON_SET(&reason, PFRES_MEMORY);
4377                 return (PF_DROP);
4378         }
4379
4380         return (PF_PASS);
4381 }
4382
4383 /*
4384  * Called with state locked
      *
      * Strict TCP tracking for one packet: the packet's sequence and ack
      * numbers are checked against the windows recorded for both peers, then
      * the per-peer TCP states and the state's expire/timeout are advanced.
      *
      *   src, dst  peer records for the sender of this packet and for the
      *             other end (chosen by the caller from the packet direction)
      *   state     tracked state; sync_flags, expire and timeout are updated
      *   kif       only used for kif->pfik_ifp when an RST is sent back
      *   m, off    mbuf and offset, handed to the normalize/wscale/SACK helpers
      *   pd        packet descriptor (TCP header, p_len, flags, af, addresses)
      *   reason    out: drop reason.  Set here to PFRES_MEMORY, PFRES_SRCLIMIT
      *             or PFRES_BADSTATE; it is also handed to
      *             pf_normalize_tcp_stateful()
      *   copyback  out: set to 1 when sequence/ack numbers were rewritten or
      *             pf_modulate_sack() reported a change; the caller then
      *             copies the TCP header back into the mbuf
      *
      * Returns PF_PASS when the packet is accepted, PF_DROP otherwise.
4385  */
4386 int
4387 pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
4388         struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
4389         struct pf_pdesc *pd, u_short *reason, int *copyback)
4390 {
4391         struct tcphdr           *th = pd->hdr.tcp;
4392         u_int16_t                win = ntohs(th->th_win);
4393         u_int32_t                ack, end, seq, orig_seq;
4394         u_int8_t                 sws, dws;
4395         int                      ackskew;
4396
             /*
              * Window scale shifts are only applied when both peers have a
              * wscale recorded and this packet is not a SYN; otherwise the
              * windows are used unscaled.
              */
4397         if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
4398                 sws = src->wscale & PF_WSCALE_MASK;
4399                 dws = dst->wscale & PF_WSCALE_MASK;
4400         } else {
4401                 sws = dws = 0;
4402         }
4403
4404         /*
4405          * Sequence tracking algorithm from Guido van Rooij's paper:
4406          *   http://www.madison-gurkha.com/publications/tcp_filtering/
4407          *      tcp_filtering.ps
4408          */
4409
             /* orig_seq keeps the on-the-wire value; seq may be eased below */
4410         orig_seq = seq = ntohl(th->th_seq);
4411         if (src->seqlo == 0) {
4412                 /* First packet from this end. Set its state */
4413
4414                 if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
4415                     src->scrub == NULL) {
4416                         if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
4417                                 REASON_SET(reason, PFRES_MEMORY);
4418                                 return (PF_DROP);
4419                         }
4420                 }
4421
4422                 /* Deferred generation of sequence number modulator */
4423                 if (dst->seqdiff && !src->seqdiff) {
4424                         /* use random iss for the TCP server */
                             /* loop until the modulator is non-zero */
4425                         while ((src->seqdiff = karc4random() - seq) == 0)
4426                                 ;
4427                         ack = ntohl(th->th_ack) - dst->seqdiff;
4428                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4429                             src->seqdiff), 0);
4430                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4431                         *copyback = 1;
4432                 } else {
4433                         ack = ntohl(th->th_ack);
4434                 }
4435
                     /* end = sequence number just past this segment */
4436                 end = seq + pd->p_len;
4437                 if (th->th_flags & TH_SYN) {
4438                         end++;
4439                         (*state)->sync_flags |= PFSTATE_GOT_SYN2;
4440                         if (dst->wscale & PF_WSCALE_FLAG) {
4441                                 src->wscale = pf_get_wscale(m, off, th->th_off,
4442                                     pd->af);
4443                                 if (src->wscale & PF_WSCALE_FLAG) {
4444                                         /* Remove scale factor from initial
4445                                          * window */
4446                                         sws = src->wscale & PF_WSCALE_MASK;
4447                                         win = ((u_int32_t)win + (1 << sws) - 1)
4448                                             >> sws;
4449                                         dws = dst->wscale & PF_WSCALE_MASK;
4450                                 } else {
4451                                         /* fixup other window */
4452                                         dst->max_win <<= dst->wscale &
4453                                             PF_WSCALE_MASK;
4454                                         /* in case of a retrans SYN|ACK */
4455                                         dst->wscale = 0;
4456                                 }
4457                         }
4458                 }
4459                 if (th->th_flags & TH_FIN)
4460                         end++;
4461
4462                 src->seqlo = seq;
4463                 if (src->state < TCPS_SYN_SENT)
4464                         src->state = TCPS_SYN_SENT;
4465
4466                 /*
4467                  * May need to slide the window (seqhi may have been set by
4468                  * the crappy stack check or if we picked up the connection
4469                  * after establishment)
4470                  */
4471                 if (src->seqhi == 1 ||
4472                     SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
4473                         src->seqhi = end + MAX(1, dst->max_win << dws);
4474                 if (win > src->max_win)
4475                         src->max_win = win;
4476
4477         } else {
                     /* Later packets: undo dst's modulation on the ack number */
4478                 ack = ntohl(th->th_ack) - dst->seqdiff;
4479                 if (src->seqdiff) {
4480                         /* Modulate sequence numbers */
4481                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4482                             src->seqdiff), 0);
4483                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4484                         *copyback = 1;
4485                 }
4486                 end = seq + pd->p_len;
4487                 if (th->th_flags & TH_SYN)
4488                         end++;
4489                 if (th->th_flags & TH_FIN)
4490                         end++;
4491         }
4492
4493         if ((th->th_flags & TH_ACK) == 0) {
4494                 /* Let it pass through the ack skew check */
4495                 ack = dst->seqlo;
4496         } else if ((ack == 0 &&
4497             (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
4498             /* broken tcp stacks do not set ack */
4499             (dst->state < TCPS_SYN_SENT)) {
4500                 /*
4501                  * Many stacks (ours included) will set the ACK number in an
4502                  * FIN|ACK if the SYN times out -- no sequence to ACK.
4503                  */
4504                 ack = dst->seqlo;
4505         }
4506
4507         if (seq == end) {
4508                 /* Ease sequencing restrictions on no data packets */
4509                 seq = src->seqlo;
4510                 end = seq;
4511         }
4512
             /* > 0: the ack is behind dst's recorded seqlo; < 0: ahead of it */
4513         ackskew = dst->seqlo - ack;
4514
4515
4516         /*
4517          * Need to demodulate the sequence numbers in any TCP SACK options
4518          * (Selective ACK). We could optionally validate the SACK values
4519          * against the current ACK window, either forwards or backwards, but
4520          * I'm not confident that SACK has been implemented properly
4521          * everywhere. It wouldn't surprise me if several stacks accidentally
4522          * SACK too far backwards of previously ACKed data. There really aren't
4523          * any security implications of bad SACKing unless the target stack
4524          * doesn't validate the option length correctly. Someone trying to
4525          * spoof into a TCP connection won't bother blindly sending SACK
4526          * options anyway.
4527          */
4528         if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
4529                 if (pf_modulate_sack(m, off, pd, th, dst))
4530                         *copyback = 1;
4531         }
4532
4533
4534 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
             /*
              * Strict window check.  Inside the condition each comment
              * describes the test on the line(s) directly above it.
              */
4535         if (SEQ_GEQ(src->seqhi, end) &&
4536             /* Last octet inside other's window space */
4537             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
4538             /* Retrans: not more than one window back */
4539             (ackskew >= -MAXACKWINDOW) &&
4540             /* Acking not more than one reassembled fragment backwards */
4541             (ackskew <= (MAXACKWINDOW << sws)) &&
4542             /* Acking not more than one window forward */
4543             ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
4544             (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
4545             (pd->flags & PFDESC_IP_REAS) == 0)) {
4546             /* Require an exact/+1 sequence match on resets when possible */
4547
4548                 if (dst->scrub || src->scrub) {
4549                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4550                             *state, src, dst, copyback))
4551                                 return (PF_DROP);
4552                 }
4553
4554                 /* update max window */
4555                 if (src->max_win < win)
4556                         src->max_win = win;
4557                 /* synchronize sequencing */
4558                 if (SEQ_GT(end, src->seqlo))
4559                         src->seqlo = end;
4560                 /* slide the window of what the other end can send */
4561                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4562                         dst->seqhi = ack + MAX((win << sws), 1);
4563
4564
4565                 /* update states */
4566                 if (th->th_flags & TH_SYN)
4567                         if (src->state < TCPS_SYN_SENT)
4568                                 src->state = TCPS_SYN_SENT;
4569                 if (th->th_flags & TH_FIN)
4570                         if (src->state < TCPS_CLOSING)
4571                                 src->state = TCPS_CLOSING;
4572                 if (th->th_flags & TH_ACK) {
4573                         if (dst->state == TCPS_SYN_SENT) {
4574                                 dst->state = TCPS_ESTABLISHED;
                                     /* both ends established: apply the source limit */
4575                                 if (src->state == TCPS_ESTABLISHED &&
4576                                     (*state)->src_node != NULL &&
4577                                     pf_src_connlimit(*state)) {
4578                                         REASON_SET(reason, PFRES_SRCLIMIT);
4579                                         return (PF_DROP);
4580                                 }
4581                         } else if (dst->state == TCPS_CLOSING)
4582                                 dst->state = TCPS_FIN_WAIT_2;
4583                 }
4584                 if (th->th_flags & TH_RST)
4585                         src->state = dst->state = TCPS_TIME_WAIT;
4586
4587                 /* update expire time */
4588                 (*state)->expire = time_second;
4589                 if (src->state >= TCPS_FIN_WAIT_2 &&
4590                     dst->state >= TCPS_FIN_WAIT_2)
4591                         (*state)->timeout = PFTM_TCP_CLOSED;
4592                 else if (src->state >= TCPS_CLOSING &&
4593                     dst->state >= TCPS_CLOSING)
4594                         (*state)->timeout = PFTM_TCP_FIN_WAIT;
4595                 else if (src->state < TCPS_ESTABLISHED ||
4596                     dst->state < TCPS_ESTABLISHED)
4597                         (*state)->timeout = PFTM_TCP_OPENING;
4598                 else if (src->state >= TCPS_CLOSING ||
4599                     dst->state >= TCPS_CLOSING)
4600                         (*state)->timeout = PFTM_TCP_CLOSING;
4601                 else
4602                         (*state)->timeout = PFTM_TCP_ESTABLISHED;
4603
4604                 /* Fall through to PASS packet */
4605
4606         } else if ((dst->state < TCPS_SYN_SENT ||
4607                 dst->state >= TCPS_FIN_WAIT_2 ||
4608                 src->state >= TCPS_FIN_WAIT_2) &&
4609             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
4610             /* Within a window forward of the originating packet */
4611             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
4612             /* Within a window backward of the originating packet */
4613
4614                 /*
4615                  * This currently handles three situations:
4616                  *  1) Stupid stacks will shotgun SYNs before their peer
4617                  *     replies.
4618                  *  2) When PF catches an already established stream (the
4619                  *     firewall rebooted, the state table was flushed, routes
4620                  *     changed...)
4621                  *  3) Packets get funky immediately after the connection
4622                  *     closes (this should catch Solaris spurious ACK|FINs
4623                  *     that web servers like to spew after a close)
4624                  *
4625                  * This must be a little more careful than the above code
4626                  * since packet floods will also be caught here. We don't
4627                  * update the TTL here to mitigate the damage of a packet
4628                  * flood and so the same code can handle awkward establishment
4629                  * and a loosened connection close.
4630                  * In the establishment case, a correct peer response will
4631                  * validate the connection, go through the normal state code
4632                  * and keep updating the state TTL.
4633                  */
4634
4635                 if (pf_status.debug >= PF_DEBUG_MISC) {
4636                         kprintf("pf: loose state match: ");
4637                         pf_print_state(*state);
4638                         pf_print_flags(th->th_flags);
4639                         kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4640                             "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len,
4641                             ackskew, (unsigned long long)(*state)->packets[0],
4642                             (unsigned long long)(*state)->packets[1],
4643                             pd->dir == PF_IN ? "in" : "out",
4644                             pd->dir == (*state)->direction ? "fwd" : "rev");
4645                 }
4646
4647                 if (dst->scrub || src->scrub) {
4648                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4649                             *state, src, dst, copyback))
4650                                 return (PF_DROP);
4651                 }
4652
4653                 /* update max window */
4654                 if (src->max_win < win)
4655                         src->max_win = win;
4656                 /* synchronize sequencing */
4657                 if (SEQ_GT(end, src->seqlo))
4658                         src->seqlo = end;
4659                 /* slide the window of what the other end can send */
4660                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4661                         dst->seqhi = ack + MAX((win << sws), 1);
4662
4663                 /*
4664                  * Cannot set dst->seqhi here since this could be a shotgunned
4665                  * SYN and not an already established connection.
                      *
                      * NOTE(review): the statement just above does update
                      * dst->seqhi, which disagrees with this remark -- confirm
                      * which of the two is intended.
4666                  */
4667
                     /* Unlike the strict path, expire/timeout are left untouched */
4668                 if (th->th_flags & TH_FIN)
4669                         if (src->state < TCPS_CLOSING)
4670                                 src->state = TCPS_CLOSING;
4671                 if (th->th_flags & TH_RST)
4672                         src->state = dst->state = TCPS_TIME_WAIT;
4673
4674                 /* Fall through to PASS packet */
4675
4676         } else if ((*state)->pickup_mode == PF_PICKUPS_HASHONLY ||
4677                     ((*state)->pickup_mode == PF_PICKUPS_ENABLED &&
4678                      ((*state)->sync_flags & PFSTATE_GOT_SYN_MASK) !=
4679                       PFSTATE_GOT_SYN_MASK)) {
4680                 /*
4681                  * If pickup mode is hash only, do not fail on sequence checks.
4682                  *
4683                  * If pickup mode is enabled and we did not see the SYN in
4684                  * both directions, do not fail on sequence checks because
4685                  * we do not have complete information on window scale.
4686                  *
4687                  * Adjust expiration and fall through to PASS packet.
4688                  * XXX Add a FIN check to reduce timeout?
4689                  */
4690                 (*state)->expire = time_second;
4691         } else  {
4692                 /*
4693                  * Failure processing
                      *
                      * While both peers are still in SYN_SENT a packet without
                      * RST is answered with an RST, and src's tracking is reset
                      * (seqlo == 0 makes the next packet from src take the
                      * first-packet path above).  Otherwise the failure is only
                      * logged when debugging is enabled.  In every case the
                      * packet is dropped with PFRES_BADSTATE.
4694                  */
4695                 if ((*state)->dst.state == TCPS_SYN_SENT &&
4696                     (*state)->src.state == TCPS_SYN_SENT) {
4697                         /* Send RST for state mismatches during handshake */
4698                         if (!(th->th_flags & TH_RST))
4699                                 pf_send_tcp((*state)->rule.ptr, pd->af,
4700                                     pd->dst, pd->src, th->th_dport,
4701                                     th->th_sport, ntohl(th->th_ack), 0,
4702                                     TH_RST, 0, 0,
4703                                     (*state)->rule.ptr->return_ttl, 1, 0,
4704                                     pd->eh, kif->pfik_ifp);
4705                         src->seqlo = 0;
4706                         src->seqhi = 1;
4707                         src->max_win = 1;
4708                 } else if (pf_status.debug >= PF_DEBUG_MISC) {
4709                         kprintf("pf: BAD state: ");
4710                         pf_print_state(*state);
4711                         pf_print_flags(th->th_flags);
4712                         kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4713                             "pkts=%llu:%llu dir=%s,%s\n",
4714                             seq, orig_seq, ack, pd->p_len, ackskew,
4715                             (unsigned long long)(*state)->packets[0],
4716                                 (unsigned long long)(*state)->packets[1],
4717                             pd->dir == PF_IN ? "in" : "out",
4718                             pd->dir == (*state)->direction ? "fwd" : "rev");
                             /*
                              * Digits 1-4 mark which strict-window test failed,
                              * 5-6 which loose-window test failed; a blank
                              * means that test passed.
                              */
4719                         kprintf("pf: State failure on: %c %c %c %c | %c %c\n",
4720                             SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4721                             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4722                             ' ': '2',
4723                             (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4724                             (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4725                             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4726                             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4727                 }
4728                 REASON_SET(reason, PFRES_BADSTATE);
4729                 return (PF_DROP);
4730         }
4731
4732         return (PF_PASS);
4733 }
4734
4735 /*
4736  * Called with state locked
4737  */
4738 int
4739 pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4740         struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4741 {
4742         struct tcphdr           *th = pd->hdr.tcp;
4743
4744         if (th->th_flags & TH_SYN)
4745                 if (src->state < TCPS_SYN_SENT)
4746                         src->state = TCPS_SYN_SENT;
4747         if (th->th_flags & TH_FIN)
4748                 if (src->state < TCPS_CLOSING)
4749                         src->state = TCPS_CLOSING;
4750         if (th->th_flags & TH_ACK) {
4751                 if (dst->state == TCPS_SYN_SENT) {
4752                         dst->state = TCPS_ESTABLISHED;
4753                         if (src->state == TCPS_ESTABLISHED &&
4754                             (*state)->src_node != NULL &&
4755                             pf_src_connlimit(*state)) {
4756                                 REASON_SET(reason, PFRES_SRCLIMIT);
4757                                 return (PF_DROP);
4758                         }
4759                 } else if (dst->state == TCPS_CLOSING) {
4760                         dst->state = TCPS_FIN_WAIT_2;
4761                 } else if (src->state == TCPS_SYN_SENT &&
4762                     dst->state < TCPS_SYN_SENT) {
4763                         /*
4764                          * Handle a special sloppy case where we only see one
4765                          * half of the connection. If there is a ACK after
4766                          * the initial SYN without ever seeing a packet from
4767                          * the destination, set the connection to established.
4768                          */
4769                         dst->state = src->state = TCPS_ESTABLISHED;
4770                         if ((*state)->src_node != NULL &&
4771                             pf_src_connlimit(*state)) {
4772                                 REASON_SET(reason, PFRES_SRCLIMIT);
4773                                 return (PF_DROP);
4774                         }
4775                 } else if (src->state == TCPS_CLOSING &&
4776                     dst->state == TCPS_ESTABLISHED &&
4777                     dst->seqlo == 0) {
4778                         /*
4779                          * Handle the closing of half connections where we
4780                          * don't see the full bidirectional FIN/ACK+ACK
4781                          * handshake.
4782                          */
4783                         dst->state = TCPS_CLOSING;
4784                 }
4785         }
4786         if (th->th_flags & TH_RST)
4787                 src->state = dst->state = TCPS_TIME_WAIT;
4788
4789         /* update expire time */
4790         (*state)->expire = time_second;
4791         if (src->state >= TCPS_FIN_WAIT_2 &&
4792             dst->state >= TCPS_FIN_WAIT_2)
4793                 (*state)->timeout = PFTM_TCP_CLOSED;
4794         else if (src->state >= TCPS_CLOSING &&
4795             dst->state >= TCPS_CLOSING)
4796                 (*state)->timeout = PFTM_TCP_FIN_WAIT;
4797         else if (src->state < TCPS_ESTABLISHED ||
4798             dst->state < TCPS_ESTABLISHED)
4799                 (*state)->timeout = PFTM_TCP_OPENING;
4800         else if (src->state >= TCPS_CLOSING ||
4801             dst->state >= TCPS_CLOSING)
4802                 (*state)->timeout = PFTM_TCP_CLOSING;
4803         else
4804                 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4805
4806         return (PF_PASS);
4807 }
4808
4809 /*
4810  * Test TCP connection state.
      *
      * Builds a lookup key from the packet's addresses and ports, finds the
      * matching state with STATE_LOOKUP(), handles the synproxy handshake
      * stages and connection reuse, runs sloppy or full TCP tracking, applies
      * address/port translation and copies a modified TCP header back into
      * the mbuf.
      *
      * Locking: (*state)->lk is acquired exclusively right after the lookup
      * and released at the "done" label whenever *state is still non-NULL.
      * NOTE(review): any further locking expected from the caller is not
      * visible in this function -- confirm against the callers.
      *
      *   state      out: the state found by the lookup; set to NULL when the
      *              state is unlinked on connection reuse
      *   direction  PF_IN or the opposite; selects key order and peer roles
      *   kif        interface, used for the lookup and passed to full tracking
      *   m, off     mbuf and offset of the TCP header inside it
      *   h          not referenced directly in this body
      *   pd         packet descriptor (TCP header, addresses, af, sidx/didx)
      *   reason     out: drop reason
      *
      * Returns PF_PASS on the normal path.  Other results are produced through
      * FAIL(), a macro defined elsewhere; presumably it stores its argument in
      * "error" and jumps to "done" -- TODO confirm.
4811  */
4812 int
4813 pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4814                   struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4815                   u_short *reason)
4816 {
4817         struct pf_state_key_cmp  key;
4818         struct tcphdr           *th = pd->hdr.tcp;
4819         int                      copyback = 0;
4820         int                      error;
4821         struct pf_state_peer    *src, *dst;
4822         struct pf_state_key     *sk;
4823
             /* Build the lookup key; address/port order depends on direction */
4824         bzero(&key, sizeof(key));
4825         key.af = pd->af;
4826         key.proto = IPPROTO_TCP;
4827         if (direction == PF_IN) {       /* wire side, straight */
4828                 PF_ACPY(&key.addr[0], pd->src, key.af);
4829                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4830                 key.port[0] = th->th_sport;
4831                 key.port[1] = th->th_dport;
4832                 if (pf_status.debug >= PF_DEBUG_MISC) {
                             /* only the first 32 bits of each address are printed */
4833                         kprintf("test-tcp IN (%08x:%d) -> (%08x:%d)\n",
4834                                 ntohl(key.addr[0].addr32[0]),
4835                                 ntohs(key.port[0]),
4836                                 ntohl(key.addr[1].addr32[0]),
4837                                 ntohs(key.port[1]));
4838                 }
4839         } else {                        /* stack side, reverse */
4840                 PF_ACPY(&key.addr[1], pd->src, key.af);
4841                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4842                 key.port[1] = th->th_sport;
4843                 key.port[0] = th->th_dport;
4844                 if (pf_status.debug >= PF_DEBUG_MISC) {
4845                         kprintf("test-tcp OUT (%08x:%d) <- (%08x:%d)\n",
4846                                 ntohl(key.addr[0].addr32[0]),
4847                                 ntohs(key.port[0]),
4848                                 ntohl(key.addr[1].addr32[0]),
4849                                 ntohs(key.port[1]));
4850                 }
4851         }
4852
             /*
              * STATE_LOOKUP() is a macro defined elsewhere.  NOTE(review): it
              * presumably returns from this function when no state matches,
              * since *state is dereferenced unconditionally below -- confirm.
              */
4853         STATE_LOOKUP(kif, &key, direction, *state, m);
4854         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
4855
             /*
              * Packets travelling in the state's own direction use
              * (*state)->src as src; otherwise the peer roles are swapped.
              */
4856         if (direction == (*state)->direction) {
4857                 src = &(*state)->src;
4858                 dst = &(*state)->dst;
4859         } else {
4860                 src = &(*state)->dst;
4861                 dst = &(*state)->src;
4862         }
4863
4864         sk = (*state)->key[pd->didx];
4865
             /*
              * Synproxy, first stage: only packets in the state's direction
              * are accepted.  A SYN with the recorded sequence number is
              * answered with a SYN|ACK; a matching ACK (subject to the source
              * connection limit) advances to PF_TCPS_PROXY_DST.
              */
4866         if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4867                 if (direction != (*state)->direction) {
4868                         REASON_SET(reason, PFRES_SYNPROXY);
4869                         FAIL (PF_SYNPROXY_DROP);
4870                 }
4871                 if (th->th_flags & TH_SYN) {
4872                         if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4873                                 REASON_SET(reason, PFRES_SYNPROXY);
4874                                 FAIL (PF_DROP);
4875                         }
4876                         pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4877                             pd->src, th->th_dport, th->th_sport,
4878                             (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4879                             TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1,
4880                             0, NULL, NULL);
4881                         REASON_SET(reason, PFRES_SYNPROXY);
4882                         FAIL (PF_SYNPROXY_DROP);
4883                 } else if (!(th->th_flags & TH_ACK) ||
4884                     (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4885                     (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4886                         REASON_SET(reason, PFRES_SYNPROXY);
4887                         FAIL (PF_DROP);
4888                 } else if ((*state)->src_node != NULL &&
4889                     pf_src_connlimit(*state)) {
4890                         REASON_SET(reason, PFRES_SRCLIMIT);
4891                         FAIL (PF_DROP);
4892                 } else
4893                         (*state)->src.state = PF_TCPS_PROXY_DST;
4894         }
             /*
              * Synproxy, second stage: a valid ACK in the state's direction
              * triggers a SYN built from the state key; the SYN|ACK coming
              * back is acknowledged towards both ends, the sequence
              * modulators are computed and both peers become ESTABLISHED.
              * The triggering packet itself is never passed on.
              */
4895         if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4896                 if (direction == (*state)->direction) {
4897                         if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4898                             (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4899                             (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4900                                 REASON_SET(reason, PFRES_SYNPROXY);
4901                                 FAIL (PF_DROP);
4902                         }
4903                         (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
                             /* seqhi == 1 means no initial sequence chosen yet */
4904                         if ((*state)->dst.seqhi == 1)
4905                                 (*state)->dst.seqhi = htonl(karc4random());
4906                         pf_send_tcp((*state)->rule.ptr, pd->af,
4907                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4908                             sk->port[pd->sidx], sk->port[pd->didx],
4909                             (*state)->dst.seqhi, 0, TH_SYN, 0,
4910                             (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL);
4911                         REASON_SET(reason, PFRES_SYNPROXY);
4912                         FAIL (PF_SYNPROXY_DROP);
4913                 } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4914                     (TH_SYN|TH_ACK)) ||
4915                     (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4916                         REASON_SET(reason, PFRES_SYNPROXY);
4917                         FAIL (PF_DROP);
4918                 } else {
4919                         (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4920                         (*state)->dst.seqlo = ntohl(th->th_seq);
4921                         pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4922                             pd->src, th->th_dport, th->th_sport,
4923                             ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4924                             TH_ACK, (*state)->src.max_win, 0, 0, 0,
4925                             (*state)->tag, NULL, NULL);
4926                         pf_send_tcp((*state)->rule.ptr, pd->af,
4927                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4928                             sk->port[pd->sidx], sk->port[pd->didx],
4929                             (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4930                             TH_ACK, (*state)->dst.max_win, 0, 0, 1,
4931                             0, NULL, NULL);
                             /*
                              * Sequence modulators: the offsets between the
                              * sequence numbers we generated and the ones each
                              * real endpoint chose.
                              */
4932                         (*state)->src.seqdiff = (*state)->dst.seqhi -
4933                             (*state)->src.seqlo;
4934                         (*state)->dst.seqdiff = (*state)->src.seqhi -
4935                             (*state)->dst.seqlo;
4936                         (*state)->src.seqhi = (*state)->src.seqlo +
4937                             (*state)->dst.max_win;
4938                         (*state)->dst.seqhi = (*state)->dst.seqlo +
4939                             (*state)->src.max_win;
4940                         (*state)->src.wscale = (*state)->dst.wscale = 0;
4941                         (*state)->src.state = (*state)->dst.state =
4942                             TCPS_ESTABLISHED;
4943                         REASON_SET(reason, PFRES_SYNPROXY);
4944                         FAIL (PF_SYNPROXY_DROP);
4945                 }
4946         }
4947
4948         /*
4949          * Check for connection (addr+port pair) reuse.  We can't actually
4950          * unlink the state if we don't own it.
              *
              * A bare SYN on a state whose peers are both at or beyond
              * FIN_WAIT_2 closes the state: it is unlinked when it belongs to
              * the current cpu, otherwise only marked PFTM_PURGE.  The SYN
              * itself is dropped.
4951          */
4952         if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4953             dst->state >= TCPS_FIN_WAIT_2 &&
4954             src->state >= TCPS_FIN_WAIT_2) {
4955                 if (pf_status.debug >= PF_DEBUG_MISC) {
4956                         kprintf("pf: state reuse ");
4957                         pf_print_state(*state);
4958                         pf_print_flags(th->th_flags);
4959                         kprintf("\n");
4960                 }
4961                 /* XXX make sure it's the same direction ?? */
4962                 (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4963                 if ((*state)->cpuid == mycpu->gd_cpuid) {
4964                         pf_unlink_state(*state);
                             /* with *state NULL the unlock at "done" is skipped */
4965                         *state = NULL;
4966                 } else {
4967                         (*state)->timeout = PFTM_PURGE;
4968                 }
4969                 FAIL (PF_DROP);
4970         }
4971
             /* Sloppy states skip sequence checking; all others get full tracking */
4972         if ((*state)->state_flags & PFSTATE_SLOPPY) {
4973                 if (pf_tcp_track_sloppy(src, dst, state, pd,
4974                                         reason) == PF_DROP) {
4975                         FAIL (PF_DROP);
4976                 }
4977         } else {
4978                 if (pf_tcp_track_full(src, dst, state, kif, m, off, pd,
4979                                       reason, &copyback) == PF_DROP) {
4980                         FAIL (PF_DROP);
4981                 }
4982         }
4983
4984         /* translate source/destination address, if necessary */
4985         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4986                 struct pf_state_key *nk = (*state)->key[pd->didx];
4987
4988                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4989                     nk->port[pd->sidx] != th->th_sport)  {
4990                         /*
4991                          * The translated source address may be completely
4992                          * unrelated to the saved link header, make sure
4993                          * a bridge doesn't try to use it.
4994                          */
4995                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
4996                         pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4997                             &th->th_sum, &nk->addr[pd->sidx],
4998                             nk->port[pd->sidx], 0, pd->af);
4999                 }
5000
5001                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5002                     nk->port[pd->didx] != th->th_dport) {
5003                         /*
5004                          * If we don't redispatch the packet will go into
5005                          * the protocol stack on the wrong cpu for the
5006                          * post-translated address.
5007                          */
5008                         pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
5009                             &th->th_sum, &nk->addr[pd->didx],
5010                             nk->port[pd->didx], 0, pd->af);
5011                 }
                     /* set whenever the two keys differ, even if nothing changed */
5012                 copyback = 1;
5013         }
5014
5015         /* Copyback sequence modulation or stateful scrub changes if needed */
5016         if (copyback) {
                     /* M_HASH is cleared because the header is about to change */
5017                 m->m_flags &= ~M_HASH;
5018                 m_copyback(m, off, sizeof(*th), (caddr_t)th);
5019         }
5020
5021         pfsync_update_state(*state);
5022         error = PF_PASS;
5023 done:
             /* *state is NULL here when the state was unlinked above */
5024         if (*state)
5025                 lockmgr(&(*state)->lk, LK_RELEASE);
5026         return (error);
5027 }
5028
5029 /*
5030  * Test UDP connection state.  Caller must hold the state locked.
5031  */
5032 int
5033 pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
5034                   struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
5035 {
5036         struct pf_state_peer    *src, *dst;
5037         struct pf_state_key_cmp  key;
5038         struct udphdr           *uh = pd->hdr.udp;
5039
5040         bzero(&key, sizeof(key));
5041         key.af = pd->af;
5042         key.proto = IPPROTO_UDP;
5043         if (direction == PF_IN) {       /* wire side, straight */
5044                 PF_ACPY(&key.addr[0], pd->src, key.af);
5045                 PF_ACPY(&key.addr[1], pd->dst, key.af);
5046                 key.port[0] = uh->uh_sport;
5047                 key.port[1] = uh->uh_dport;
5048         } else {                        /* stack side, reverse */
5049                 PF_ACPY(&key.addr[1], pd->src, key.af);
5050                 PF_ACPY(&key.addr[0], pd->dst, key.af);
5051                 key.port[1] = uh->uh_sport;
5052                 key.port[0] = uh->uh_dport;
5053         }
5054
5055         STATE_LOOKUP(kif, &key, direction, *state, m);
5056         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5057
5058         if (direction == (*state)->direction) {
5059                 src = &(*state)->src;
5060                 dst = &(*state)->dst;
5061         } else {
5062                 src = &(*state)->dst;
5063                 dst = &(*state)->src;
5064         }
5065
5066         /* update states */
5067         if (src->state < PFUDPS_SINGLE)
5068                 src->state = PFUDPS_SINGLE;
5069         if (dst->state == PFUDPS_SINGLE)
5070                 dst->state = PFUDPS_MULTIPLE;
5071
5072         /* update expire time */
5073         (*state)->expire = time_second;
5074         if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
5075                 (*state)->timeout = PFTM_UDP_MULTIPLE;
5076         else
5077                 (*state)->timeout = PFTM_UDP_SINGLE;
5078
5079         /* translate source/destination address, if necessary */
5080         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5081                 struct pf_state_key *nk = (*state)->key[pd->didx];
5082
5083                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5084                     nk->port[pd->sidx] != uh->uh_sport) {
5085                         /*
5086                          * The translated source address may be completely
5087                          * unrelated to the saved link header, make sure
5088                          * a bridge doesn't try to use it.
5089                          */
5090                         m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
5091                         m->m_flags &= ~M_HASH;
5092                         pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
5093                             &uh->uh_sum, &nk->addr[pd->sidx],
5094                             nk->port[pd->sidx], 1, pd->af);
5095                 }
5096
5097                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5098                     nk->port[pd->didx] != uh->uh_dport) {
5099                         /*
5100                          * If we don't redispatch the packet will go into
5101                          * the protocol stack on the wrong cpu for the
5102                          * post-translated address.
5103                          */
5104                         m->m_flags &= ~M_HASH;
5105                         pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
5106                             &uh->uh_sum, &nk->addr[pd->didx],
5107                             nk->port[pd->didx], 1, pd->af);
5108                 }
5109                 m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
5110         }
5111
5112         pfsync_update_state(*state);
5113         lockmgr(&(*state)->lk, LK_RELEASE);
5114         return (PF_PASS);
5115 }
5116
5117 /*
5118  * Test ICMP connection state.  Caller must hold the state locked.
5119  */
5120 int
5121 pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
5122                    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
5123                    u_short *reason)
5124 {
5125         struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
5126         u_int16_t        icmpid = 0, *icmpsum = NULL;
5127         u_int8_t         icmptype = 0;
5128         int              state_icmp = 0;
5129         int              error;
5130         struct pf_state_key_cmp key;
5131
5132         bzero(&key, sizeof(key));
5133
5134         switch (pd->proto) {
5135 #ifdef INET
5136         case IPPROTO_ICMP:
5137                 icmptype = pd->hdr.icmp->icmp_type;
5138                 icmpid = pd->hdr.icmp->icmp_id;
5139                 icmpsum = &pd->hdr.icmp->icmp_cksum;
5140
5141                 if (icmptype == ICMP_UNREACH ||
5142                     icmptype == ICMP_SOURCEQUENCH ||
5143                     icmptype == ICMP_REDIRECT ||
5144                     icmptype == ICMP_TIMXCEED ||
5145                     icmptype == ICMP_PARAMPROB)
5146                         state_icmp++;
5147                 break;
5148 #endif /* INET */
5149 #ifdef INET6
5150         case IPPROTO_ICMPV6:
5151                 icmptype = pd->hdr.icmp6->icmp6_type;
5152                 icmpid = pd->hdr.icmp6->icmp6_id;
5153                 icmpsum = &pd->hdr.icmp6->icmp6_cksum;
5154
5155                 if (icmptype == ICMP6_DST_UNREACH ||
5156                     icmptype == ICMP6_PACKET_TOO_BIG ||
5157                     icmptype == ICMP6_TIME_EXCEEDED ||
5158                     icmptype == ICMP6_PARAM_PROB)
5159                         state_icmp++;
5160                 break;
5161 #endif /* INET6 */
5162         }
5163
5164         if (!state_icmp) {
5165
5166                 /*
5167                  * ICMP query/reply message not related to a TCP/UDP packet.
5168                  * Search for an ICMP state.
5169                  */
5170                 key.af = pd->af;
5171                 key.proto = pd->proto;
5172                 key.port[0] = key.port[1] = icmpid;
5173                 if (direction == PF_IN) {       /* wire side, straight */
5174                         PF_ACPY(&key.addr[0], pd->src, key.af);
5175                         PF_ACPY(&key.addr[1], pd->dst, key.af);
5176                 } else {                        /* stack side, reverse */
5177                         PF_ACPY(&key.addr[1], pd->src, key.af);
5178                         PF_ACPY(&key.addr[0], pd->dst, key.af);
5179                 }
5180
5181                 STATE_LOOKUP(kif, &key, direction, *state, m);
5182                 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5183
5184                 (*state)->expire = time_second;
5185                 (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
5186
5187                 /* translate source/destination address, if necessary */
5188                 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5189                         struct pf_state_key *nk = (*state)->key[pd->didx];
5190
5191                         switch (pd->af) {
5192 #ifdef INET
5193                         case AF_INET:
5194                                 if (PF_ANEQ(pd->src,
5195                                     &nk->addr[pd->sidx], AF_INET))
5196                                         pf_change_a(&saddr->v4.s_addr,
5197                                             pd->ip_sum,
5198                                             nk->addr[pd->sidx].v4.s_addr, 0);
5199
5200                                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
5201                                     AF_INET))
5202                                         pf_change_a(&daddr->v4.s_addr,
5203                                             pd->ip_sum,
5204                                             nk->addr[pd->didx].v4.s_addr, 0);
5205
5206                                 if (nk->port[0] !=
5207                                     pd->hdr.icmp->icmp_id) {
5208                                         pd->hdr.icmp->icmp_cksum =
5209                                             pf_cksum_fixup(
5210                                             pd->hdr.icmp->icmp_cksum, icmpid,
5211                                             nk->port[pd->sidx], 0);
5212                                         pd->hdr.icmp->icmp_id =
5213                                             nk->port[pd->sidx];
5214                                 }
5215
5216                                 m->m_flags &= ~M_HASH;
5217                                 m_copyback(m, off, ICMP_MINLEN,
5218                                     (caddr_t)pd->hdr.icmp);
5219                                 break;
5220 #endif /* INET */
5221 #ifdef INET6
5222                         case AF_INET6:
5223                                 if (PF_ANEQ(pd->src,
5224                                     &nk->addr[pd->sidx], AF_INET6))
5225                                         pf_change_a6(saddr,
5226                                             &pd->hdr.icmp6->icmp6_cksum,
5227                                             &nk->addr[pd->sidx], 0);
5228
5229                                 if (PF_ANEQ(pd->dst,
5230                                     &nk->addr[pd->didx], AF_INET6))
5231                                         pf_change_a6(daddr,
5232                                             &pd->hdr.icmp6->icmp6_cksum,
5233                                             &nk->addr[pd->didx], 0);
5234
5235                                 m->m_flags &= ~M_HASH;
5236                                 m_copyback(m, off,
5237                                         sizeof(struct icmp6_hdr),
5238                                         (caddr_t)pd->hdr.icmp6);
5239                                 break;
5240 #endif /* INET6 */
5241                         }
5242                 }
5243         } else {
5244                 /*
5245                  * ICMP error message in response to a TCP/UDP packet.
5246                  * Extract the inner TCP/UDP header and search for that state.
5247                  */
5248
5249                 struct pf_pdesc pd2;
5250 #ifdef INET
5251                 struct ip       h2;
5252 #endif /* INET */
5253 #ifdef INET6
5254                 struct ip6_hdr  h2_6;
5255                 int             terminal = 0;
5256 #endif /* INET6 */
5257                 int             ipoff2;
5258                 int             off2;
5259
5260                 pd2.not_cpu_localized = 1;
5261                 pd2.af = pd->af;
5262                 /* Payload packet is from the opposite direction. */
5263                 pd2.sidx = (direction == PF_IN) ? 1 : 0;
5264                 pd2.didx = (direction == PF_IN) ? 0 : 1;
5265                 switch (pd->af) {
5266 #ifdef INET
5267                 case AF_INET:
5268                         /* offset of h2 in mbuf chain */
5269                         ipoff2 = off + ICMP_MINLEN;
5270
5271                         if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
5272                             NULL, reason, pd2.af)) {
5273                                 DPFPRINTF(PF_DEBUG_MISC,
5274                                     ("pf: ICMP error message too short "
5275                                     "(ip)\n"));
5276                                 FAIL (PF_DROP);
5277                         }
5278                         /*
5279                          * ICMP error messages don't refer to non-first
5280                          * fragments
5281                          */
5282                         if (h2.ip_off & htons(IP_OFFMASK)) {
5283                                 REASON_SET(reason, PFRES_FRAG);
5284                                 FAIL (PF_DROP);
5285                         }
5286
5287                         /* offset of protocol header that follows h2 */
5288                         off2 = ipoff2 + (h2.ip_hl << 2);
5289
5290                         pd2.proto = h2.ip_p;
5291                         pd2.src = (struct pf_addr *)&h2.ip_src;
5292                         pd2.dst = (struct pf_addr *)&h2.ip_dst;
5293                         pd2.ip_sum = &h2.ip_sum;
5294                         break;
5295 #endif /* INET */
5296 #ifdef INET6
5297                 case AF_INET6:
5298                         ipoff2 = off + sizeof(struct icmp6_hdr);
5299
5300                         if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
5301                             NULL, reason, pd2.af)) {
5302                                 DPFPRINTF(PF_DEBUG_MISC,
5303                                     ("pf: ICMP error message too short "
5304                                     "(ip6)\n"));
5305                                 FAIL (PF_DROP);
5306                         }
5307                         pd2.proto = h2_6.ip6_nxt;
5308                         pd2.src = (struct pf_addr *)&h2_6.ip6_src;
5309                         pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
5310                         pd2.ip_sum = NULL;
5311                         off2 = ipoff2 + sizeof(h2_6);
5312                         do {
5313                                 switch (pd2.proto) {
5314                                 case IPPROTO_FRAGMENT:
5315                                         /*
5316                                          * ICMPv6 error messages for
5317                                          * non-first fragments
5318                                          */
5319                                         REASON_SET(reason, PFRES_FRAG);
5320                                         FAIL (PF_DROP);
5321                                 case IPPROTO_AH:
5322                                 case IPPROTO_HOPOPTS:
5323                                 case IPPROTO_ROUTING:
5324                                 case IPPROTO_DSTOPTS: {
5325                                         /* get next header and header length */
5326                                         struct ip6_ext opt6;
5327
5328                                         if (!pf_pull_hdr(m, off2, &opt6,
5329                                             sizeof(opt6), NULL, reason,
5330                                             pd2.af)) {
5331                                                 DPFPRINTF(PF_DEBUG_MISC,
5332                                                     ("pf: ICMPv6 short opt\n"));
5333                                                 FAIL (PF_DROP);
5334                                         }
5335                                         if (pd2.proto == IPPROTO_AH)
5336                                                 off2 += (opt6.ip6e_len + 2) * 4;
5337                                         else
5338                                                 off2 += (opt6.ip6e_len + 1) * 8;
5339                                         pd2.proto = opt6.ip6e_nxt;
5340                                         /* goto the next header */
5341                                         break;
5342                                 }
5343                                 default:
5344                                         terminal++;
5345                                         break;
5346                                 }
5347                         } while (!terminal);
5348                         break;
5349 #endif /* INET6 */
5350                 default:
5351                         DPFPRINTF(PF_DEBUG_MISC,
5352                             ("pf: ICMP AF %d unknown (ip6)\n", pd->af));
5353                         FAIL (PF_DROP);
5354                         break;
5355                 }
5356
5357                 switch (pd2.proto) {
5358                 case IPPROTO_TCP: {
5359                         struct tcphdr            th;
5360                         u_int32_t                seq;
5361                         struct pf_state_peer    *src, *dst;
5362                         u_int8_t                 dws;
5363                         int                      copyback = 0;
5364
5365                         /*
5366                          * Only the first 8 bytes of the TCP header can be
5367                          * expected. Don't access any TCP header fields after
5368                          * th_seq, an ackskew test is not possible.
5369                          */
5370                         if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
5371                             pd2.af)) {
5372                                 DPFPRINTF(PF_DEBUG_MISC,
5373                                     ("pf: ICMP error message too short "
5374                                     "(tcp)\n"));
5375                                 FAIL (PF_DROP);
5376                         }
5377
5378                         key.af = pd2.af;
5379                         key.proto = IPPROTO_TCP;
5380                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5381                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5382                         key.port[pd2.sidx] = th.th_sport;
5383                         key.port[pd2.didx] = th.th_dport;
5384
5385                         STATE_LOOKUP(kif, &key, direction, *state, m);
5386                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5387
5388                         if (direction == (*state)->direction) {
5389                                 src = &(*state)->dst;
5390                                 dst = &(*state)->src;
5391                         } else {
5392                                 src = &(*state)->src;
5393                                 dst = &(*state)->dst;
5394                         }
5395
5396                         if (src->wscale && dst->wscale)
5397                                 dws = dst->wscale & PF_WSCALE_MASK;
5398                         else
5399                                 dws = 0;
5400
5401                         /* Demodulate sequence number */
5402                         seq = ntohl(th.th_seq) - src->seqdiff;
5403                         if (src->seqdiff) {
5404                                 pf_change_a(&th.th_seq, icmpsum,
5405                                     htonl(seq), 0);
5406                                 copyback = 1;
5407                         }
5408
5409                         if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
5410                             (!SEQ_GEQ(src->seqhi, seq) ||
5411                             !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
5412                                 if (pf_status.debug >= PF_DEBUG_MISC) {
5413                                         kprintf("pf: BAD ICMP %d:%d ",
5414                                             icmptype, pd->hdr.icmp->icmp_code);
5415                                         pf_print_host(pd->src, 0, pd->af);
5416                                         kprintf(" -> ");
5417                                         pf_print_host(pd->dst, 0, pd->af);
5418                                         kprintf(" state: ");
5419                                         pf_print_state(*state);
5420                                         kprintf(" seq=%u\n", seq);
5421                                 }
5422                                 REASON_SET(reason, PFRES_BADSTATE);
5423                                 FAIL (PF_DROP);
5424                         } else {
5425                                 if (pf_status.debug >= PF_DEBUG_MISC) {
5426                                         kprintf("pf: OK ICMP %d:%d ",
5427                                             icmptype, pd->hdr.icmp->icmp_code);
5428                                         pf_print_host(pd->src, 0, pd->af);
5429                                         kprintf(" -> ");
5430                                         pf_print_host(pd->dst, 0, pd->af);
5431                                         kprintf(" state: ");
5432                                         pf_print_state(*state);
5433                                         kprintf(" seq=%u\n", seq);
5434                                 }
5435                         }
5436
5437                         /* translate source/destination address, if necessary */
5438                         if ((*state)->key[PF_SK_WIRE] !=
5439                             (*state)->key[PF_SK_STACK]) {
5440                                 struct pf_state_key *nk =
5441                                     (*state)->key[pd->didx];
5442
5443                                 if (PF_ANEQ(pd2.src,
5444                                     &nk->addr[pd2.sidx], pd2.af) ||
5445                                     nk->port[pd2.sidx] != th.th_sport)
5446                                         pf_change_icmp(pd2.src, &th.th_sport,
5447                                             daddr, &nk->addr[pd2.sidx],
5448                                             nk->port[pd2.sidx], NULL,
5449                                             pd2.ip_sum, icmpsum,
5450                                             pd->ip_sum, 0, pd2.af);
5451
5452                                 if (PF_ANEQ(pd2.dst,
5453                                     &nk->addr[pd2.didx], pd2.af) ||
5454                                     nk->port[pd2.didx] != th.th_dport)
5455                                         pf_change_icmp(pd2.dst, &th.th_dport,
5456                                             NULL, /* XXX Inbound NAT? */
5457                                             &nk->addr[pd2.didx],
5458                                             nk->port[pd2.didx], NULL,
5459                                             pd2.ip_sum, icmpsum,
5460                                             pd->ip_sum, 0, pd2.af);
5461                                 copyback = 1;
5462                         }
5463
5464                         if (copyback) {
5465                                 switch (pd2.af) {
5466 #ifdef INET
5467                                 case AF_INET:
5468                                         m_copyback(m, off, ICMP_MINLEN,
5469                                             (caddr_t)pd->hdr.icmp);
5470                                         m_copyback(m, ipoff2, sizeof(h2),
5471                                             (caddr_t)&h2);
5472                                         break;
5473 #endif /* INET */
5474 #ifdef INET6
5475                                 case AF_INET6:
5476                                         m_copyback(m, off,
5477                                             sizeof(struct icmp6_hdr),
5478                                             (caddr_t)pd->hdr.icmp6);
5479                                         m_copyback(m, ipoff2, sizeof(h2_6),
5480                                             (caddr_t)&h2_6);
5481                                         break;
5482 #endif /* INET6 */
5483                                 }
5484                                 m->m_flags &= ~M_HASH;
5485                                 m_copyback(m, off2, 8, (caddr_t)&th);
5486                         }
5487                         break;
5488                 }
5489                 case IPPROTO_UDP: {
5490                         struct udphdr           uh;
5491
5492                         if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
5493                             NULL, reason, pd2.af)) {
5494                                 DPFPRINTF(PF_DEBUG_MISC,
5495                                     ("pf: ICMP error message too short "
5496                                     "(udp)\n"));
5497                                 return (PF_DROP);
5498                         }
5499
5500                         key.af = pd2.af;
5501                         key.proto = IPPROTO_UDP;
5502                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5503                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5504                         key.port[pd2.sidx] = uh.uh_sport;
5505                         key.port[pd2.didx] = uh.uh_dport;
5506
5507                         STATE_LOOKUP(kif, &key, direction, *state, m);
5508                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5509
5510                         /* translate source/destination address, if necessary */
5511                         if ((*state)->key[PF_SK_WIRE] !=
5512                             (*state)->key[PF_SK_STACK]) {
5513                                 struct pf_state_key *nk =
5514                                     (*state)->key[pd->didx];
5515
5516                                 if (PF_ANEQ(pd2.src,
5517                                     &nk->addr[pd2.sidx], pd2.af) ||
5518                                     nk->port[pd2.sidx] != uh.uh_sport)
5519                                         pf_change_icmp(pd2.src, &uh.uh_sport,
5520                                             daddr, &nk->addr[pd2.sidx],
5521                                             nk->port[pd2.sidx], &uh.uh_sum,
5522                                             pd2.ip_sum, icmpsum,
5523                                             pd->ip_sum, 1, pd2.af);
5524
5525                                 if (PF_ANEQ(pd2.dst,
5526                                     &nk->addr[pd2.didx], pd2.af) ||
5527                                     nk->port[pd2.didx] != uh.uh_dport)
5528                                         pf_change_icmp(pd2.dst, &uh.uh_dport,
5529                                             NULL, /* XXX Inbound NAT? */
5530                                             &nk->addr[pd2.didx],
5531                                             nk->port[pd2.didx], &uh.uh_sum,
5532                                             pd2.ip_sum, icmpsum,
5533                                             pd->ip_sum, 1, pd2.af);
5534
5535                                 switch (pd2.af) {
5536 #ifdef INET
5537                                 case AF_INET:
5538                                         m_copyback(m, off, ICMP_MINLEN,
5539                                             (caddr_t)pd->hdr.icmp);
5540                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5541                                         break;
5542 #endif /* INET */
5543 #ifdef INET6
5544                                 case AF_INET6:
5545                                         m_copyback(m, off,
5546                                             sizeof(struct icmp6_hdr),
5547                                             (caddr_t)pd->hdr.icmp6);
5548                                         m_copyback(m, ipoff2, sizeof(h2_6),
5549                                             (caddr_t)&h2_6);
5550                                         break;
5551 #endif /* INET6 */
5552                                 }
5553                                 m->m_flags &= ~M_HASH;
5554                                 m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
5555                         }
5556                         break;
5557                 }
5558 #ifdef INET
5559                 case IPPROTO_ICMP: {
5560                         struct icmp             iih;
5561
5562                         if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
5563                             NULL, reason, pd2.af)) {
5564                                 DPFPRINTF(PF_DEBUG_MISC,
5565                                     ("pf: ICMP error message too short i"
5566                                     "(icmp)\n"));
5567                                 return (PF_DROP);
5568                         }
5569
5570                         key.af = pd2.af;
5571                         key.proto = IPPROTO_ICMP;
5572                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5573                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5574                         key.port[0] = key.port[1] = iih.icmp_id;
5575
5576                         STATE_LOOKUP(kif, &key, direction, *state, m);
5577                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5578
5579                         /* translate source/destination address, if necessary */
5580                         if ((*state)->key[PF_SK_WIRE] !=
5581                             (*state)->key[PF_SK_STACK]) {
5582                                 struct pf_state_key *nk =
5583                                     (*state)->key[pd->didx];
5584
5585                                 if (PF_ANEQ(pd2.src,
5586                                     &nk->addr[pd2.sidx], pd2.af) ||
5587                                     nk->port[pd2.sidx] != iih.icmp_id)
5588                                         pf_change_icmp(pd2.src, &iih.icmp_id,
5589                                             daddr, &nk->addr[pd2.sidx],
5590                                             nk->port[pd2.sidx], NULL,
5591                                             pd2.ip_sum, icmpsum,
5592                                             pd->ip_sum, 0, AF_INET);
5593
5594                                 if (PF_ANEQ(pd2.dst,
5595                                     &nk->addr[pd2.didx], pd2.af) ||
5596                                     nk->port[pd2.didx] != iih.icmp_id)
5597                                         pf_change_icmp(pd2.dst, &iih.icmp_id,
5598                                             NULL, /* XXX Inbound NAT? */
5599                                             &nk->addr[pd2.didx],
5600                                             nk->port[pd2.didx], NULL,
5601                                             pd2.ip_sum, icmpsum,
5602                                             pd->ip_sum, 0, AF_INET);
5603
5604                                 m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
5605                                 m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5606                                 m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
5607                                 m->m_flags &= ~M_HASH;
5608                         }
5609                         break;
5610                 }
5611 #endif /* INET */
5612 #ifdef INET6
5613                 case IPPROTO_ICMPV6: {
5614                         struct icmp6_hdr        iih;
5615
5616                         if (!pf_pull_hdr(m, off2, &iih,
5617                             sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
5618                                 DPFPRINTF(PF_DEBUG_MISC,
5619                                     ("pf: ICMP error message too short "
5620                                     "(icmp6)\n"));
5621                                 FAIL (PF_DROP);
5622                         }
5623
5624                         key.af = pd2.af;
5625                         key.proto = IPPROTO_ICMPV6;
5626                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5627                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5628                         key.port[0] = key.port[1] = iih.icmp6_id;
5629
5630                         STATE_LOOKUP(kif, &key, direction, *state, m);
5631                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5632
5633                         /* translate source/destination address, if necessary */
5634                         if ((*state)->key[PF_SK_WIRE] !=
5635                             (*state)->key[PF_SK_STACK]) {
5636                                 struct pf_state_key *nk =
5637                                     (*state)->key[pd->didx];
5638
5639                                 if (PF_ANEQ(pd2.src,
5640                                     &nk->addr[pd2.sidx], pd2.af) ||
5641                                     nk->port[pd2.sidx] != iih.icmp6_id)
5642                                         pf_change_icmp(pd2.src, &iih.icmp6_id,
5643                                             daddr, &nk->addr[pd2.sidx],
5644                                             nk->port[pd2.sidx], NULL,
5645                                             pd2.ip_sum, icmpsum,
5646                                             pd->ip_sum, 0, AF_INET6);
5647
5648                                 if (PF_ANEQ(pd2.dst,
5649                                     &nk->addr[pd2.didx], pd2.af) ||
5650                                     nk->port[pd2.didx] != iih.icmp6_id)
5651                                         pf_change_icmp(pd2.dst, &iih.icmp6_id,
5652                                             NULL, /* XXX Inbound NAT? */
5653                                             &nk->addr[pd2.didx],
5654                                             nk->port[pd2.didx], NULL,
5655                                             pd2.ip_sum, icmpsum,
5656                                             pd->ip_sum, 0, AF_INET6);
5657
5658                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
5659                                     (caddr_t)pd->hdr.icmp6);
5660                                 m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
5661                                 m_copyback(m, off2, sizeof(struct icmp6_hdr),
5662                                     (caddr_t)&iih);
5663                                 m->m_flags &= ~M_HASH;
5664                         }
5665                         break;
5666                 }
5667 #endif /* INET6 */
5668                 default: {
5669                         key.af = pd2.af;
5670                         key.proto = pd2.proto;
5671                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5672                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5673                         key.port[0] = key.port[1] = 0;
5674
5675                         STATE_LOOKUP(kif, &key, direction, *state, m);
5676                         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5677
5678                         /* translate source/destination address, if necessary */
5679                         if ((*state)->key[PF_SK_WIRE] !=
5680                             (*state)->key[PF_SK_STACK]) {
5681                                 struct pf_state_key *nk =
5682                                     (*state)->key[pd->didx];
5683
5684                                 if (PF_ANEQ(pd2.src,
5685                                     &nk->addr[pd2.sidx], pd2.af))
5686                                         pf_change_icmp(pd2.src, NULL, daddr,
5687                                             &nk->addr[pd2.sidx], 0, NULL,
5688                                             pd2.ip_sum, icmpsum,
5689                                             pd->ip_sum, 0, pd2.af);
5690
5691                                 if (PF_ANEQ(pd2.dst,
5692                                     &nk->addr[pd2.didx], pd2.af))
5693                                         pf_change_icmp(pd2.src, NULL,
5694                                             NULL, /* XXX Inbound NAT? */
5695                                             &nk->addr[pd2.didx], 0, NULL,
5696                                             pd2.ip_sum, icmpsum,
5697                                             pd->ip_sum, 0, pd2.af);
5698
5699                                 switch (pd2.af) {
5700 #ifdef INET
5701                                 case AF_INET:
5702                                         m_copyback(m, off, ICMP_MINLEN,
5703                                             (caddr_t)pd->hdr.icmp);
5704                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5705                                         m->m_flags &= ~M_HASH;
5706                                         break;
5707 #endif /* INET */
5708 #ifdef INET6
5709                                 case AF_INET6:
5710                                         m_copyback(m, off,
5711                                             sizeof(struct icmp6_hdr),
5712                                             (caddr_t)pd->hdr.icmp6);
5713                                         m_copyback(m, ipoff2, sizeof(h2_6),
5714                                             (caddr_t)&h2_6);
5715                                         m->m_flags &= ~M_HASH;
5716                                         break;
5717 #endif /* INET6 */
5718                                 }
5719                         }
5720                         break;
5721                 }
5722                 }
5723         }
5724
5725         pfsync_update_state(*state);
5726         error = PF_PASS;
5727 done:
5728         if (*state)
5729                 lockmgr(&(*state)->lk, LK_RELEASE);
5730         return (error);
5731 }
5732
5733 /*
5734  * Test other connection state.  Caller must hold the state locked.
5735  */
5736 int
5737 pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
5738                     struct mbuf *m, struct pf_pdesc *pd)
5739 {
5740         struct pf_state_peer    *src, *dst;
5741         struct pf_state_key_cmp  key;
5742
5743         bzero(&key, sizeof(key));
5744         key.af = pd->af;
5745         key.proto = pd->proto;
5746         if (direction == PF_IN) {
5747                 PF_ACPY(&key.addr[0], pd->src, key.af);
5748                 PF_ACPY(&key.addr[1], pd->dst, key.af);
5749                 key.port[0] = key.port[1] = 0;
5750         } else {
5751                 PF_ACPY(&key.addr[1], pd->src, key.af);
5752                 PF_ACPY(&key.addr[0], pd->dst, key.af);
5753                 key.port[1] = key.port[0] = 0;
5754         }
5755
5756         STATE_LOOKUP(kif, &key, direction, *state, m);
5757         lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5758
5759         if (direction == (*state)->direction) {
5760                 src = &(*state)->src;
5761                 dst = &(*state)->dst;
5762         } else {
5763                 src = &(*state)->dst;
5764                 dst = &(*state)->src;
5765         }
5766
5767         /* update states */
5768         if (src->state < PFOTHERS_SINGLE)
5769                 src->state = PFOTHERS_SINGLE;
5770         if (dst->state == PFOTHERS_SINGLE)
5771                 dst->state = PFOTHERS_MULTIPLE;
5772
5773         /* update expire time */
5774         (*state)->expire = time_second;
5775         if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5776                 (*state)->timeout = PFTM_OTHER_MULTIPLE;
5777         else
5778                 (*state)->timeout = PFTM_OTHER_SINGLE;
5779
5780         /* translate source/destination address, if necessary */
5781         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5782                 struct pf_state_key *nk = (*state)->key[pd->didx];
5783
5784                 KKASSERT(nk);
5785                 KKASSERT(pd);
5786                 KKASSERT(pd->src);
5787                 KKASSERT(pd->dst);
5788                 switch (pd->af) {
5789 #ifdef INET
5790                 case AF_INET:
5791                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5792                                 pf_change_a(&pd->src->v4.s_addr,
5793                                     pd->ip_sum,
5794                                     nk->addr[pd->sidx].v4.s_addr,
5795                                     0);
5796
5797
5798                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5799                                 pf_change_a(&pd->dst->v4.s_addr,
5800                                     pd->ip_sum,
5801                                     nk->addr[pd->didx].v4.s_addr,
5802                                     0);
5803
5804                         break;
5805 #endif /* INET */
5806 #ifdef INET6
5807                 case AF_INET6:
5808                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5809                                 PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5810
5811                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5812                                 PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5813 #endif /* INET6 */
5814                 }
5815         }
5816
5817         pfsync_update_state(*state);
5818         lockmgr(&(*state)->lk, LK_RELEASE);
5819         return (PF_PASS);
5820 }
5821
5822 /*
5823  * ipoff and off are measured from the start of the mbuf chain.
5824  * h must be at "ipoff" on the mbuf chain.
5825  */
5826 void *
5827 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5828     u_short *actionp, u_short *reasonp, sa_family_t af)
5829 {
5830         switch (af) {
5831 #ifdef INET
5832         case AF_INET: {
5833                 struct ip       *h = mtod(m, struct ip *);
5834                 u_int16_t        fragoff = (h->ip_off & IP_OFFMASK) << 3;
5835
5836                 if (fragoff) {
5837                         if (fragoff >= len)
5838                                 ACTION_SET(actionp, PF_PASS);
5839                         else {
5840                                 ACTION_SET(actionp, PF_DROP);
5841                                 REASON_SET(reasonp, PFRES_FRAG);
5842                         }
5843                         return (NULL);
5844                 }
5845                 if (m->m_pkthdr.len < off + len ||
5846                     h->ip_len < off + len) {
5847                         ACTION_SET(actionp, PF_DROP);
5848                         REASON_SET(reasonp, PFRES_SHORT);
5849                         return (NULL);
5850                 }
5851                 break;
5852         }
5853 #endif /* INET */
5854 #ifdef INET6
5855         case AF_INET6: {
5856                 struct ip6_hdr  *h = mtod(m, struct ip6_hdr *);
5857
5858                 if (m->m_pkthdr.len < off + len ||
5859                     (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5860                     (unsigned)(off + len)) {
5861                         ACTION_SET(actionp, PF_DROP);
5862                         REASON_SET(reasonp, PFRES_SHORT);
5863                         return (NULL);
5864                 }
5865                 break;
5866         }
5867 #endif /* INET6 */
5868         }
5869         m_copydata(m, off, len, p);
5870         return (p);
5871 }
5872
5873 int
5874 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif)
5875 {
5876         struct sockaddr_in      *dst;
5877         int                      ret = 1;
5878         int                      check_mpath;
5879 #ifdef INET6
5880         struct sockaddr_in6     *dst6;
5881         struct route_in6         ro;
5882 #else
5883         struct route             ro;
5884 #endif
5885         struct radix_node       *rn;
5886         struct rtentry          *rt;
5887         struct ifnet            *ifp;
5888
5889         check_mpath = 0;
5890         bzero(&ro, sizeof(ro));
5891         switch (af) {
5892         case AF_INET:
5893                 dst = satosin(&ro.ro_dst);
5894                 dst->sin_family = AF_INET;
5895                 dst->sin_len = sizeof(*dst);
5896                 dst->sin_addr = addr->v4;
5897                 break;
5898 #ifdef INET6
5899         case AF_INET6:
5900                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5901                 dst6->sin6_family = AF_INET6;
5902                 dst6->sin6_len = sizeof(*dst6);
5903                 dst6->sin6_addr = addr->v6;
5904                 break;
5905 #endif /* INET6 */
5906         default:
5907                 return (0);
5908         }
5909
5910         /* Skip checks for ipsec interfaces */
5911         if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5912                 goto out;
5913
5914         rtalloc_ign((struct route *)&ro, 0);
5915
5916         if (ro.ro_rt != NULL) {
5917                 /* No interface given, this is a no-route check */
5918                 if (kif == NULL)
5919                         goto out;
5920
5921                 if (kif->pfik_ifp == NULL) {
5922                         ret = 0;
5923                         goto out;
5924                 }
5925
5926                 /* Perform uRPF check if passed input interface */
5927                 ret = 0;
5928                 rn = (struct radix_node *)ro.ro_rt;
5929                 do {
5930                         rt = (struct rtentry *)rn;
5931                         ifp = rt->rt_ifp;
5932
5933                         if (kif->pfik_ifp == ifp)
5934                                 ret = 1;
5935                         rn = NULL;
5936                 } while (check_mpath == 1 && rn != NULL && ret == 0);
5937         } else
5938                 ret = 0;
5939 out:
5940         if (ro.ro_rt != NULL)
5941                 RTFREE(ro.ro_rt);
5942         return (ret);
5943 }
5944
5945 int
5946 pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw)
5947 {
5948         struct sockaddr_in      *dst;
5949 #ifdef INET6
5950         struct sockaddr_in6     *dst6;
5951         struct route_in6         ro;
5952 #else
5953         struct route             ro;
5954 #endif
5955         int                      ret = 0;
5956
5957         ASSERT_LWKT_TOKEN_HELD(&pf_token);
5958
5959         bzero(&ro, sizeof(ro));
5960         switch (af) {
5961         case AF_INET:
5962                 dst = satosin(&ro.ro_dst);
5963                 dst->sin_family = AF_INET;
5964                 dst->sin_len = sizeof(*dst);
5965                 dst->sin_addr = addr->v4;
5966                 break;
5967 #ifdef INET6
5968         case AF_INET6:
5969                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5970                 dst6->sin6_family = AF_INET6;
5971                 dst6->sin6_len = sizeof(*dst6);
5972                 dst6->sin6_addr = addr->v6;
5973                 break;
5974 #endif /* INET6 */
5975         default:
5976                 return (0);
5977         }
5978
5979 rtalloc_ign((struct route *)&ro, (RTF_CLONING | RTF_PRCLONING));
5980
5981         if (ro.ro_rt != NULL) {
5982                 RTFREE(ro.ro_rt);
5983         }
5984
5985         return (ret);
5986 }
5987
5988 #ifdef INET
5989 void
5990 pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5991     struct pf_state *s, struct pf_pdesc *pd)
5992 {
5993         struct mbuf             *m0, *m1;
5994         struct route             iproute;
5995         struct route            *ro = NULL;
5996         struct sockaddr_in      *dst;
5997         struct ip               *ip;
5998         struct ifnet            *ifp = NULL;
5999         struct pf_addr           naddr;
6000         struct pf_src_node      *sn = NULL;
6001         int                      error = 0;
6002         int sw_csum;
6003 #ifdef IPSEC
6004         struct m_tag            *mtag;
6005 #endif /* IPSEC */
6006
6007         ASSERT_LWKT_TOKEN_HELD(&pf_token);
6008
6009         if (m == NULL || *m == NULL || r == NULL ||
6010             (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6011                 panic("pf_route: invalid parameters");
6012
6013         if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6014                 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6015                 (*m)->m_pkthdr.pf.routed = 1;
6016         } else {
6017                 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6018                         m0 = *m;
6019                         *m = NULL;
6020                         goto bad;
6021                 }
6022         }
6023
6024         if (r->rt == PF_DUPTO) {
6025                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
6026                         return;
6027                 }
6028         } else {
6029                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
6030                         return;
6031                 }
6032                 m0 = *m;
6033         }
6034
6035         if (m0->m_len < sizeof(struct ip)) {
6036                 DPFPRINTF(PF_DEBUG_URGENT,
6037                     ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6038                 goto bad;
6039         }
6040
6041         ip = mtod(m0, struct ip *);
6042
6043         ro = &iproute;
6044         bzero((caddr_t)ro, sizeof(*ro));
6045         dst = satosin(&ro->ro_dst);
6046         dst->sin_family = AF_INET;
6047         dst->sin_len = sizeof(*dst);
6048         dst->sin_addr = ip->ip_dst;
6049
6050         if (r->rt == PF_FASTROUTE) {
6051                 rtalloc(ro);
6052                 if (ro->ro_rt == 0) {
6053                         ipstat.ips_noroute++;
6054                         goto bad;
6055                 }
6056
6057                 ifp = ro->ro_rt->rt_ifp;
6058                 ro->ro_rt->rt_use++;
6059
6060                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
6061                         dst = satosin(ro->ro_rt->rt_gateway);
6062         } else {
6063                 if (TAILQ_EMPTY(&r->rpool.list)) {
6064                         DPFPRINTF(PF_DEBUG_URGENT,
6065                             ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n"));
6066                         goto bad;
6067                 }
6068                 if (s == NULL) {
6069                         pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
6070                             &naddr, NULL, &sn);
6071                         if (!PF_AZERO(&naddr, AF_INET))
6072                                 dst->sin_addr.s_addr = naddr.v4.s_addr;
6073                         ifp = r->rpool.cur->kif ?
6074                             r->rpool.cur->kif->pfik_ifp : NULL;
6075                 } else {
6076                         if (!PF_AZERO(&s->rt_addr, AF_INET))
6077                                 dst->sin_addr.s_addr =
6078                                     s->rt_addr.v4.s_addr;
6079                         ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6080                 }
6081         }
6082         if (ifp == NULL)
6083                 goto bad;
6084
6085         if (oifp != ifp) {
6086                 if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6087                         goto bad;
6088                 } else if (m0 == NULL) {
6089                         goto done;
6090                 }
6091                 if (m0->m_len < sizeof(struct ip)) {
6092                         DPFPRINTF(PF_DEBUG_URGENT,
6093                             ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6094                         goto bad;
6095                 }
6096                 ip = mtod(m0, struct ip *);
6097         }
6098
6099         /* Copied from FreeBSD 5.1-CURRENT ip_output. */
6100         m0->m_pkthdr.csum_flags |= CSUM_IP;
6101         sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
6102         if (sw_csum & CSUM_DELAY_DATA) {
6103                 in_delayed_cksum(m0);
6104                 sw_csum &= ~CSUM_DELAY_DATA;
6105         }
6106         m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
6107         m0->m_pkthdr.csum_iphlen = (ip->ip_hl << 2);
6108
6109         /*
6110          * WARNING!  We cannot fragment if the packet was modified from an
6111          *           original which expected to be using TSO.  In this
6112          *           situation we pray that the target interface is
6113          *           compatible with the originating interface.
6114          */
6115         if (ip->ip_len <= ifp->if_mtu ||
6116             (m0->m_pkthdr.csum_flags & CSUM_TSO) ||
6117             ((ifp->if_hwassist & CSUM_FRAGMENT) &&
6118                 (ip->ip_off & IP_DF) == 0)) {
6119                 ip->ip_len = htons(ip->ip_len);
6120                 ip->ip_off = htons(ip->ip_off);
6121                 ip->ip_sum = 0;
6122                 if (sw_csum & CSUM_DELAY_IP) {
6123                         /* From KAME */
6124                         if (ip->ip_v == IPVERSION &&
6125                             (ip->ip_hl << 2) == sizeof(*ip)) {
6126                                 ip->ip_sum = in_cksum_hdr(ip);
6127                         } else {
6128                                 ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
6129                         }
6130                 }
6131                 lwkt_reltoken(&pf_token);
6132                 error = ifp->if_output(ifp, m0, sintosa(dst), ro->ro_rt);
6133                 lwkt_gettoken(&pf_token);
6134                 goto done;
6135         }
6136
6137         /*
6138          * Too large for interface; fragment if possible.
6139          * Must be able to put at least 8 bytes per fragment.
6140          */
6141         if (ip->ip_off & IP_DF) {
6142                 ipstat.ips_cantfrag++;
6143                 if (r->rt != PF_DUPTO) {
6144                         icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
6145                                    ifp->if_mtu);
6146                         goto done;
6147                 } else
6148                         goto bad;
6149         }
6150
6151         m1 = m0;
6152         error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
6153         if (error) {
6154                 goto bad;
6155         }
6156
6157         for (m0 = m1; m0; m0 = m1) {
6158                 m1 = m0->m_nextpkt;
6159                 m0->m_nextpkt = 0;
6160                 if (error == 0) {
6161                         lwkt_reltoken(&pf_token);
6162                         error = (*ifp->if_output)(ifp, m0, sintosa(dst),
6163                                                   NULL);
6164                         lwkt_gettoken(&pf_token);
6165                 } else
6166                         m_freem(m0);
6167         }
6168
6169         if (error == 0)
6170                 ipstat.ips_fragmented++;
6171
6172 done:
6173         if (r->rt != PF_DUPTO)
6174                 *m = NULL;
6175         if (ro == &iproute && ro->ro_rt)
6176                 RTFREE(ro->ro_rt);
6177         return;
6178
6179 bad:
6180         m_freem(m0);
6181         goto done;
6182 }
6183 #endif /* INET */
6184
6185 #ifdef INET6
6186 void
6187 pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
6188     struct pf_state *s, struct pf_pdesc *pd)
6189 {
6190         struct mbuf             *m0;
6191         struct route_in6         ip6route;
6192         struct route_in6        *ro;
6193         struct sockaddr_in6     *dst;
6194         struct ip6_hdr          *ip6;
6195         struct ifnet            *ifp = NULL;
6196         struct pf_addr           naddr;
6197         struct pf_src_node      *sn = NULL;
6198
6199         if (m == NULL || *m == NULL || r == NULL ||
6200             (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6201                 panic("pf_route6: invalid parameters");
6202
6203         if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6204                 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6205                 (*m)->m_pkthdr.pf.routed = 1;
6206         } else {
6207                 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6208                         m0 = *m;
6209                         *m = NULL;
6210                         goto bad;
6211                 }
6212         }
6213
6214         if (r->rt == PF_DUPTO) {
6215                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL)
6216                         return;
6217         } else {
6218                 if ((r->rt == PF_REPLYTO) == (r->direction == dir))
6219                         return;
6220                 m0 = *m;
6221         }
6222
6223         if (m0->m_len < sizeof(struct ip6_hdr)) {
6224                 DPFPRINTF(PF_DEBUG_URGENT,
6225                     ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6226                 goto bad;
6227         }
6228         ip6 = mtod(m0, struct ip6_hdr *);
6229
6230         ro = &ip6route;
6231         bzero((caddr_t)ro, sizeof(*ro));
6232         dst = (struct sockaddr_in6 *)&ro->ro_dst;
6233         dst->sin6_family = AF_INET6;
6234         dst->sin6_len = sizeof(*dst);
6235         dst->sin6_addr = ip6->ip6_dst;
6236
6237         /*
6238          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
6239          * so make sure pf.flags is clear.
6240          *
6241          * Cheat. XXX why only in the v6 case???
6242          */
6243         if (r->rt == PF_FASTROUTE) {
6244                 m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
6245                 m0->m_pkthdr.pf.flags = 0;
6246                 /* XXX Re-Check when Upgrading to > 4.4 */
6247                 m0->m_pkthdr.pf.statekey = NULL;
6248                 ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
6249                 return;
6250         }
6251
6252         if (TAILQ_EMPTY(&r->rpool.list)) {
6253                 DPFPRINTF(PF_DEBUG_URGENT,
6254                     ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n"));
6255                 goto bad;
6256         }
6257         if (s == NULL) {
6258                 pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
6259                     &naddr, NULL, &sn);
6260                 if (!PF_AZERO(&naddr, AF_INET6))
6261                         PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6262                             &naddr, AF_INET6);
6263                 ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
6264         } else {
6265                 if (!PF_AZERO(&s->rt_addr, AF_INET6))
6266                         PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6267                             &s->rt_addr, AF_INET6);
6268                 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6269         }
6270         if (ifp == NULL)
6271                 goto bad;
6272
6273         if (oifp != ifp) {
6274                 if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6275                         goto bad;
6276                 } else if (m0 == NULL) {
6277                         goto done;
6278                 }
6279                 if (m0->m_len < sizeof(struct ip6_hdr)) {
6280                         DPFPRINTF(PF_DEBUG_URGENT,
6281                             ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6282                         goto bad;
6283                 }
6284                 ip6 = mtod(m0, struct ip6_hdr *);
6285         }
6286
6287         /*
6288          * If the packet is too large for the outgoing interface,
6289          * send back an icmp6 error.
6290          */
6291         if (IN6_IS_ADDR_LINKLOCAL(&dst->sin6_addr))
6292                 dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
6293         if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
6294                 nd6_output(ifp, ifp, m0, dst, NULL);
6295         } else {
6296                 in6_ifstat_inc(ifp, ifs6_in_toobig);
6297                 if (r->rt != PF_DUPTO)
6298                         icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
6299                 else
6300                         goto bad;
6301         }
6302
6303 done:
6304         if (r->rt != PF_DUPTO)
6305                 *m = NULL;
6306         return;
6307
6308 bad:
6309         m_freem(m0);
6310         goto done;
6311 }
6312 #endif /* INET6 */
6313
6314
6315 /*
6316  * check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag
6317  *   off is the offset where the protocol header starts
6318  *   len is the total length of protocol header plus payload
6319  * returns 0 when the checksum is valid, otherwise returns 1.
6320  */
6321 /*
6322  * XXX
6323  * FreeBSD supports cksum offload for the following drivers.
6324  * em(4), gx(4), lge(4), nge(4), ti(4), xl(4)
6325  * If we can make full use of it we would outperform ipfw/ipfilter in
6326  * very heavy traffic. 
6327  * I have not tested 'cause I don't have NICs that supports cksum offload.
6328  * (There might be problems. Typical phenomena would be
6329  *   1. No route message for UDP packet.
6330  *   2. No connection acceptance from external hosts regardless of rule set.)
6331  */
6332 int
6333 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p,
6334     sa_family_t af)
6335 {
6336         u_int16_t sum = 0;
6337         int hw_assist = 0;
6338         struct ip *ip;
6339
6340         if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
6341                 return (1);
6342         if (m->m_pkthdr.len < off + len)
6343                 return (1);
6344
6345         switch (p) {
6346         case IPPROTO_TCP:
6347         case IPPROTO_UDP:
6348                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
6349                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6350                                 sum = m->m_pkthdr.csum_data;
6351                         } else {
6352                                 ip = mtod(m, struct ip *);      
6353                                 sum = in_pseudo(ip->ip_src.s_addr,
6354                                         ip->ip_dst.s_addr, htonl((u_short)len +
6355                                         m->m_pkthdr.csum_data + p));
6356                         }
6357                         sum ^= 0xffff;
6358                         ++hw_assist;
6359                 }
6360                 break;
6361         case IPPROTO_ICMP:
6362 #ifdef INET6
6363         case IPPROTO_ICMPV6:
6364 #endif /* INET6 */
6365                 break;
6366         default:
6367                 return (1);
6368         }
6369
6370         if (!hw_assist) {
6371                 switch (af) {
6372                 case AF_INET:
6373                         if (p == IPPROTO_ICMP) {
6374                                 if (m->m_len < off)
6375                                         return (1);
6376                                 m->m_data += off;
6377                                 m->m_len -= off;
6378                                 sum = in_cksum(m, len);
6379                                 m->m_data -= off;
6380                                 m->m_len += off;
6381                         } else {
6382                                 if (m->m_len < sizeof(struct ip))
6383                                         return (1);
6384                                 sum = in_cksum_range(m, p, off, len);
6385                                 if (sum == 0) {
6386                                         m->m_pkthdr.csum_flags |=
6387                                             (CSUM_DATA_VALID |
6388                                              CSUM_PSEUDO_HDR);
6389                                         m->m_pkthdr.csum_data = 0xffff;
6390                                 }
6391                         }
6392                         break;
6393 #ifdef INET6
6394                 case AF_INET6:
6395                         if (m->m_len < sizeof(struct ip6_hdr))
6396                                 return (1);
6397                         sum = in6_cksum(m, p, off, len);
6398                         /*
6399                          * XXX
6400                          * IPv6 H/W cksum off-load not supported yet!
6401                          *
6402                          * if (sum == 0) {
6403                          *      m->m_pkthdr.csum_flags |=
6404                          *          (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
6405                          *      m->m_pkthdr.csum_data = 0xffff;
6406                          *}
6407                          */
6408                         break;
6409 #endif /* INET6 */
6410                 default:
6411                         return (1);
6412                 }
6413         }
6414         if (sum) {
6415                 switch (p) {
6416                 case IPPROTO_TCP:
6417                         tcpstat.tcps_rcvbadsum++;
6418                         break;
6419                 case IPPROTO_UDP:
6420                         udp_stat.udps_badsum++;
6421                         break;
6422                 case IPPROTO_ICMP:
6423                         icmpstat.icps_checksum++;
6424                         break;
6425 #ifdef INET6
6426                 case IPPROTO_ICMPV6:
6427                         icmp6stat.icp6s_checksum++;
6428                         break;
6429 #endif /* INET6 */
6430                 }
6431                 return (1);
6432         }
6433         return (0);
6434 }
6435
6436 struct pf_divert *
6437 pf_find_divert(struct mbuf *m)
6438 {
6439         struct m_tag    *mtag;
6440
6441         if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL)
6442                 return (NULL);
6443
6444         return ((struct pf_divert *)(mtag + 1));
6445 }
6446
6447 struct pf_divert *
6448 pf_get_divert(struct mbuf *m)
6449 {
6450         struct m_tag    *mtag;
6451
6452         if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL) {
6453                 mtag = m_tag_get(PACKET_TAG_PF_DIVERT, sizeof(struct pf_divert),
6454                     M_NOWAIT);
6455                 if (mtag == NULL)
6456                         return (NULL);
6457                 bzero(mtag + 1, sizeof(struct pf_divert));
6458                 m_tag_prepend(m, mtag);
6459         }
6460
6461         return ((struct pf_divert *)(mtag + 1));
6462 }
6463
6464 #ifdef INET
6465
/*
 * pf_test() - filter one IPv4 packet.
 *
 * dir  PF_IN or PF_OUT; selects the source/destination indices stored in
 *      the packet descriptor and the direction slot of every counter
 *      updated here.
 * ifp  interface the packet is seen on; its if_pf_kif supplies the pf
 *      interface state (kif).
 * m0   in/out packet pointer.  Normalization may substitute a different
 *      mbuf, *m0 is set to NULL when the packet is consumed by the
 *      synproxy path, and pf_route() may also NULL it.
 * eh   stored in the packet descriptor (pd.eh).
 * inp  passed through to pf_test_rule().
 *
 * Returns PF_PASS without inspecting the packet when pf is not running,
 * when the interface has PFI_IFLAG_SKIP set, or when the mbuf already has
 * PF_MBUF_TAGGED in fw_flags.  Returns PF_DROP when the interface has no
 * pf state attached.  Otherwise the verdict comes from the state lookup or,
 * when no state exists, from the ruleset; PF_SYNPROXY_DROP is reported to
 * the caller as PF_PASS after the mbuf has been freed.
 */
6466 /*
6467  * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6468  */
6469 int
6470 pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
6471     struct ether_header *eh, struct inpcb *inp)
6472 {
6473         struct pfi_kif          *kif;
6474         u_short                  action, reason = 0, log = 0;
6475         struct mbuf             *m = *m0;
6476         struct ip               *h = NULL;
6477         struct pf_rule          *a = NULL, *r = &pf_default_rule, *tr, *nr;
6478         struct pf_state         *s = NULL;
6479         struct pf_ruleset       *ruleset = NULL;
6480         struct pf_pdesc          pd;
6481         int                      off, dirndx;
6482 #ifdef ALTQ
6483         int                      pqid = 0;
6484 #endif
6485
             /* Filtering is switched off: let the packet through untouched. */
6486         if (!pf_status.running)
6487                 return (PF_PASS);
6488
6489         memset(&pd, 0, sizeof(pd));
6490 #ifdef foo
6491         if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6492                 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6493         else
6494 #endif
6495                 kif = (struct pfi_kif *)ifp->if_pf_kif;
6496
             /* Without pf interface state no rule can be evaluated: drop. */
6497         if (kif == NULL) {
6498                 DPFPRINTF(PF_DEBUG_URGENT,
6499                     ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
6500                 return (PF_DROP);
6501         }
             /* The interface is flagged to be skipped by pf. */
6502         if (kif->pfik_flags & PFI_IFLAG_SKIP)
6503                 return (PF_PASS);
6504
6505 #ifdef DIAGNOSTIC
6506         if ((m->m_flags & M_PKTHDR) == 0)
6507                 panic("non-M_PKTHDR is passed to pf_test");
6508 #endif /* DIAGNOSTIC */
6509
             /*
              * Too short to hold an IPv4 header.  Note that h is still NULL
              * and r is still the default rule when 'done' is reached from
              * here.
              */
6510         if (m->m_pkthdr.len < (int)sizeof(*h)) {
6511                 action = PF_DROP;
6512                 REASON_SET(&reason, PFRES_SHORT);
6513                 log = 1;
6514                 goto done;
6515         }
6516
6517         /*
6518          * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
6519          * so make sure pf.flags is clear.
6520          */
             /*
              * An mbuf that already carries PF_MBUF_TAGGED is passed without
              * being filtered again (presumably it was handled by pf before;
              * verify against the code that sets the flag).
              */
6521         if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6522                 return (PF_PASS);
6523         m->m_pkthdr.pf.flags = 0;
6524         /* Re-Check when updating to > 4.4 */
6525         m->m_pkthdr.pf.statekey = NULL;
6526
6527         /* We do IP header normalization and packet reassembly here */
6528         if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
6529                 action = PF_DROP;
6530                 goto done;
6531         }
6532         m = *m0;        /* pf_normalize messes with m0 */
6533         h = mtod(m, struct ip *);
6534
             /* ip_hl << 2 is the header length in bytes, i.e. the payload offset. */
6535         off = h->ip_hl << 2;
6536         if (off < (int)sizeof(*h)) {
6537                 action = PF_DROP;
6538                 REASON_SET(&reason, PFRES_SHORT);
6539                 log = 1;
6540                 goto done;
6541         }
6542
             /*
              * Fill in the packet descriptor used by the state and rule code.
              * sidx/didx are 0/1 for inbound and 1/0 for outbound packets.
              */
6543         pd.src = (struct pf_addr *)&h->ip_src;
6544         pd.dst = (struct pf_addr *)&h->ip_dst;
6545         pd.sport = pd.dport = NULL;
6546         pd.ip_sum = &h->ip_sum;
6547         pd.proto_sum = NULL;
6548         pd.proto = h->ip_p;
6549         pd.dir = dir;
6550         pd.sidx = (dir == PF_IN) ? 0 : 1;
6551         pd.didx = (dir == PF_IN) ? 1 : 0;
6552         pd.af = AF_INET;
6553         pd.tos = h->ip_tos;
6554         pd.tot_len = h->ip_len;
6555         pd.eh = eh;
6556
6557         /* handle fragments that didn't get reassembled by normalization */
6558         if (h->ip_off & (IP_MF | IP_OFFMASK)) {
6559                 action = pf_test_fragment(&r, dir, kif, m, h,
6560                     &pd, &a, &ruleset);
6561                 goto done;
6562         }
6563
             /*
              * Per-protocol handling.  Every branch follows the same pattern:
              * try the state lookup first; on PF_PASS take rule, anchor and
              * log setting from the state, and only when no state was found
              * (s == NULL) evaluate the ruleset with pf_test_rule().
              */
6564         switch (h->ip_p) {
6565
6566         case IPPROTO_TCP: {
6567                 struct tcphdr   th;
6568
6569                 pd.hdr.tcp = &th;
6570                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6571                     &action, &reason, AF_INET)) {
6572                         log = action != PF_PASS;
6573                         goto done;
6574                 }
                     /* Payload length: total minus IP header minus TCP header. */
6575                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6576 #ifdef ALTQ
                     /* ACK without payload: use the rule's pqid queue at 'done'. */
6577                 if ((th.th_flags & TH_ACK) && pd.p_len == 0)
6578                         pqid = 1;
6579 #endif
6580                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6581                 if (action == PF_DROP)
6582                         goto done;
6583                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6584                                            &reason);
6585                 if (action == PF_PASS) {
6586                         r = s->rule.ptr;
6587                         a = s->anchor.ptr;
6588                         log = s->log;
6589                 } else if (s == NULL) {
6590                         action = pf_test_rule(&r, &s, dir, kif,
6591                                               m, off, h, &pd, &a,
6592                                               &ruleset, NULL, inp);
6593                 }
6594                 break;
6595         }
6596
6597         case IPPROTO_UDP: {
6598                 struct udphdr   uh;
6599
6600                 pd.hdr.udp = &uh;
6601                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6602                     &action, &reason, AF_INET)) {
6603                         log = action != PF_PASS;
6604                         goto done;
6605                 }
                     /*
                      * Drop a zero destination port and a UDP length that is
                      * larger than what remains of the packet or smaller than
                      * the UDP header itself.
                      */
6606                 if (uh.uh_dport == 0 ||
6607                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6608                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6609                         action = PF_DROP;
6610                         REASON_SET(&reason, PFRES_SHORT);
6611                         goto done;
6612                 }
6613                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6614                 if (action == PF_PASS) {
6615                         r = s->rule.ptr;
6616                         a = s->anchor.ptr;
6617                         log = s->log;
6618                 } else if (s == NULL) {
6619                         action = pf_test_rule(&r, &s, dir, kif,
6620                                               m, off, h, &pd, &a,
6621                                               &ruleset, NULL, inp);
6622                 }
6623                 break;
6624         }
6625
6626         case IPPROTO_ICMP: {
6627                 struct icmp     ih;
6628
6629                 pd.hdr.icmp = &ih;
                     /* Only the first ICMP_MINLEN bytes of the header are pulled. */
6630                 if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
6631                     &action, &reason, AF_INET)) {
6632                         log = action != PF_PASS;
6633                         goto done;
6634                 }
6635                 action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
6636                                             &reason);
6637                 if (action == PF_PASS) {
6638                         r = s->rule.ptr;
6639                         a = s->anchor.ptr;
6640                         log = s->log;
6641                 } else if (s == NULL) {
6642                         action = pf_test_rule(&r, &s, dir, kif,
6643                                               m, off, h, &pd, &a,
6644                                               &ruleset, NULL, inp);
6645                 }
6646                 break;
6647         }
6648
6649         default:
6650                 action = pf_test_state_other(&s, dir, kif, m, &pd);
6651                 if (action == PF_PASS) {
6652                         r = s->rule.ptr;
6653                         a = s->anchor.ptr;
6654                         log = s->log;
6655                 } else if (s == NULL) {
6656                         action = pf_test_rule(&r, &s, dir, kif, m, off, h,
6657                                               &pd, &a, &ruleset, NULL, inp);
6658                 }
6659                 break;
6660         }
6661
6662 done:
             /*
              * A passed packet whose IP header is longer than 5 words (i.e.
              * carries options) is dropped unless the state or the rule
              * allows options.  The action test comes first, so h is not
              * dereferenced on the early-exit paths where it is still NULL.
              */
6663         if (action == PF_PASS && h->ip_hl > 5 &&
6664             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6665                 action = PF_DROP;
6666                 REASON_SET(&reason, PFRES_IPOPTIONS);
6667                 log = 1;
6668                 DPFPRINTF(PF_DEBUG_MISC,
6669                     ("pf: dropping packet with ip options\n"));
6670         }
6671
             /* Apply the state's tag and/or the rule's routing table id. */
6672         if ((s && s->tag) || r->rtableid)
6673                 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
6674
6675 #if 0
6676         if (dir == PF_IN && s && s->key[PF_SK_STACK])
6677                 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
6678 #endif
6679
6680 #ifdef ALTQ
6681         /*
6682          * Generate a hash code and qid request for ALTQ.  A qid of 0
6683          * is allowed and will cause altq to select the default queue.
6684          */
6685         if (action == PF_PASS) {
6686                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
6687                 if (pqid || (pd.tos & IPTOS_LOWDELAY))
6688                         m->m_pkthdr.pf.qid = r->pqid;
6689                 else
6690                         m->m_pkthdr.pf.qid = r->qid;
6691                 m->m_pkthdr.pf.ecn_af = AF_INET;
6692                 m->m_pkthdr.pf.hdr = h;
6693                 /* add connection hash for fairq */
6694                 if (s) {
6695                         /* for fairq */
6696                         m->m_pkthdr.pf.state_hash = s->hash;
6697                         m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
6698                 }
6699         }
6700 #endif /* ALTQ */
6701
6702         /*
6703          * connections redirected to loopback should not match sockets
6704          * bound specifically to loopback due to security implications,
6705          * see tcp_input() and in_pcblookup_listen().
6706          */
6707         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6708             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6709             (s->nat_rule.ptr->action == PF_RDR ||
6710             s->nat_rule.ptr->action == PF_BINAT) &&
6711             (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
6712                 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
6713
             /*
              * Inbound packet passed by a rule with a divert port: record the
              * divert port/address in the mbuf's divert tag.  If the tag
              * cannot be obtained the packet is simply left unmarked.
              */
6714         if (dir == PF_IN && action == PF_PASS && r->divert.port) {
6715                 struct pf_divert *divert;
6716
6717                 if ((divert = pf_get_divert(m))) {
6718                         m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
6719                         divert->port = r->divert.port;
6720                         divert->addr.ipv4 = r->divert.addr.v4;
6721                 }
6722         }
6723
6724         if (log) {
6725                 struct pf_rule *lr;
6726
                     /* Log against the NAT rule when it has PF_LOG_ALL set. */
6727                 if (s != NULL && s->nat_rule.ptr != NULL &&
6728                     s->nat_rule.ptr->log & PF_LOG_ALL)
6729                         lr = s->nat_rule.ptr;
6730                 else
6731                         lr = r;
6732                 PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset,
6733                     &pd);
6734         }
6735
             /*
              * Per-interface counters, indexed [0][outbound?][not passed?];
              * the first index is 0 here and 1 in pf_test6().
              */
6736         kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6737         kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
6738
             /* Rule, anchor, state, source-node and table accounting. */
6739         if (action == PF_PASS || r->action == PF_DROP) {
6740                 dirndx = (dir == PF_OUT);
6741                 r->packets[dirndx]++;
6742                 r->bytes[dirndx] += pd.tot_len;
6743                 if (a != NULL) {
6744                         a->packets[dirndx]++;
6745                         a->bytes[dirndx] += pd.tot_len;
6746                 }
6747                 if (s != NULL) {
6748                         if (s->nat_rule.ptr != NULL) {
6749                                 s->nat_rule.ptr->packets[dirndx]++;
6750                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6751                         }
6752                         if (s->src_node != NULL) {
6753                                 s->src_node->packets[dirndx]++;
6754                                 s->src_node->bytes[dirndx] += pd.tot_len;
6755                         }
6756                         if (s->nat_src_node != NULL) {
6757                                 s->nat_src_node->packets[dirndx]++;
6758                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6759                         }
                             /*
                              * State counters are relative to the state's own
                              * direction: slot 0 when the packet travels the
                              * same way as the state, slot 1 otherwise.
                              */
6760                         dirndx = (dir == s->direction) ? 0 : 1;
6761                         s->packets[dirndx]++;
6762                         s->bytes[dirndx] += pd.tot_len;
6763                 }
                     /*
                      * Table statistics are taken from the NAT rule when the
                      * filter rule is only the default rule.  With a state the
                      * addresses come from the state key instead of the packet.
                      */
6764                 tr = r;
6765                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6766                 if (nr != NULL && r == &pf_default_rule)
6767                         tr = nr;
6768                 if (tr->src.addr.type == PF_ADDR_TABLE)
6769                         pfr_update_stats(tr->src.addr.p.tbl,
6770                             (s == NULL) ? pd.src :
6771                             &s->key[(s->direction == PF_IN)]->
6772                                 addr[(s->direction == PF_OUT)],
6773                             pd.af, pd.tot_len, dir == PF_OUT,
6774                             r->action == PF_PASS, tr->src.neg);
6775                 if (tr->dst.addr.type == PF_ADDR_TABLE)
6776                         pfr_update_stats(tr->dst.addr.p.tbl,
6777                             (s == NULL) ? pd.dst :
6778                             &s->key[(s->direction == PF_IN)]->
6779                                 addr[(s->direction == PF_IN)],
6780                             pd.af, pd.tot_len, dir == PF_OUT,
6781                             r->action == PF_PASS, tr->dst.neg);
6782         }
6783
6784
             /*
              * PF_SYNPROXY_DROP: the packet is freed here, NULL is handed back
              * through m0 and the caller sees PF_PASS.
              */
6785         if (action == PF_SYNPROXY_DROP) {
6786                 m_freem(*m0);
6787                 *m0 = NULL;
6788                 action = PF_PASS;
6789         } else if (r->rt) {
6790                 /* pf_route can free the mbuf causing *m0 to become NULL */
6791                 pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
6792         }
6793
6794         return (action);
6795 }
6796 #endif /* INET */
6797
6798 #ifdef INET6
6799
6800 /*
6801  * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6802  */
6803 int
6804 pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
6805     struct ether_header *eh, struct inpcb *inp)
6806 {
6807         struct pfi_kif          *kif;
6808         u_short                  action, reason = 0, log = 0;
6809         struct mbuf             *m = *m0, *n = NULL;
6810         struct ip6_hdr          *h = NULL;
6811         struct pf_rule          *a = NULL, *r = &pf_default_rule, *tr, *nr;
6812         struct pf_state         *s = NULL;
6813         struct pf_ruleset       *ruleset = NULL;
6814         struct pf_pdesc          pd;
6815         int                      off, terminal = 0, dirndx, rh_cnt = 0;
6816
6817         if (!pf_status.running)
6818                 return (PF_PASS);
6819
6820         memset(&pd, 0, sizeof(pd));
6821 #ifdef foo
6822         if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6823                 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6824         else
6825 #endif
6826                 kif = (struct pfi_kif *)ifp->if_pf_kif;
6827
6828         if (kif == NULL) {
6829                 DPFPRINTF(PF_DEBUG_URGENT,
6830                     ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6831                 return (PF_DROP);
6832         }
6833         if (kif->pfik_flags & PFI_IFLAG_SKIP)
6834                 return (PF_PASS);
6835
6836 #ifdef DIAGNOSTIC
6837         if ((m->m_flags & M_PKTHDR) == 0)
6838                 panic("non-M_PKTHDR is passed to pf_test6");
6839 #endif /* DIAGNOSTIC */
6840
6841         if (m->m_pkthdr.len < (int)sizeof(*h)) {
6842                 action = PF_DROP;
6843                 REASON_SET(&reason, PFRES_SHORT);
6844                 log = 1;
6845                 goto done;
6846         }
6847
6848         /*
6849          * DragonFly doesn't zero the auxillary pkghdr fields, only fw_flags,
6850          * so make sure pf.flags is clear.
6851          */
6852         if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6853                 return (PF_PASS);
6854         m->m_pkthdr.pf.flags = 0;
6855         /* Re-Check when updating to > 4.4 */
6856         m->m_pkthdr.pf.statekey = NULL;
6857
6858         /* We do IP header normalization and packet reassembly here */
6859         if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6860                 action = PF_DROP;
6861                 goto done;
6862         }
6863         m = *m0;        /* pf_normalize messes with m0 */
6864         h = mtod(m, struct ip6_hdr *);
6865
6866 #if 1
6867         /*
6868          * we do not support jumbogram yet.  if we keep going, zero ip6_plen
6869          * will do something bad, so drop the packet for now.
6870          */
6871         if (htons(h->ip6_plen) == 0) {
6872                 action = PF_DROP;
6873                 REASON_SET(&reason, PFRES_NORM);        /*XXX*/
6874                 goto done;
6875         }
6876 #endif
6877
6878         pd.src = (struct pf_addr *)&h->ip6_src;
6879         pd.dst = (struct pf_addr *)&h->ip6_dst;
6880         pd.sport = pd.dport = NULL;
6881         pd.ip_sum = NULL;
6882         pd.proto_sum = NULL;
6883         pd.dir = dir;
6884         pd.sidx = (dir == PF_IN) ? 0 : 1;
6885         pd.didx = (dir == PF_IN) ? 1 : 0;
6886         pd.af = AF_INET6;
6887         pd.tos = 0;
6888         pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6889         pd.eh = eh;
6890
6891         off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6892         pd.proto = h->ip6_nxt;
6893         do {
6894                 switch (pd.proto) {
6895                 case IPPROTO_FRAGMENT:
6896                         action = pf_test_fragment(&r, dir, kif, m, h,
6897                             &pd, &a, &ruleset);
6898                         if (action == PF_DROP)
6899                                 REASON_SET(&reason, PFRES_FRAG);
6900                         goto done;
6901                 case IPPROTO_ROUTING: {
6902                         struct ip6_rthdr rthdr;
6903
6904                         if (rh_cnt++) {
6905                                 DPFPRINTF(PF_DEBUG_MISC,
6906                                     ("pf: IPv6 more than one rthdr\n"));
6907                                 action = PF_DROP;
6908                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6909                                 log = 1;
6910                                 goto done;
6911                         }
6912                         if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6913                             &reason, pd.af)) {
6914                                 DPFPRINTF(PF_DEBUG_MISC,
6915                                     ("pf: IPv6 short rthdr\n"));
6916                                 action = PF_DROP;
6917                                 REASON_SET(&reason, PFRES_SHORT);
6918                                 log = 1;
6919                                 goto done;
6920                         }
6921                         if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6922                                 DPFPRINTF(PF_DEBUG_MISC,
6923                                     ("pf: IPv6 rthdr0\n"));
6924                                 action = PF_DROP;
6925                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6926                                 log = 1;
6927                                 goto done;
6928                         }
6929                         /* FALLTHROUGH */
6930                 }
6931                 case IPPROTO_AH:
6932                 case IPPROTO_HOPOPTS:
6933                 case IPPROTO_DSTOPTS: {
6934                         /* get next header and header length */
6935                         struct ip6_ext  opt6;
6936
6937                         if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6938                             NULL, &reason, pd.af)) {
6939                                 DPFPRINTF(PF_DEBUG_MISC,
6940                                     ("pf: IPv6 short opt\n"));
6941                                 action = PF_DROP;
6942                                 log = 1;
6943                                 goto done;
6944                         }
6945                         if (pd.proto == IPPROTO_AH)
6946                                 off += (opt6.ip6e_len + 2) * 4;
6947                         else
6948                                 off += (opt6.ip6e_len + 1) * 8;
6949                         pd.proto = opt6.ip6e_nxt;
6950                         /* goto the next header */
6951                         break;
6952                 }
6953                 default:
6954                         terminal++;
6955                         break;
6956                 }
6957         } while (!terminal);
6958
6959         /* if there's no routing header, use unmodified mbuf for checksumming */
6960         if (!n)
6961                 n = m;
6962
6963         switch (pd.proto) {
6964
6965         case IPPROTO_TCP: {
6966                 struct tcphdr   th;
6967
6968                 pd.hdr.tcp = &th;
6969                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6970                     &action, &reason, AF_INET6)) {
6971                         log = action != PF_PASS;
6972                         goto done;
6973                 }
6974                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6975                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6976                 if (action == PF_DROP)
6977                         goto done;
6978                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6979                                            &reason);
6980                 if (action == PF_PASS) {
6981                         r = s->rule.ptr;
6982                         a = s->anchor.ptr;
6983                         log = s->log;
6984                 } else if (s == NULL) {
6985                         action = pf_test_rule(&r, &s, dir, kif,
6986                                               m, off, h, &pd, &a,
6987                                               &ruleset, NULL, inp);
6988                 }
6989                 break;
6990         }
6991
6992         case IPPROTO_UDP: {
6993                 struct udphdr   uh;
6994
6995                 pd.hdr.udp = &uh;
6996                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6997                     &action, &reason, AF_INET6)) {
6998                         log = action != PF_PASS;
6999                         goto done;
7000                 }
7001                 if (uh.uh_dport == 0 ||
7002                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
7003                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
7004                         action = PF_DROP;
7005                         REASON_SET(&reason, PFRES_SHORT);
7006                         goto done;
7007                 }
7008                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
7009                 if (action == PF_PASS) {
7010                         r = s->rule.ptr;
7011                         a = s->anchor.ptr;
7012                         log = s->log;
7013                 } else if (s == NULL) {
7014                         action = pf_test_rule(&r, &s, dir, kif,
7015                                               m, off, h, &pd, &a,
7016                                               &ruleset, NULL, inp);
7017                 }
7018                 break;
7019         }
7020
7021         case IPPROTO_ICMPV6: {
7022                 struct icmp6_hdr        ih;
7023
7024                 pd.hdr.icmp6 = &ih;
7025                 if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
7026                     &action, &reason, AF_INET6)) {
7027                         log = action != PF_PASS;
7028                         goto done;
7029                 }
7030                 action = pf_test_state_icmp(&s, dir, kif,
7031                                             m, off, h, &pd, &reason);
7032                 if (action == PF_PASS) {
7033                         r = s->rule.ptr;
7034                         a = s->anchor.ptr;
7035                         log = s->log;
7036                 } else if (s == NULL) {
7037                         action = pf_test_rule(&r, &s, dir, kif,
7038                                               m, off, h, &pd, &a,
7039                                               &ruleset, NULL, inp);
7040                 }
7041                 break;
7042         }
7043
7044         default:
7045                 action = pf_test_state_other(&s, dir, kif, m, &pd);
7046                 if (action == PF_PASS) {
7047                         r = s->rule.ptr;
7048                         a = s->anchor.ptr;
7049                         log = s->log;
7050                 } else if (s == NULL) {
7051                         action = pf_test_rule(&r, &s, dir, kif, m, off, h,
7052                                               &pd, &a, &ruleset, NULL, inp);
7053                 }
7054                 break;
7055         }
7056
7057 done:
7058         if (n != m) {
7059                 m_freem(n);
7060                 n = NULL;
7061         }
7062
7063         /* handle dangerous IPv6 extension headers. */
7064         if (action == PF_PASS && rh_cnt &&
7065             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
7066                 action = PF_DROP;
7067                 REASON_SET(&reason, PFRES_IPOPTIONS);
7068                 log = 1;
7069                 DPFPRINTF(PF_DEBUG_MISC,
7070                     ("pf: dropping packet with dangerous v6 headers\n"));
7071         }
7072
7073         if ((s && s->tag) || r->rtableid)
7074                 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
7075
7076 #if 0
7077         if (dir == PF_IN && s && s->key[PF_SK_STACK])
7078                 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
7079 #endif
7080
7081 #ifdef ALTQ
7082         /*
7083          * Generate a hash code and qid request for ALTQ.  A qid of 0
7084          * is allowed and will cause altq to select the default queue.
7085          */
7086         if (action == PF_PASS) {
7087                 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
7088                 if (pd.tos & IPTOS_LOWDELAY)
7089                         m->m_pkthdr.pf.qid = r->pqid;
7090                 else
7091                         m->m_pkthdr.pf.qid = r->qid;
7092                 m->m_pkthdr.pf.ecn_af = AF_INET6;
7093                 m->m_pkthdr.pf.hdr = h;
7094                 if (s) {
7095                         /* for fairq */
7096                         m->m_pkthdr.pf.state_hash = s->hash;
7097                         m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
7098                 }
7099         }
7100 #endif /* ALTQ */
7101
7102         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
7103             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
7104             (s->nat_rule.ptr->action == PF_RDR ||
7105             s->nat_rule.ptr->action == PF_BINAT) &&
7106             IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
7107                 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
7108
7109         if (dir == PF_IN && action == PF_PASS && r->divert.port) {
7110                 struct pf_divert *divert;
7111
7112                 if ((divert = pf_get_divert(m))) {
7113                         m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
7114                         divert->port = r->divert.port;
7115                         divert->addr.ipv6 = r->divert.addr.v6;
7116                 }
7117         }
7118
7119         if (log) {
7120                 struct pf_rule *lr;
7121
7122                 if (s != NULL && s->nat_rule.ptr != NULL &&
7123                     s->nat_rule.ptr->log & PF_LOG_ALL)
7124                         lr = s->nat_rule.ptr;
7125                 else
7126                         lr = r;
7127                 PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset,
7128                     &pd);
7129         }
7130
7131         kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
7132         kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
7133
7134         if (action == PF_PASS || r->action == PF_DROP) {
7135                 dirndx = (dir == PF_OUT);
7136                 r->packets[dirndx]++;
7137                 r->bytes[dirndx] += pd.tot_len;
7138                 if (a != NULL) {
7139                         a->packets[dirndx]++;
7140                         a->bytes[dirndx] += pd.tot_len;
7141                 }
7142                 if (s != NULL) {
7143                         if (s->nat_rule.ptr != NULL) {
7144                                 s->nat_rule.ptr->packets[dirndx]++;
7145                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
7146                         }
7147                         if (s->src_node != NULL) {
7148                                 s->src_node->packets[dirndx]++;
7149                                 s->src_node->bytes[dirndx] += pd.tot_len;
7150                         }
7151                         if (s->nat_src_node != NULL) {
7152                                 s->nat_src_node->packets[dirndx]++;
7153                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
7154                         }
7155                         dirndx = (dir == s->direction) ? 0 : 1;
7156                         s->packets[dirndx]++;
7157                         s->bytes[dirndx] += pd.tot_len;
7158                 }
7159                 tr = r;
7160                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
7161                 if (nr != NULL && r == &pf_default_rule)
7162                         tr = nr;
7163                 if (tr->src.addr.type == PF_ADDR_TABLE)
7164                         pfr_update_stats(tr->src.addr.p.tbl,
7165                             (s == NULL) ? pd.src :
7166                             &s->key[(s->direction == PF_IN)]->addr[0],
7167                             pd.af, pd.tot_len, dir == PF_OUT,
7168                             r->action == PF_PASS, tr->src.neg);
7169                 if (tr->dst.addr.type == PF_ADDR_TABLE)
7170                         pfr_update_stats(tr->dst.addr.p.tbl,
7171                             (s == NULL) ? pd.dst :
7172                             &s->key[(s->direction == PF_IN)]->addr[1],
7173                             pd.af, pd.tot_len, dir == PF_OUT,
7174                             r->action == PF_PASS, tr->dst.neg);
7175         }
7176
7177
7178         if (action == PF_SYNPROXY_DROP) {
7179                 m_freem(*m0);
7180                 *m0 = NULL;
7181                 action = PF_PASS;
7182         } else if (r->rt)
7183                 /* pf_route6 can free the mbuf causing *m0 to become NULL */
7184                 pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
7185
7186         return (action);
7187 }
7188 #endif /* INET6 */
7189
/*
 * Congestion check stub: the queue argument is not examined and 0 is
 * always returned.
 */
int
pf_check_congestion(struct ifqueue *ifq)
{
        (void)ifq;      /* intentionally unused */

        return (0);
}