[dragonfly.git] / sys / net / pf / pf.c
/*
 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
 *
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 */

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <sys/endian.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/spinlock.h>

#include <sys/md5.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/netisr2.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp_var.h>
#include <netinet/icmp_var.h>
#include <netinet/if_ether.h>

#include <net/pf/pfvar.h>
#include <net/pf/if_pflog.h>

#include <net/pf/if_pfsync.h>

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <sys/in_cksum.h>
#include <sys/ucred.h>
#include <machine/limits.h>
#include <sys/msgport2.h>
#include <sys/spinlock2.h>
#include <net/netmsg2.h>
#include <net/toeplitz2.h>

extern int ip_optcopy(struct ip *, struct ip *);
extern int debug_pfugidhack;

/*
 * pf_token  - shared lock for cpu-localized operations,
 *	       exclusive lock otherwise.
 *
 * pf_gtoken - exclusive lock used for initialization.
 */
struct lwkt_token pf_token = LWKT_TOKEN_INITIALIZER(pf_token);
struct lwkt_token pf_gtoken = LWKT_TOKEN_INITIALIZER(pf_gtoken);

#define DPFPRINTF(n, x)	if (pf_status.debug >= (n)) kprintf x

#define FAIL(code)	{ error = (code); goto done; }

/*
 * Global variables
 */

/* mask radix tree */
struct radix_node_head	*pf_maskhead;

/* state tables */
struct pf_state_tree	*pf_statetbl;	/* incls one global table */
struct pf_state		**purge_cur;
struct pf_altqqueue	pf_altqs[2];
struct pf_palist	pf_pabuf;
struct pf_altqqueue	*pf_altqs_active;
struct pf_altqqueue	*pf_altqs_inactive;
struct pf_status	pf_status;

u_int32_t		ticket_altqs_active;
u_int32_t		ticket_altqs_inactive;
int			altqs_inactive_open;
u_int32_t		ticket_pabuf;

MD5_CTX			pf_tcp_secret_ctx;
u_char			pf_tcp_secret[16];
int			pf_tcp_secret_init;
int			pf_tcp_iss_off;

struct pf_anchor_stackframe {
	struct pf_ruleset	*rs;
	struct pf_rule		*r;
	struct pf_anchor_node	*parent;
	struct pf_anchor	*child;
} pf_anchor_stack[64];

struct malloc_type *pf_src_tree_pl, *pf_rule_pl, *pf_pooladdr_pl;
struct malloc_type *pf_state_pl, *pf_state_key_pl, *pf_state_item_pl;
struct malloc_type *pf_altq_pl;

void	pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);

void	pf_init_threshold(struct pf_threshold *, u_int32_t,
	    u_int32_t);
void	pf_add_threshold(struct pf_threshold *);
int	pf_check_threshold(struct pf_threshold *);

void	pf_change_ap(struct pf_addr *, u_int16_t *,
	    u_int16_t *, u_int16_t *, struct pf_addr *,
	    u_int16_t, u_int8_t, sa_family_t);
int	pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
	    struct tcphdr *, struct pf_state_peer *);
#ifdef INET6
void	pf_change_a6(struct pf_addr *, u_int16_t *,
	    struct pf_addr *, u_int8_t);
#endif /* INET6 */
void	pf_change_icmp(struct pf_addr *, u_int16_t *,
	    struct pf_addr *, struct pf_addr *, u_int16_t,
	    u_int16_t *, u_int16_t *, u_int16_t *,
	    u_int16_t *, u_int8_t, sa_family_t);
void	pf_send_tcp(const struct pf_rule *, sa_family_t,
	    const struct pf_addr *, const struct pf_addr *,
	    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
	    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
	    u_int16_t, struct ether_header *, struct ifnet *);
void	pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
	    sa_family_t, struct pf_rule *);
struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
	    int, int, struct pfi_kif *,
	    struct pf_addr *, u_int16_t, struct pf_addr *,
	    u_int16_t, int);
struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *,
	    int, int, struct pfi_kif *, struct pf_src_node **,
	    struct pf_state_key **, struct pf_state_key **,
	    struct pf_state_key **, struct pf_state_key **,
	    struct pf_addr *, struct pf_addr *,
	    u_int16_t, u_int16_t);
void	pf_detach_state(struct pf_state *);
int	pf_state_key_setup(struct pf_pdesc *, struct pf_rule *,
	    struct pf_state_key **, struct pf_state_key **,
	    struct pf_state_key **, struct pf_state_key **,
	    struct pf_addr *, struct pf_addr *,
	    u_int16_t, u_int16_t);
void	pf_state_key_detach(struct pf_state *, int);
u_int32_t pf_tcp_iss(struct pf_pdesc *);
int	pf_test_rule(struct pf_rule **, struct pf_state **,
	    int, struct pfi_kif *, struct mbuf *, int,
	    void *, struct pf_pdesc *, struct pf_rule **,
	    struct pf_ruleset **, struct ifqueue *, struct inpcb *);
static __inline int pf_create_state(struct pf_rule *, struct pf_rule *,
	    struct pf_rule *, struct pf_pdesc *,
	    struct pf_src_node *, struct pf_state_key *,
	    struct pf_state_key *, struct pf_state_key *,
	    struct pf_state_key *, struct mbuf *, int,
	    u_int16_t, u_int16_t, int *, struct pfi_kif *,
	    struct pf_state **, int, u_int16_t, u_int16_t,
	    int);
int	pf_test_fragment(struct pf_rule **, int,
	    struct pfi_kif *, struct mbuf *, void *,
	    struct pf_pdesc *, struct pf_rule **,
	    struct pf_ruleset **);
int	pf_tcp_track_full(struct pf_state_peer *,
	    struct pf_state_peer *, struct pf_state **,
	    struct pfi_kif *, struct mbuf *, int,
	    struct pf_pdesc *, u_short *, int *);
int	pf_tcp_track_sloppy(struct pf_state_peer *,
	    struct pf_state_peer *, struct pf_state **,
	    struct pf_pdesc *, u_short *);
int	pf_test_state_tcp(struct pf_state **, int,
	    struct pfi_kif *, struct mbuf *, int,
	    void *, struct pf_pdesc *, u_short *);
int	pf_test_state_udp(struct pf_state **, int,
	    struct pfi_kif *, struct mbuf *, int,
	    void *, struct pf_pdesc *);
int	pf_test_state_icmp(struct pf_state **, int,
	    struct pfi_kif *, struct mbuf *, int,
	    void *, struct pf_pdesc *, u_short *);
int	pf_test_state_other(struct pf_state **, int,
	    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
void	pf_step_into_anchor(int *, struct pf_ruleset **, int,
	    struct pf_rule **, struct pf_rule **, int *);
int	pf_step_out_of_anchor(int *, struct pf_ruleset **,
	    int, struct pf_rule **, struct pf_rule **,
	    int *);
void	pf_hash(struct pf_addr *, struct pf_addr *,
	    struct pf_poolhashkey *, sa_family_t);
int	pf_map_addr(u_int8_t, struct pf_rule *,
	    struct pf_addr *, struct pf_addr *,
	    struct pf_addr *, struct pf_src_node **);
int	pf_get_sport(struct pf_pdesc *,
	    sa_family_t, u_int8_t, struct pf_rule *,
	    struct pf_addr *, struct pf_addr *,
	    u_int16_t, u_int16_t,
	    struct pf_addr *, u_int16_t *,
	    u_int16_t, u_int16_t,
	    struct pf_src_node **);
void	pf_route(struct mbuf **, struct pf_rule *, int,
	    struct ifnet *, struct pf_state *,
	    struct pf_pdesc *);
void	pf_route6(struct mbuf **, struct pf_rule *, int,
	    struct ifnet *, struct pf_state *,
	    struct pf_pdesc *);
u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t,
	    sa_family_t);
u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t,
	    sa_family_t);
u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
	    u_int16_t);
void	pf_set_rt_ifp(struct pf_state *,
	    struct pf_addr *);
int	pf_check_proto_cksum(struct mbuf *, int, int,
	    u_int8_t, sa_family_t);
struct pf_divert *pf_get_divert(struct mbuf *);
void	pf_print_state_parts(struct pf_state *,
	    struct pf_state_key *, struct pf_state_key *);
int	pf_addr_wrap_neq(struct pf_addr_wrap *,
	    struct pf_addr_wrap *);
struct pf_state *pf_find_state(struct pfi_kif *,
	    struct pf_state_key_cmp *, u_int, struct mbuf *);
int	pf_src_connlimit(struct pf_state *);
int	pf_check_congestion(struct ifqueue *);

extern int pf_end_threads;

struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
	{ &pf_state_pl,		PFSTATE_HIWAT },
	{ &pf_src_tree_pl,	PFSNODE_HIWAT },
	{ &pf_frent_pl,		PFFRAG_FRENT_HIWAT },
	{ &pfr_ktable_pl,	PFR_KTABLE_HIWAT },
	{ &pfr_kentry_pl,	PFR_KENTRY_HIWAT }
};

/*
 * If route-to and direction is out we match with no further processing
 *	(rt_kif must be assigned and not equal to the out interface)
 * If reply-to and direction is in we match with no further processing
 *	(rt_kif must be assigned and not equal to the in interface)
 */
#define STATE_LOOKUP(i, k, d, s, m)					\
	do {								\
		s = pf_find_state(i, k, d, m);				\
		if (s == NULL || (s)->timeout == PFTM_PURGE)		\
			return (PF_DROP);				\
		if (d == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != i)					\
			return (PF_PASS);				\
	} while (0)

#define BOUND_IFACE(r, k) \
	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all

#define STATE_INC_COUNTERS(s)						\
	do {								\
		atomic_add_int(&s->rule.ptr->states_cur, 1);		\
		s->rule.ptr->states_tot++;				\
		if (s->anchor.ptr != NULL) {				\
			atomic_add_int(&s->anchor.ptr->states_cur, 1);	\
			s->anchor.ptr->states_tot++;			\
		}							\
		if (s->nat_rule.ptr != NULL) {				\
			atomic_add_int(&s->nat_rule.ptr->states_cur, 1); \
			s->nat_rule.ptr->states_tot++;			\
		}							\
	} while (0)

#define STATE_DEC_COUNTERS(s)						\
	do {								\
		if (s->nat_rule.ptr != NULL)				\
			atomic_add_int(&s->nat_rule.ptr->states_cur, -1); \
		if (s->anchor.ptr != NULL)				\
			atomic_add_int(&s->anchor.ptr->states_cur, -1);	\
		atomic_add_int(&s->rule.ptr->states_cur, -1);		\
	} while (0)

static MALLOC_DEFINE(M_PFSTATEPL, "pfstatepl", "pf state pool list");
static MALLOC_DEFINE(M_PFSRCTREEPL, "pfsrctpl", "pf source tree pool list");
static MALLOC_DEFINE(M_PFSTATEKEYPL, "pfstatekeypl", "pf state key pool list");
static MALLOC_DEFINE(M_PFSTATEITEMPL, "pfstateitempl", "pf state item pool list");

static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
static __inline int pf_state_compare_key(struct pf_state_key *,
			struct pf_state_key *);
static __inline int pf_state_compare_rkey(struct pf_state_key *,
			struct pf_state_key *);
static __inline int pf_state_compare_id(struct pf_state *,
			struct pf_state *);

struct pf_src_tree *tree_src_tracking;
struct pf_state_tree_id *tree_id;
struct pf_state_queue *state_list;
struct pf_counters *pf_counters;

RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
RB_GENERATE(pf_state_rtree, pf_state_key, entry, pf_state_compare_rkey);
RB_GENERATE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);

static __inline int
pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
{
	int diff;

	if (a->rule.ptr > b->rule.ptr)
		return (1);
	if (a->rule.ptr < b->rule.ptr)
		return (-1);
	if ((diff = a->af - b->af) != 0)
		return (diff);
	switch (a->af) {
#ifdef INET
	case AF_INET:
		if (a->addr.addr32[0] > b->addr.addr32[0])
			return (1);
		if (a->addr.addr32[0] < b->addr.addr32[0])
			return (-1);
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		if (a->addr.addr32[3] > b->addr.addr32[3])
			return (1);
		if (a->addr.addr32[3] < b->addr.addr32[3])
			return (-1);
		if (a->addr.addr32[2] > b->addr.addr32[2])
			return (1);
		if (a->addr.addr32[2] < b->addr.addr32[2])
			return (-1);
		if (a->addr.addr32[1] > b->addr.addr32[1])
			return (1);
		if (a->addr.addr32[1] < b->addr.addr32[1])
			return (-1);
		if (a->addr.addr32[0] > b->addr.addr32[0])
			return (1);
		if (a->addr.addr32[0] < b->addr.addr32[0])
			return (-1);
		break;
#endif /* INET6 */
	}
	return (0);
}

u_int32_t
pf_state_hash(struct pf_state_key *sk)
{
	u_int32_t hv = (u_int32_t)(((intptr_t)sk >> 6) ^ ((intptr_t)sk >> 15));
	if (hv == 0)	/* disallow 0 */
		hv = 1;
	return(hv);
}
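
/*
 * Note on pf_state_hash(): state keys are individually kmalloc()d, so the
 * pointer itself carries enough entropy to act as a hash.  Shifting by 6
 * and 15 mixes away the low bits that are constant due to allocator
 * alignment, and 0 is reserved to mean "no hash".  A minimal userland
 * sketch of the same idea (illustrative only, not part of pf):
 */
#if 0
uint32_t
ptr_hash(const void *p)
{
	uint32_t hv = (uint32_t)(((intptr_t)p >> 6) ^ ((intptr_t)p >> 15));

	return (hv == 0 ? 1 : hv);	/* disallow 0, as above */
}
#endif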

#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_second;
}

void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_second, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}
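
/*
 * The threshold above is a fixed-point rate limiter: each event adds
 * PF_THRESHOLD_MULT to the count, and every update first decays the count
 * linearly over the configured window, so integer division stays accurate.
 * A worked example (assuming PF_THRESHOLD_MULT == 1000): a limit of
 * "10/5s" gives threshold->limit = 10000.  If 3 connections arrive and a
 * 4th follows 2 seconds later, the count first decays by
 * 3000 * 2 / 5 = 1200 to 1800, then the new event adds 1000, leaving
 * 2800 -- well under the 10000 cutoff tested by pf_check_threshold().
 */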

int
pf_src_connlimit(struct pf_state *state)
{
	int bad = 0;
	int cpu = mycpu->gd_cpuid;

	atomic_add_int(&state->src_node->conn, 1);
	state->src.tcp_est = 1;
	pf_add_threshold(&state->src_node->conn_rate);

	if (state->rule.ptr->max_src_conn &&
	    state->rule.ptr->max_src_conn <
	    state->src_node->conn) {
		PF_INC_LCOUNTER(LCNT_SRCCONN);
		bad++;
	}

	if (state->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&state->src_node->conn_rate)) {
		PF_INC_LCOUNTER(LCNT_SRCCONNRATE);
		bad++;
	}

	if (!bad)
		return 0;

	if (state->rule.ptr->overload_tbl) {
		struct pfr_addr p;
		u_int32_t killed = 0;

		PF_INC_LCOUNTER(LCNT_OVERLOAD_TABLE);
		if (pf_status.debug >= PF_DEBUG_MISC) {
			kprintf("pf_src_connlimit: blocking address ");
			pf_print_host(&state->src_node->addr, 0,
			    state->key[PF_SK_WIRE]->af);
		}

		bzero(&p, sizeof(p));
		p.pfra_af = state->key[PF_SK_WIRE]->af;
		switch (state->key[PF_SK_WIRE]->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = state->src_node->addr.v4;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = state->src_node->addr.v6;
			break;
#endif /* INET6 */
		}

		pfr_insert_kentry(state->rule.ptr->overload_tbl,
		    &p, time_second);

		/* kill existing states if that's required. */
		if (state->rule.ptr->flush) {
			struct pf_state_key *sk;
			struct pf_state *st;

			PF_INC_LCOUNTER(LCNT_OVERLOAD_FLUSH);
			RB_FOREACH(st, pf_state_tree_id, &tree_id[cpu]) {
				sk = st->key[PF_SK_WIRE];
				/*
				 * Kill states from this source.  (Only those
				 * from the same rule if PF_FLUSH_GLOBAL is not
				 * set).  (Only on current cpu).
				 */
				if (sk->af ==
				    state->key[PF_SK_WIRE]->af &&
				    ((state->direction == PF_OUT &&
				    PF_AEQ(&state->src_node->addr,
					&sk->addr[0], sk->af)) ||
				    (state->direction == PF_IN &&
				    PF_AEQ(&state->src_node->addr,
					&sk->addr[1], sk->af))) &&
				    (state->rule.ptr->flush &
				    PF_FLUSH_GLOBAL ||
				    state->rule.ptr == st->rule.ptr)) {
					st->timeout = PFTM_PURGE;
					st->src.state = st->dst.state =
					    TCPS_CLOSED;
					killed++;
				}
			}
			if (pf_status.debug >= PF_DEBUG_MISC)
				kprintf(", %u states killed", killed);
		}
		if (pf_status.debug >= PF_DEBUG_MISC)
			kprintf("\n");
	}

	/* kill this state */
	state->timeout = PFTM_PURGE;
	state->src.state = state->dst.state = TCPS_CLOSED;

	return 1;
}
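
/*
 * The overload table above pairs with rulesets of the following shape
 * (an illustrative pf.conf sketch; table and interface names are made up):
 *
 *	table <bruteforce> persist
 *	block quick from <bruteforce>
 *	pass in on egress proto tcp to port ssh keep state \
 *	    (max-src-conn 10, max-src-conn-rate 5/30, \
 *	     overload <bruteforce> flush global)
 *
 * Offending sources are added to <bruteforce> by pfr_insert_kentry()
 * above, and "flush global" sets PF_FLUSH_GLOBAL so their states are
 * killed regardless of which rule created them.
 */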

int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{
	struct pf_src_node k;
	int cpu = mycpu->gd_cpuid;

	bzero(&k, sizeof(k));	/* avoid gcc warnings */
	if (*sn == NULL) {
		k.af = af;
		PF_ACPY(&k.addr, src, af);
		if (rule->rule_flag & PFRULE_RULESRCTRACK ||
		    rule->rpool.opts & PF_POOL_STICKYADDR)
			k.rule.ptr = rule;
		else
			k.rule.ptr = NULL;
		PF_INC_SCOUNTER(SCNT_SRC_NODE_SEARCH);
		*sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
	}
	if (*sn == NULL) {
		if (!rule->max_src_nodes ||
		    rule->src_nodes < rule->max_src_nodes)
			(*sn) = kmalloc(sizeof(struct pf_src_node),
					M_PFSRCTREEPL, M_NOWAIT|M_ZERO);
		else
			PF_INC_LCOUNTER(LCNT_SRCNODES);
		if ((*sn) == NULL)
			return (-1);

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		if (rule->rule_flag & PFRULE_RULESRCTRACK ||
		    rule->rpool.opts & PF_POOL_STICKYADDR)
			(*sn)->rule.ptr = rule;
		else
			(*sn)->rule.ptr = NULL;
		PF_ACPY(&(*sn)->addr, src, af);
		if (RB_INSERT(pf_src_tree,
		    &tree_src_tracking[cpu], *sn) != NULL) {
			if (pf_status.debug >= PF_DEBUG_MISC) {
				kprintf("pf: src_tree insert failed: ");
				pf_print_host(&(*sn)->addr, 0, af);
				kprintf("\n");
			}
			kfree(*sn, M_PFSRCTREEPL);
			return (-1);
		}

		/*
		 * Atomic op required to increment src_nodes in the rule
		 * because we hold a shared token here (decrements will use
		 * an exclusive token).
		 */
		(*sn)->creation = time_second;
		(*sn)->ruletype = rule->action;
		if ((*sn)->rule.ptr != NULL)
			atomic_add_int(&(*sn)->rule.ptr->src_nodes, 1);
		PF_INC_SCOUNTER(SCNT_SRC_NODE_INSERT);
		atomic_add_int(&pf_status.src_nodes, 1);
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			PF_INC_LCOUNTER(LCNT_SRCSTATES);
			return (-1);
		}
	}
	return (0);
}

/*
 * state table (indexed by the pf_state_key structure), normal RBTREE
 * comparison.
 */
static __inline int
pf_state_compare_key(struct pf_state_key *a, struct pf_state_key *b)
{
	int diff;

	if ((diff = a->proto - b->proto) != 0)
		return (diff);
	if ((diff = a->af - b->af) != 0)
		return (diff);
	switch (a->af) {
#ifdef INET
	case AF_INET:
		if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
			return (1);
		if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
			return (-1);
		if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
			return (1);
		if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
			return (-1);
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		if (a->addr[0].addr32[3] > b->addr[0].addr32[3])
			return (1);
		if (a->addr[0].addr32[3] < b->addr[0].addr32[3])
			return (-1);
		if (a->addr[1].addr32[3] > b->addr[1].addr32[3])
			return (1);
		if (a->addr[1].addr32[3] < b->addr[1].addr32[3])
			return (-1);
		if (a->addr[0].addr32[2] > b->addr[0].addr32[2])
			return (1);
		if (a->addr[0].addr32[2] < b->addr[0].addr32[2])
			return (-1);
		if (a->addr[1].addr32[2] > b->addr[1].addr32[2])
			return (1);
		if (a->addr[1].addr32[2] < b->addr[1].addr32[2])
			return (-1);
		if (a->addr[0].addr32[1] > b->addr[0].addr32[1])
			return (1);
		if (a->addr[0].addr32[1] < b->addr[0].addr32[1])
			return (-1);
		if (a->addr[1].addr32[1] > b->addr[1].addr32[1])
			return (1);
		if (a->addr[1].addr32[1] < b->addr[1].addr32[1])
			return (-1);
		if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
			return (1);
		if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
			return (-1);
		if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
			return (1);
		if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
			return (-1);
		break;
#endif /* INET6 */
	}

	if ((diff = a->port[0] - b->port[0]) != 0)
		return (diff);
	if ((diff = a->port[1] - b->port[1]) != 0)
		return (diff);

	return (0);
}
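
/*
 * Note on the AF_INET6 comparison order above: it starts at addr32[3],
 * the low-order word of the address.  Hosts on the same network typically
 * differ only in the interface-identifier bits, so testing the most
 * variable words first usually distinguishes unequal keys on the very
 * first compare.
 */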

/*
 * Used for RB_FIND only, compare in the reverse direction.  The
 * element to be reversed is always (a), since we obviously can't
 * reverse the state tree depicted by (b).
 */
static __inline int
pf_state_compare_rkey(struct pf_state_key *a, struct pf_state_key *b)
{
	int diff;

	if ((diff = a->proto - b->proto) != 0)
		return (diff);
	if ((diff = a->af - b->af) != 0)
		return (diff);
	switch (a->af) {
#ifdef INET
	case AF_INET:
		if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
			return (1);
		if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
			return (-1);
		if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
			return (1);
		if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
			return (-1);
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		if (a->addr[1].addr32[3] > b->addr[0].addr32[3])
			return (1);
		if (a->addr[1].addr32[3] < b->addr[0].addr32[3])
			return (-1);
		if (a->addr[0].addr32[3] > b->addr[1].addr32[3])
			return (1);
		if (a->addr[0].addr32[3] < b->addr[1].addr32[3])
			return (-1);
		if (a->addr[1].addr32[2] > b->addr[0].addr32[2])
			return (1);
		if (a->addr[1].addr32[2] < b->addr[0].addr32[2])
			return (-1);
		if (a->addr[0].addr32[2] > b->addr[1].addr32[2])
			return (1);
		if (a->addr[0].addr32[2] < b->addr[1].addr32[2])
			return (-1);
		if (a->addr[1].addr32[1] > b->addr[0].addr32[1])
			return (1);
		if (a->addr[1].addr32[1] < b->addr[0].addr32[1])
			return (-1);
		if (a->addr[0].addr32[1] > b->addr[1].addr32[1])
			return (1);
		if (a->addr[0].addr32[1] < b->addr[1].addr32[1])
			return (-1);
		if (a->addr[1].addr32[0] > b->addr[0].addr32[0])
			return (1);
		if (a->addr[1].addr32[0] < b->addr[0].addr32[0])
			return (-1);
		if (a->addr[0].addr32[0] > b->addr[1].addr32[0])
			return (1);
		if (a->addr[0].addr32[0] < b->addr[1].addr32[0])
			return (-1);
		break;
#endif /* INET6 */
	}

	if ((diff = a->port[1] - b->port[0]) != 0)
		return (diff);
	if ((diff = a->port[0] - b->port[1]) != 0)
		return (diff);

	return (0);
}

static __inline int
pf_state_compare_id(struct pf_state *a, struct pf_state *b)
{
	if (a->id > b->id)
		return (1);
	if (a->id < b->id)
		return (-1);
	if (a->creatorid > b->creatorid)
		return (1);
	if (a->creatorid < b->creatorid)
		return (-1);

	return (0);
}

int
pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
{
	struct pf_state_item *si;
	struct pf_state_key *cur;
	int cpu;
	int error;

	/*
	 * PFSTATE_STACK_GLOBAL is set when the state might not hash to the
	 * current cpu.  The keys are managed on the global statetbl tree
	 * for this case.  Only translations (RDR, NAT) can cause this.
	 *
	 * When this flag is not set we must still check the global statetbl
	 * for a collision, and if we find one we set the HALF_DUPLEX flag
	 * in the state.
	 */
	if (s->state_flags & PFSTATE_STACK_GLOBAL) {
		cpu = ncpus;
		lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
	} else {
		cpu = mycpu->gd_cpuid;
		lockmgr(&pf_global_statetbl_lock, LK_SHARED);
	}
	KKASSERT(s->key[idx] == NULL);	/* XXX handle this? */

	if (pf_status.debug >= PF_DEBUG_MISC) {
		kprintf("state_key attach cpu %d (%08x:%d) %s (%08x:%d)\n",
		    cpu,
		    ntohl(sk->addr[0].addr32[0]), ntohs(sk->port[0]),
		    (idx == PF_SK_WIRE ? "->" : "<-"),
		    ntohl(sk->addr[1].addr32[0]), ntohs(sk->port[1]));
	}

	/*
	 * Check whether (e.g.) a PASS rule being put on a per-cpu tree
	 * collides with a translation rule on the global tree.  This is
	 * NOT an error.  We *WANT* to establish state for this case so the
	 * packet path is short-circuited and doesn't need to scan the
	 * ruleset on every packet.  But the established state will only see
	 * one side of a two-way packet conversation.  To prevent this from
	 * causing problems (e.g. generating a RST), we force PFSTATE_SLOPPY
	 * to be set on the established state.
	 *
	 * A collision against RDR state can only occur with a PASS IN in the
	 * opposite direction or a PASS OUT in the forwards direction.  This
	 * is because RDRs are processed on the input side.
	 *
	 * A collision against NAT state can only occur with a PASS IN in the
	 * forwards direction or a PASS OUT in the opposite direction.  This
	 * is because NATs are processed on the output side.
	 *
	 * In both situations we need to do a reverse addr/port test because
	 * the PASS IN or PASS OUT only establishes if it doesn't match the
	 * established RDR state in the forwards direction.  The direction
	 * flag has to be ignored (it will be one way for a PASS IN and the
	 * other way for a PASS OUT).
	 *
	 * pf_global_statetbl_lock will be locked shared when testing and
	 * not entering into the global state table.
	 */
	if (cpu != ncpus &&
	    (cur = RB_FIND(pf_state_rtree,
			   (struct pf_state_rtree *)&pf_statetbl[ncpus],
			   sk)) != NULL) {
		TAILQ_FOREACH(si, &cur->states, entry) {
			/*
			 * NOTE: We must ignore direction mismatches.
			 */
			if (si->s->kif == s->kif) {
				s->state_flags |= PFSTATE_HALF_DUPLEX |
						  PFSTATE_SLOPPY;
				if (pf_status.debug >= PF_DEBUG_MISC) {
					kprintf(
					    "pf: %s key attach collision "
					    "on %s: ",
					    (idx == PF_SK_WIRE) ?
						"wire" : "stack",
					    s->kif->pfik_name);
					pf_print_state_parts(s,
					    (idx == PF_SK_WIRE) ? sk : NULL,
					    (idx == PF_SK_STACK) ? sk : NULL);
					kprintf("\n");
				}
				break;
			}
		}
	}

	/*
	 * Enter into either the per-cpu or the global state table.
	 *
	 * pf_global_statetbl_lock will be locked exclusively when entering
	 * into the global state table.
	 */
	if ((cur = RB_INSERT(pf_state_tree, &pf_statetbl[cpu], sk)) != NULL) {
		/* key exists. check for same kif, if none, add to key */
		TAILQ_FOREACH(si, &cur->states, entry) {
			if (si->s->kif == s->kif &&
			    si->s->direction == s->direction) {
				if (pf_status.debug >= PF_DEBUG_MISC) {
					kprintf(
					    "pf: %s key attach failed on %s: ",
					    (idx == PF_SK_WIRE) ?
						"wire" : "stack",
					    s->kif->pfik_name);
					pf_print_state_parts(s,
					    (idx == PF_SK_WIRE) ? sk : NULL,
					    (idx == PF_SK_STACK) ? sk : NULL);
					kprintf("\n");
				}
				kfree(sk, M_PFSTATEKEYPL);
				error = -1;
				goto failed;	/* collision! */
			}
		}
		kfree(sk, M_PFSTATEKEYPL);

		s->key[idx] = cur;
	} else {
		s->key[idx] = sk;
	}

	if ((si = kmalloc(sizeof(struct pf_state_item),
			  M_PFSTATEITEMPL, M_NOWAIT)) == NULL) {
		pf_state_key_detach(s, idx);
		error = -1;
		goto failed;	/* collision! */
	}
	si->s = s;

	/* list is sorted, if-bound states before floating */
	if (s->kif == pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states, si, entry);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states, si, entry);

	error = 0;
failed:
	lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	return error;
}
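
/*
 * Example of the half-duplex case handled above (an illustrative
 * scenario): an RDR rule translating 1.2.3.4:80 -> 10.0.0.4:80 keeps its
 * keys on the global table.  A "pass out" rule running on another cpu may
 * later create state for the 10.0.0.4:80 leg of the very same
 * conversation; its key collides in the reverse direction only, so the
 * state is kept but flagged PFSTATE_HALF_DUPLEX | PFSTATE_SLOPPY to
 * disable strict sequence-window tracking on the side it cannot see.
 */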

/*
 * NOTE: Can only be called indirectly via the purge thread with pf_token
 *	 exclusively locked.
 */
void
pf_detach_state(struct pf_state *s)
{
	if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
		s->key[PF_SK_WIRE] = NULL;

	if (s->key[PF_SK_STACK] != NULL)
		pf_state_key_detach(s, PF_SK_STACK);

	if (s->key[PF_SK_WIRE] != NULL)
		pf_state_key_detach(s, PF_SK_WIRE);
}

/*
 * NOTE: Can only be called indirectly via the purge thread with pf_token
 *	 exclusively locked.
 */
void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_item *si;
	int cpu;

	/*
	 * PFSTATE_STACK_GLOBAL is set for translations when the translated
	 * address/port is not localized to the same cpu that the untranslated
	 * address/port is on.  The wire pf_state_key is managed on the global
	 * statetbl tree for this case.
	 */
	if (s->state_flags & PFSTATE_STACK_GLOBAL) {
		cpu = ncpus;
		lockmgr(&pf_global_statetbl_lock, LK_EXCLUSIVE);
	} else {
		cpu = mycpu->gd_cpuid;
	}

	si = TAILQ_FIRST(&s->key[idx]->states);
	while (si && si->s != s)
		si = TAILQ_NEXT(si, entry);

	if (si) {
		TAILQ_REMOVE(&s->key[idx]->states, si, entry);
		kfree(si, M_PFSTATEITEMPL);
	}

	if (TAILQ_EMPTY(&s->key[idx]->states)) {
		RB_REMOVE(pf_state_tree, &pf_statetbl[cpu], s->key[idx]);
		if (s->key[idx]->reverse)
			s->key[idx]->reverse->reverse = NULL;
		if (s->key[idx]->inp)
			s->key[idx]->inp->inp_pf_sk = NULL;
		kfree(s->key[idx], M_PFSTATEKEYPL);
	}
	s->key[idx] = NULL;

	if (s->state_flags & PFSTATE_STACK_GLOBAL)
		lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
}

struct pf_state_key *
pf_alloc_state_key(int pool_flags)
{
	struct pf_state_key *sk;

	sk = kmalloc(sizeof(struct pf_state_key), M_PFSTATEKEYPL, pool_flags);
	if (sk) {
		TAILQ_INIT(&sk->states);
	}
	return (sk);
}

int
pf_state_key_setup(struct pf_pdesc *pd, struct pf_rule *nr,
    struct pf_state_key **skw, struct pf_state_key **sks,
    struct pf_state_key **skp, struct pf_state_key **nkp,
    struct pf_addr *saddr, struct pf_addr *daddr,
    u_int16_t sport, u_int16_t dport)
{
	KKASSERT((*skp == NULL && *nkp == NULL));

	if ((*skp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
		return (ENOMEM);

	PF_ACPY(&(*skp)->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&(*skp)->addr[pd->didx], daddr, pd->af);
	(*skp)->port[pd->sidx] = sport;
	(*skp)->port[pd->didx] = dport;
	(*skp)->proto = pd->proto;
	(*skp)->af = pd->af;

	if (nr != NULL) {
		if ((*nkp = pf_alloc_state_key(M_NOWAIT | M_ZERO)) == NULL)
			return (ENOMEM);	/* caller must handle cleanup */

		/* XXX maybe just bcopy and TAILQ_INIT(&(*nkp)->states) */
		PF_ACPY(&(*nkp)->addr[0], &(*skp)->addr[0], pd->af);
		PF_ACPY(&(*nkp)->addr[1], &(*skp)->addr[1], pd->af);
		(*nkp)->port[0] = (*skp)->port[0];
		(*nkp)->port[1] = (*skp)->port[1];
		(*nkp)->proto = pd->proto;
		(*nkp)->af = pd->af;
	} else {
		*nkp = *skp;
	}

	if (pd->dir == PF_IN) {
		*skw = *skp;
		*sks = *nkp;
	} else {
		*sks = *skp;
		*skw = *nkp;
	}
	return (0);
}
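
/*
 * A sketch of how the four key pointers returned above relate
 * (hypothetical caller; variable names are illustrative):
 */
#if 0
	struct pf_state_key *skw = NULL, *sks = NULL, *sk = NULL, *nk = NULL;

	if (pf_state_key_setup(pd, nr, &skw, &sks, &sk, &nk,
	    saddr, daddr, sport, dport) == 0) {
		/*
		 * sk always holds the pre-translation tuple.  nk aliases sk
		 * when no translation rule applies (nr == NULL); otherwise
		 * it is a separate copy the caller rewrites with the
		 * translated addresses/ports.  For PF_IN the wire key is
		 * the original tuple (skw == sk) and the stack key the
		 * translated one (sks == nk); PF_OUT is the mirror image.
		 */
	}
#endif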

/*
 * Insert pf_state with one or two state keys (allowing a reverse path lookup
 * which is used by NAT).  In the NAT case skw is the initiator (?) and
 * sks is the target.
 */
int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	int cpu = mycpu->gd_cpuid;

	s->kif = kif;
	s->cpuid = cpu;

	if (skw == sks) {
		if (pf_state_key_attach(skw, s, PF_SK_WIRE))
			return (-1);
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
	} else {
		/*
		skw->reverse = sks;
		sks->reverse = skw;
		*/
		if (pf_state_key_attach(skw, s, PF_SK_WIRE)) {
			kfree(sks, M_PFSTATEKEYPL);
			return (-1);
		}
		if (pf_state_key_attach(sks, s, PF_SK_STACK)) {
			pf_state_key_detach(s, PF_SK_WIRE);
			return (-1);
		}
	}

	if (s->id == 0 && s->creatorid == 0) {
		u_int64_t sid;

		sid = atomic_fetchadd_long(&pf_status.stateid, 1);
		s->id = htobe64(sid);
		s->creatorid = pf_status.hostid;
	}

	/*
	 * Calculate hash code for altq
	 */
	s->hash = crc32(s->key[PF_SK_WIRE], PF_STATE_KEY_HASH_LENGTH);

	if (RB_INSERT(pf_state_tree_id, &tree_id[cpu], s) != NULL) {
		if (pf_status.debug >= PF_DEBUG_MISC) {
			kprintf("pf: state insert failed: "
			    "id: %016jx creatorid: %08x",
			    (uintmax_t)be64toh(s->id), ntohl(s->creatorid));
			if (s->sync_flags & PFSTATE_FROMSYNC)
				kprintf(" (from sync)");
			kprintf("\n");
		}
		pf_detach_state(s);
		return (-1);
	}
	TAILQ_INSERT_TAIL(&state_list[cpu], s, entry_list);
	PF_INC_FCOUNTER(FCNT_STATE_INSERT);
	atomic_add_int(&pf_status.states, 1);
	pfi_kif_ref(kif, PFI_KIF_REF_STATE);
	pfsync_insert_state(s);
	return (0);
}

struct pf_state *
pf_find_state_byid(struct pf_state_cmp *key)
{
	int cpu = mycpu->gd_cpuid;

	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);

	return (RB_FIND(pf_state_tree_id, &tree_id[cpu],
	    (struct pf_state *)key));
}
/*
 * WARNING! May return a state structure that was localized to another cpu,
 *	    destruction is typically protected by the caller's pf_token.
 *	    The element can only be destroyed by the purge thread.
 */
struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir,
    struct mbuf *m)
{
	struct pf_state_key *skey = (void *)key;
	struct pf_state_key *sk;
	struct pf_state_item *si;
	struct pf_state *s;
	int cpu = mycpu->gd_cpuid;
	int globalstl = 0;

	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);

	if (dir == PF_OUT && m->m_pkthdr.pf.statekey &&
	    ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse) {
		sk = ((struct pf_state_key *)m->m_pkthdr.pf.statekey)->reverse;
	} else {
		sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
		if (sk == NULL) {
			lockmgr(&pf_global_statetbl_lock, LK_SHARED);
			sk = RB_FIND(pf_state_tree, &pf_statetbl[ncpus], skey);
			if (sk == NULL) {
				lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
				return (NULL);
			}
			globalstl = 1;
		}
		if (dir == PF_OUT && m->m_pkthdr.pf.statekey) {
			((struct pf_state_key *)
			    m->m_pkthdr.pf.statekey)->reverse = sk;
			sk->reverse = m->m_pkthdr.pf.statekey;
		}
	}
	if (dir == PF_OUT)
		m->m_pkthdr.pf.statekey = NULL;

	/* list is sorted, if-bound states before floating ones */
	TAILQ_FOREACH(si, &sk->states, entry) {
		if ((si->s->kif == pfi_all || si->s->kif == kif) &&
		    sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
					  si->s->key[PF_SK_STACK])) {
			break;
		}
	}

	/*
	 * Extract state before potentially releasing the global statetbl
	 * lock.  Ignore the state if the create is still in-progress as
	 * it can be deleted out from under us by the owning localized cpu.
	 * However, if CREATEINPROG is not set, state can only be deleted
	 * by the purge thread which we are protected from via our shared
	 * pf_token.
	 */
	if (si) {
		s = si->s;
		if (s && (s->state_flags & PFSTATE_CREATEINPROG))
			s = NULL;
	} else {
		s = NULL;
	}
	if (globalstl)
		lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	return s;
}
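
/*
 * The m_pkthdr.pf.statekey test above is a forwarding fast path: when a
 * packet was already matched on input, its mbuf carries the wire-side
 * key, and the cached ->reverse pointer resolves the output-side key
 * without an RB lookup.  The two keys are cross-linked on first use, so
 * subsequent packets of the same connection skip the tree entirely.
 */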

/*
 * WARNING! May return a state structure that was localized to another cpu,
 *	    destruction is typically protected by the caller's pf_token.
 */
struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_state_key *skey = (void *)key;
	struct pf_state_key *sk;
	struct pf_state_item *si, *ret = NULL;
	struct pf_state *s;
	int cpu = mycpu->gd_cpuid;
	int globalstl = 0;

	PF_INC_FCOUNTER(FCNT_STATE_SEARCH);

	sk = RB_FIND(pf_state_tree, &pf_statetbl[cpu], skey);
	if (sk == NULL) {
		lockmgr(&pf_global_statetbl_lock, LK_SHARED);
		sk = RB_FIND(pf_state_tree, &pf_statetbl[ncpus], skey);
		globalstl = 1;
	}
	if (sk != NULL) {
		TAILQ_FOREACH(si, &sk->states, entry)
			if (dir == PF_INOUT ||
			    (sk == (dir == PF_IN ? si->s->key[PF_SK_WIRE] :
						   si->s->key[PF_SK_STACK]))) {
				if (more == NULL) {
					ret = si;
					break;
				}
				if (ret)
					(*more)++;
				else
					ret = si;
			}
	}

	/*
	 * Extract state before potentially releasing the global statetbl
	 * lock.  Ignore the state if the create is still in-progress as
	 * it can be deleted out from under us by the owning localized cpu.
	 * However, if CREATEINPROG is not set, state can only be deleted
	 * by the purge thread which we are protected from via our shared
	 * pf_token.
	 */
	if (ret) {
		s = ret->s;
		if (s && (s->state_flags & PFSTATE_CREATEINPROG))
			s = NULL;
	} else {
		s = NULL;
	}
	if (globalstl)
		lockmgr(&pf_global_statetbl_lock, LK_RELEASE);
	return s;
}

/* END state table stuff */

void
pf_purge_thread(void *v)
{
	globaldata_t save_gd = mycpu;
	int nloops = 0;
	int locked = 0;
	int nn;
	int endingit;

	for (;;) {
		tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);

		endingit = pf_end_threads;

		for (nn = 0; nn < ncpus; ++nn) {
			lwkt_setcpu_self(globaldata_find(nn));

			lwkt_gettoken(&pf_token);
			lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
			crit_enter();

			/*
			 * process a fraction of the state table every second
			 */
			if (!pf_purge_expired_states(
				1 + (pf_status.states /
				     pf_default_rule.timeout[
					PFTM_INTERVAL]), 0)) {
				pf_purge_expired_states(
				    1 + (pf_status.states /
					 pf_default_rule.timeout[
					    PFTM_INTERVAL]), 1);
			}

			/*
			 * purge other expired types every PFTM_INTERVAL
			 * seconds
			 */
			if (++nloops >=
			    pf_default_rule.timeout[PFTM_INTERVAL]) {
				pf_purge_expired_fragments();
				if (!pf_purge_expired_src_nodes(locked)) {
					pf_purge_expired_src_nodes(1);
				}
				nloops = 0;
			}

			/*
			 * If terminating the thread, clean everything out
			 * (on all cpus).
			 */
			if (endingit) {
				pf_purge_expired_states(pf_status.states, 0);
				pf_purge_expired_fragments();
				pf_purge_expired_src_nodes(1);
			}

			crit_exit();
			lockmgr(&pf_consistency_lock, LK_RELEASE);
			lwkt_reltoken(&pf_token);
		}
		lwkt_setcpu_self(save_gd);
		if (endingit)
			break;
	}

	/*
	 * Thread termination
	 */
	pf_end_threads++;
	wakeup(pf_purge_thread);
	kthread_exit();
}
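
/*
 * Example of the purge pacing above (assuming the usual PFTM_INTERVAL
 * default of 10 seconds): with pf_status.states == 10000, each one-second
 * pass checks 1 + 10000/10 = 1001 states per cpu, so the whole table is
 * swept roughly once per interval instead of stalling in a single pass.
 */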

u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t timeout;
	u_int32_t start;
	u_int32_t end;
	u_int32_t states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_second);
	if (state->timeout == PFTM_UNTIL_PACKET)
		return (0);
	KKASSERT(state->timeout != PFTM_UNLINKED);
	KKASSERT(state->timeout < PFTM_MAX);
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = state->rule.ptr->states_cur;
	} else {
		start = pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = pf_status.states;
	}

	/*
	 * If the number of states exceeds allowed values, adaptively
	 * timeout the state more quickly.  This can be very dangerous
	 * to legitimate connections, however, so defray the timeout
	 * based on the packet count.
	 *
	 * Retain from 0-100% based on number of states.
	 *
	 * Recover up to 50% of the lost portion if there was
	 * packet traffic (100 pkts = 50%).
	 */
	if (end && states > start && start < end) {
		u_int32_t n;	/* timeout retention 0-100% */
		u_int64_t pkts;
#if 0
		static struct krate boorate = { .freq = 1 };
#endif

		/*
		 * Reduce timeout by n% (0-100)
		 */
		n = (states - start) * 100 / (end - start);
		if (n > 100)
			n = 0;
		else
			n = 100 - n;

		/*
		 * But claw back some of the reduction based on packet
		 * count associated with the state.
		 */
		pkts = state->packets[0] + state->packets[1];
		if (pkts > 100)
			pkts = 100;
#if 0
		krateprintf(&boorate, "timeout %-4u n=%u pkts=%-3lu -> %lu\n",
		    timeout, n, pkts, n + (100 - n) * pkts / 200);
#endif

		n += (100 - n) * pkts / 200;	/* recover by up-to 50% */
		timeout = timeout * n / 100;
	}
	return (state->expire + timeout);
}
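
/*
 * Worked example of the adaptive scaling above: with adaptive.start 6000,
 * adaptive.end 12000 and 9000 states, n = (9000 - 6000) * 100 / (12000 -
 * 6000) = 50, so retention starts at 100 - 50 = 50%.  A state that has
 * seen 40 packets claws back (100 - 50) * 40 / 200 = 10 points, leaving
 * 60% of the configured timeout.
 */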

/*
 * (called with exclusive pf_token)
 */
int
pf_purge_expired_src_nodes(int waslocked)
{
	struct pf_src_node *cur, *next;
	int locked = waslocked;
	int cpu = mycpu->gd_cpuid;

	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking[cpu]);
	     cur;
	     cur = next) {
		next = RB_NEXT(pf_src_tree, &tree_src_tracking[cpu], cur);

		if (cur->states <= 0 && cur->expire <= time_second) {
			if (!locked) {
				lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
				next = RB_NEXT(pf_src_tree,
				    &tree_src_tracking[cpu], cur);
				locked = 1;
			}
			if (cur->rule.ptr != NULL) {
				/*
				 * decrements in rule should be ok, token is
				 * held exclusively in this code path.
				 */
				atomic_add_int(&cur->rule.ptr->src_nodes, -1);
				if (cur->rule.ptr->states_cur <= 0 &&
				    cur->rule.ptr->max_src_nodes <= 0)
					pf_rm_rule(NULL, cur->rule.ptr);
			}
			RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], cur);
			PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
			atomic_add_int(&pf_status.src_nodes, -1);
			kfree(cur, M_PFSRCTREEPL);
		}
	}
	if (locked && !waslocked)
		lockmgr(&pf_consistency_lock, LK_RELEASE);
	return(1);
}

void
pf_src_tree_remove_state(struct pf_state *s)
{
	u_int32_t timeout;

	if (s->src_node != NULL) {
		if (s->src.tcp_est)
			atomic_add_int(&s->src_node->conn, -1);
		if (--s->src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout) {
				timeout =
				    pf_default_rule.timeout[PFTM_SRC_NODE];
			}
			s->src_node->expire = time_second + timeout;
		}
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		if (--s->nat_src_node->states <= 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    pf_default_rule.timeout[PFTM_SRC_NODE];
			s->nat_src_node->expire = time_second + timeout;
		}
	}
	s->src_node = s->nat_src_node = NULL;
}

/* callers should be at crit_enter() */
void
pf_unlink_state(struct pf_state *cur)
{
	int cpu = mycpu->gd_cpuid;

	if (cur->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(cur->rule.ptr, cur->key[PF_SK_WIRE]->af,
		    &cur->key[PF_SK_WIRE]->addr[1],
		    &cur->key[PF_SK_WIRE]->addr[0],
		    cur->key[PF_SK_WIRE]->port[1],
		    cur->key[PF_SK_WIRE]->port[0],
		    cur->src.seqhi, cur->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
	}
	RB_REMOVE(pf_state_tree_id, &tree_id[cpu], cur);
	if (cur->creatorid == pf_status.hostid)
		pfsync_delete_state(cur);
	cur->timeout = PFTM_UNLINKED;
	pf_src_tree_remove_state(cur);
	pf_detach_state(cur);
}

/*
 * callers should be at crit_enter() and hold pf_consistency_lock exclusively.
 * pf_token must also be held exclusively.
 */
void
pf_free_state(struct pf_state *cur)
{
	int cpu = mycpu->gd_cpuid;

	KKASSERT(cur->cpuid == cpu);

	if (pfsyncif != NULL &&
	    (pfsyncif->sc_bulk_send_next == cur ||
	     pfsyncif->sc_bulk_terminator == cur))
		return;
	KKASSERT(cur->timeout == PFTM_UNLINKED);
	/*
	 * decrements in rule should be ok, token is
	 * held exclusively in this code path.
	 */
	if (--cur->rule.ptr->states_cur <= 0 &&
	    cur->rule.ptr->src_nodes <= 0)
		pf_rm_rule(NULL, cur->rule.ptr);
	if (cur->nat_rule.ptr != NULL) {
		if (--cur->nat_rule.ptr->states_cur <= 0 &&
		    cur->nat_rule.ptr->src_nodes <= 0) {
			pf_rm_rule(NULL, cur->nat_rule.ptr);
		}
	}
	if (cur->anchor.ptr != NULL) {
		if (--cur->anchor.ptr->states_cur <= 0)
			pf_rm_rule(NULL, cur->anchor.ptr);
	}
	pf_normalize_tcp_cleanup(cur);
	pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);

	/*
	 * We may be freeing pf_purge_expired_states()'s saved scan entry,
	 * adjust it if necessary.
	 */
	if (purge_cur[cpu] == cur) {
		kprintf("PURGE CONFLICT\n");
		purge_cur[cpu] = TAILQ_NEXT(purge_cur[cpu], entry_list);
	}
	TAILQ_REMOVE(&state_list[cpu], cur, entry_list);
	if (cur->tag)
		pf_tag_unref(cur->tag);
	kfree(cur, M_PFSTATEPL);
	PF_INC_FCOUNTER(FCNT_STATE_REMOVALS);
	atomic_add_int(&pf_status.states, -1);
}

int
pf_purge_expired_states(u_int32_t maxcheck, int waslocked)
{
	struct pf_state *cur;
	int locked = waslocked;
	int cpu = mycpu->gd_cpuid;

	while (maxcheck--) {
		/*
		 * Wrap to start of list when we hit the end
		 */
		cur = purge_cur[cpu];
		if (cur == NULL) {
			cur = TAILQ_FIRST(&state_list[cpu]);
			if (cur == NULL)
				break;	/* list empty */
		}

		/*
		 * Setup next (purge_cur) while we process this one.  If
		 * we block and something else deletes purge_cur,
		 * pf_free_state() will adjust it further ahead.
		 */
		purge_cur[cpu] = TAILQ_NEXT(cur, entry_list);

		if (cur->timeout == PFTM_UNLINKED) {
			/* free unlinked state */
			if (!locked) {
				lockmgr(&pf_consistency_lock, LK_EXCLUSIVE);
				locked = 1;
			}
			pf_free_state(cur);
		} else if (pf_state_expires(cur) <= time_second) {
			/* unlink and free expired state */
			pf_unlink_state(cur);
			if (!locked) {
				if (!lockmgr(&pf_consistency_lock, LK_EXCLUSIVE))
					return (0);
				locked = 1;
			}
			pf_free_state(cur);
		}
	}

	if (locked)
		lockmgr(&pf_consistency_lock, LK_RELEASE);
	return (1);
}

int
pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw)
{
	if (aw->type != PF_ADDR_TABLE)
		return (0);
	if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL)
		return (1);
	return (0);
}

void
pf_tbladdr_remove(struct pf_addr_wrap *aw)
{
	if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL)
		return;
	pfr_detach_table(aw->p.tbl);
	aw->p.tbl = NULL;
}

void
pf_tbladdr_copyout(struct pf_addr_wrap *aw)
{
	struct pfr_ktable *kt = aw->p.tbl;

	if (aw->type != PF_ADDR_TABLE || kt == NULL)
		return;
	if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
		kt = kt->pfrkt_root;
	aw->p.tbl = NULL;
	aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
	    kt->pfrkt_cnt : -1;
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		kprintf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			kprintf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					kprintf(":");
				if (i == maxend)
					kprintf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				kprintf("%x", b);
				if (i < 7)
					kprintf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			kprintf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_TCP:
		kprintf("TCP ");
		break;
	case IPPROTO_UDP:
		kprintf("UDP ");
		break;
	case IPPROTO_ICMP:
		kprintf("ICMP ");
		break;
	case IPPROTO_ICMPV6:
		kprintf("ICMPV6 ");
		break;
	default:
		/* use the pre-computed proto; skw may be NULL here */
		kprintf("%u ", proto);
		break;
	}
	switch (dir) {
	case PF_IN:
		kprintf(" in");
		break;
	case PF_OUT:
		kprintf(" out");
		break;
	}
	if (skw) {
		kprintf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		kprintf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		kprintf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			kprintf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			kprintf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			kprintf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				kprintf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			kprintf("]");
			kprintf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				kprintf(" wscale=%u",
				    s->dst.wscale & PF_WSCALE_MASK);
			kprintf("]");
		}
		kprintf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		kprintf(" ");
	if (f & TH_FIN)
		kprintf("F");
	if (f & TH_SYN)
		kprintf("S");
	if (f & TH_RST)
		kprintf("R");
	if (f & TH_PUSH)
		kprintf("P");
	if (f & TH_ACK)
		kprintf("A");
	if (f & TH_URG)
		kprintf("U");
	if (f & TH_ECE)
		kprintf("E");
	if (f & TH_CWR)
		kprintf("W");
}

#define PF_SET_SKIP_STEPS(i)					\
	do {							\
		while (head[i] != cur) {			\
			head[i]->skip[i].ptr = cur;		\
			head[i] = TAILQ_NEXT(head[i], entries);	\
		}						\
	} while (0)

void
pf_calc_skip_steps(struct pf_rulequeue *rules)
{
	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
	int i;

	cur = TAILQ_FIRST(rules);
	prev = cur;
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		head[i] = cur;
	while (cur != NULL) {
		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
		if (cur->direction != prev->direction)
			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
		if (cur->af != prev->af)
			PF_SET_SKIP_STEPS(PF_SKIP_AF);
		if (cur->proto != prev->proto)
			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
		if (cur->src.neg != prev->src.neg ||
		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
		if (cur->src.port[0] != prev->src.port[0] ||
		    cur->src.port[1] != prev->src.port[1] ||
		    cur->src.port_op != prev->src.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
		if (cur->dst.neg != prev->dst.neg ||
		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
		if (cur->dst.port[0] != prev->dst.port[0] ||
		    cur->dst.port[1] != prev->dst.port[1] ||
		    cur->dst.port_op != prev->dst.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);

		prev = cur;
		cur = TAILQ_NEXT(cur, entries);
	}
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		PF_SET_SKIP_STEPS(i);
}
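
/*
 * Illustration of the skip steps computed above (a made-up ruleset):
 *
 *	pass in on em0 proto tcp from any to any port 22
 *	pass in on em0 proto tcp from any to any port 25
 *	pass in on em0 proto udp from any to any port 53
 *	pass in on em1 ...
 *
 * All em0 rules share one PF_SKIP_IFP target (the first em1 rule), and
 * the two tcp rules share a PF_SKIP_PROTO target (the udp rule).  When
 * evaluation fails a rule on one of these criteria it can jump straight
 * to skip[i], bypassing every following rule known to fail the same test.
 */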

int
pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
{
	if (aw1->type != aw2->type)
		return (1);
	switch (aw1->type) {
	case PF_ADDR_ADDRMASK:
	case PF_ADDR_RANGE:
		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
			return (1);
		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
			return (1);
		return (0);
	case PF_ADDR_DYNIFTL:
		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
	case PF_ADDR_NOROUTE:
	case PF_ADDR_URPFFAILED:
		return (0);
	case PF_ADDR_TABLE:
		return (aw1->p.tbl != aw2->p.tbl);
	case PF_ADDR_RTLABEL:
		return (aw1->v.rtlabel != aw2->v.rtlabel);
	default:
		kprintf("invalid address type: %d\n", aw1->type);
		return (1);
	}
}

u_int16_t
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
{
	u_int32_t l;

	if (udp && !cksum)
		return (0x0000);
	l = cksum + old - new;
	l = (l >> 16) + (l & 65535);
	l = l & 65535;
	if (udp && !l)
		return (0xFFFF);
	return (l);
}
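
/*
 * pf_cksum_fixup() is an incremental internet-checksum update in the
 * spirit of RFC 1624: add the old 16-bit word, subtract the new one, and
 * fold the carry back into 16 bits, with special casing for UDP where 0
 * means "no checksum" and the all-ones form must be preserved.  A minimal
 * sketch of a caller-side use (illustrative only):
 */
#if 0
	struct tcphdr *th = ...;
	u_int16_t old_port = th->th_dport;
	u_int16_t new_port = htons(8080);

	th->th_dport = new_port;
	th->th_sum = pf_cksum_fixup(th->th_sum, old_port, new_port, 0);
#endif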
1897
1898void
1899pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1900 struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1901{
1902 struct pf_addr ao;
1903 u_int16_t po = *p;
1904
1905 PF_ACPY(&ao, a, af);
1906 PF_ACPY(a, an, af);
1907
1908 *p = pn;
1909
1910 switch (af) {
1911#ifdef INET
1912 case AF_INET:
1913 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1914 ao.addr16[0], an->addr16[0], 0),
1915 ao.addr16[1], an->addr16[1], 0);
1916 *p = pn;
1917 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1918 ao.addr16[0], an->addr16[0], u),
1919 ao.addr16[1], an->addr16[1], u),
1920 po, pn, u);
1921 break;
1922#endif /* INET */
1923#ifdef INET6
1924 case AF_INET6:
1925 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1926 pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1927 pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1928 ao.addr16[0], an->addr16[0], u),
1929 ao.addr16[1], an->addr16[1], u),
1930 ao.addr16[2], an->addr16[2], u),
1931 ao.addr16[3], an->addr16[3], u),
1932 ao.addr16[4], an->addr16[4], u),
1933 ao.addr16[5], an->addr16[5], u),
1934 ao.addr16[6], an->addr16[6], u),
1935 ao.addr16[7], an->addr16[7], u),
1936 po, pn, u);
1937 break;
1938#endif /* INET6 */
1939 }
1940}
1941
1942
1943/* Changes a u_int32_t. Uses a void * so there are no align restrictions */
1944void
1945pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
1946{
1947 u_int32_t ao;
1948
1949 memcpy(&ao, a, sizeof(ao));
1950 memcpy(a, &an, sizeof(u_int32_t));
1951 *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
1952 ao % 65536, an % 65536, u);
1953}
1954
1955#ifdef INET6
1956void
1957pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
1958{
1959 struct pf_addr ao;
1960
1961 PF_ACPY(&ao, a, AF_INET6);
1962 PF_ACPY(a, an, AF_INET6);
1963
1964 *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1965 pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1966 pf_cksum_fixup(pf_cksum_fixup(*c,
1967 ao.addr16[0], an->addr16[0], u),
1968 ao.addr16[1], an->addr16[1], u),
1969 ao.addr16[2], an->addr16[2], u),
1970 ao.addr16[3], an->addr16[3], u),
1971 ao.addr16[4], an->addr16[4], u),
1972 ao.addr16[5], an->addr16[5], u),
1973 ao.addr16[6], an->addr16[6], u),
1974 ao.addr16[7], an->addr16[7], u);
1975}
1976#endif /* INET6 */
1977
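/*
 * Rewrite the address/port quoted inside an ICMP(v6) error message and
 * fix up every checksum the change touches: the quoted protocol header
 * (*pc), the quoted IP header (*h2c), the ICMP header (*ic) and, when
 * the outer address is rewritten too, the outer IP header (*hc).
 */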
1978void
1979pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
1980 struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
1981 u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
1982{
1983 struct pf_addr oia, ooa;
1984
1985 PF_ACPY(&oia, ia, af);
1986 if (oa)
1987 PF_ACPY(&ooa, oa, af);
1988
1989 /* Change inner protocol port, fix inner protocol checksum. */
1990 if (ip != NULL) {
1991 u_int16_t oip = *ip;
1992 u_int32_t opc = 0;
1993
1994 if (pc != NULL)
1995 opc = *pc;
1996 *ip = np;
1997 if (pc != NULL)
1998 *pc = pf_cksum_fixup(*pc, oip, *ip, u);
1999 *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2000 if (pc != NULL)
2001 *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2002 }
2003 /* Change inner ip address, fix inner ip and icmp checksums. */
2004 PF_ACPY(ia, na, af);
2005 switch (af) {
2006#ifdef INET
2007 case AF_INET: {
2008 u_int32_t oh2c = *h2c;
2009
2010 *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2011 oia.addr16[0], ia->addr16[0], 0),
2012 oia.addr16[1], ia->addr16[1], 0);
2013 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2014 oia.addr16[0], ia->addr16[0], 0),
2015 oia.addr16[1], ia->addr16[1], 0);
2016 *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2017 break;
2018 }
2019#endif /* INET */
2020#ifdef INET6
2021 case AF_INET6:
2022 *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2023 pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2024 pf_cksum_fixup(pf_cksum_fixup(*ic,
2025 oia.addr16[0], ia->addr16[0], u),
2026 oia.addr16[1], ia->addr16[1], u),
2027 oia.addr16[2], ia->addr16[2], u),
2028 oia.addr16[3], ia->addr16[3], u),
2029 oia.addr16[4], ia->addr16[4], u),
2030 oia.addr16[5], ia->addr16[5], u),
2031 oia.addr16[6], ia->addr16[6], u),
2032 oia.addr16[7], ia->addr16[7], u);
2033 break;
2034#endif /* INET6 */
2035 }
2036 /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2037 if (oa) {
2038 PF_ACPY(oa, na, af);
2039 switch (af) {
2040#ifdef INET
2041 case AF_INET:
2042 *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2043 ooa.addr16[0], oa->addr16[0], 0),
2044 ooa.addr16[1], oa->addr16[1], 0);
2045 break;
2046#endif /* INET */
2047#ifdef INET6
2048 case AF_INET6:
2049 *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2050 pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2051 pf_cksum_fixup(pf_cksum_fixup(*ic,
2052 ooa.addr16[0], oa->addr16[0], u),
2053 ooa.addr16[1], oa->addr16[1], u),
2054 ooa.addr16[2], oa->addr16[2], u),
2055 ooa.addr16[3], oa->addr16[3], u),
2056 ooa.addr16[4], oa->addr16[4], u),
2057 ooa.addr16[5], oa->addr16[5], u),
2058 ooa.addr16[6], oa->addr16[6], u),
2059 ooa.addr16[7], oa->addr16[7], u);
2060 break;
2061#endif /* INET6 */
2062 }
2063 }
2064}
2065
2066
2067/*
2068 * Need to modulate the sequence numbers in the TCP SACK option,
2069 * since the SACK blocks echo the peer's modulated sequence space
 * and must be shifted by the same seqdiff
 * (credits to Krzysztof Pfaff for report and patch)
2070 */
2071int
2072pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2073 struct tcphdr *th, struct pf_state_peer *dst)
2074{
2075 int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2076 u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2077 int copyback = 0, i, olen;
2078 struct raw_sackblock sack;
2079
2080#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
2081 if (hlen < TCPOLEN_SACKLEN ||
2082 !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2083		return (0);
2084
2085 while (hlen >= TCPOLEN_SACKLEN) {
2086 olen = opt[1];
2087 switch (*opt) {
2088 case TCPOPT_EOL: /* FALLTHROUGH */
2089 case TCPOPT_NOP:
2090 opt++;
2091 hlen--;
2092 break;
2093 case TCPOPT_SACK:
2094 if (olen > hlen)
2095 olen = hlen;
2096 if (olen >= TCPOLEN_SACKLEN) {
2097 for (i = 2; i + TCPOLEN_SACK <= olen;
2098 i += TCPOLEN_SACK) {
2099 memcpy(&sack, &opt[i], sizeof(sack));
2100 pf_change_a(&sack.rblk_start, &th->th_sum,
2101 htonl(ntohl(sack.rblk_start) -
2102 dst->seqdiff), 0);
2103 pf_change_a(&sack.rblk_end, &th->th_sum,
2104 htonl(ntohl(sack.rblk_end) -
2105 dst->seqdiff), 0);
2106 memcpy(&opt[i], &sack, sizeof(sack));
2107 }
2108 copyback = 1;
2109 }
2110 /* FALLTHROUGH */
2111 default:
2112 if (olen < 2)
2113 olen = 2;
2114 hlen -= olen;
2115 opt += olen;
2116 }
2117 }
2118
2119 if (copyback)
2120 m_copyback(m, off + sizeof(*th), thoptlen, opts);
2121 return (copyback);
2122}
2123
2124void
2125pf_send_tcp(const struct pf_rule *r, sa_family_t af,
2126 const struct pf_addr *saddr, const struct pf_addr *daddr,
2127 u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2128 u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2129 u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp)
2130{
2131 struct mbuf *m;
2132 int len = 0, tlen;
2133#ifdef INET
2134 struct ip *h = NULL;
2135#endif /* INET */
2136#ifdef INET6
2137 struct ip6_hdr *h6 = NULL;
2138#endif /* INET6 */
2139 struct tcphdr *th = NULL;
2140 char *opt;
2141
2142 ASSERT_LWKT_TOKEN_HELD(&pf_token);
2143
2144 /* maximum segment size tcp option */
2145 tlen = sizeof(struct tcphdr);
2146 if (mss)
2147 tlen += 4;
2148
2149 switch (af) {
2150#ifdef INET
2151 case AF_INET:
2152 len = sizeof(struct ip) + tlen;
2153 break;
2154#endif /* INET */
2155#ifdef INET6
2156 case AF_INET6:
2157 len = sizeof(struct ip6_hdr) + tlen;
2158 break;
2159#endif /* INET6 */
2160 }
2161
2162 /*
2163 * Create outgoing mbuf.
2164 *
2165	 * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
2166 * so make sure pf.flags is clear.
2167 */
2168 m = m_gethdr(M_NOWAIT, MT_HEADER);
2169 if (m == NULL) {
2170 return;
2171 }
2172 if (tag)
2173 m->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2174 m->m_pkthdr.pf.flags = 0;
2175 m->m_pkthdr.pf.tag = rtag;
2176 /* XXX Recheck when upgrading to > 4.4 */
2177 m->m_pkthdr.pf.statekey = NULL;
2178 if (r != NULL && r->rtableid >= 0)
2179 m->m_pkthdr.pf.rtableid = r->rtableid;
2180
2181#ifdef ALTQ
2182 if (r != NULL && r->qid) {
2183 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
2184 m->m_pkthdr.pf.qid = r->qid;
2185 m->m_pkthdr.pf.ecn_af = af;
2186 m->m_pkthdr.pf.hdr = mtod(m, struct ip *);
2187 }
2188#endif /* ALTQ */
2189 m->m_data += max_linkhdr;
2190 m->m_pkthdr.len = m->m_len = len;
2191 m->m_pkthdr.rcvif = NULL;
2192 bzero(m->m_data, len);
2193 switch (af) {
2194#ifdef INET
2195 case AF_INET:
2196 h = mtod(m, struct ip *);
2197
2198 /* IP header fields included in the TCP checksum */
2199 h->ip_p = IPPROTO_TCP;
2200 h->ip_len = htons(tlen);
2201 h->ip_src.s_addr = saddr->v4.s_addr;
2202 h->ip_dst.s_addr = daddr->v4.s_addr;
2203
2204 th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2205 break;
2206#endif /* INET */
2207#ifdef INET6
2208 case AF_INET6:
2209 h6 = mtod(m, struct ip6_hdr *);
2210
2211 /* IP header fields included in the TCP checksum */
2212 h6->ip6_nxt = IPPROTO_TCP;
2213 h6->ip6_plen = htons(tlen);
2214 memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2215 memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2216
2217 th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2218 break;
2219#endif /* INET6 */
2220 }
2221
2222 /* TCP header */
2223 th->th_sport = sport;
2224 th->th_dport = dport;
2225 th->th_seq = htonl(seq);
2226 th->th_ack = htonl(ack);
2227 th->th_off = tlen >> 2;
2228 th->th_flags = flags;
2229 th->th_win = htons(win);
2230
2231 if (mss) {
2232 opt = (char *)(th + 1);
2233 opt[0] = TCPOPT_MAXSEG;
2234 opt[1] = 4;
2235 mss = htons(mss);
2236 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2237 }
2238
2239 switch (af) {
2240#ifdef INET
2241 case AF_INET:
2242 /* TCP checksum */
2243 th->th_sum = in_cksum(m, len);
2244
2245 /* Finish the IP header */
2246 h->ip_v = 4;
2247 h->ip_hl = sizeof(*h) >> 2;
2248 h->ip_tos = IPTOS_LOWDELAY;
2249 h->ip_len = htons(len);
2250 h->ip_off = path_mtu_discovery ? htons(IP_DF) : 0;
2251 h->ip_ttl = ttl ? ttl : ip_defttl;
2252 h->ip_sum = 0;
2253 if (eh == NULL) {
2254 lwkt_reltoken(&pf_token);
2255 ip_output(m, NULL, NULL, 0, NULL, NULL);
2256 lwkt_gettoken(&pf_token);
2257 } else {
2258 struct route ro;
2259 struct rtentry rt;
2260 struct ether_header *e = (void *)ro.ro_dst.sa_data;
2261
2262 if (ifp == NULL) {
2263 m_freem(m);
2264 return;
2265 }
2266 rt.rt_ifp = ifp;
2267 ro.ro_rt = &rt;
2268 ro.ro_dst.sa_len = sizeof(ro.ro_dst);
2269 ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
2270 bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
2271 bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
2272 e->ether_type = eh->ether_type;
2273 /* XXX_IMPORT: later */
2274 lwkt_reltoken(&pf_token);
2275 ip_output(m, NULL, &ro, 0, NULL, NULL);
2276 lwkt_gettoken(&pf_token);
2277 }
2278 break;
2279#endif /* INET */
2280#ifdef INET6
2281 case AF_INET6:
2282 /* TCP checksum */
2283 th->th_sum = in6_cksum(m, IPPROTO_TCP,
2284 sizeof(struct ip6_hdr), tlen);
2285
2286 h6->ip6_vfc |= IPV6_VERSION;
2287 h6->ip6_hlim = IPV6_DEFHLIM;
2288
2289 lwkt_reltoken(&pf_token);
2290 ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
2291 lwkt_gettoken(&pf_token);
2292 break;
2293#endif /* INET6 */
2294 }
2295}
2296
2297void
2298pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2299 struct pf_rule *r)
2300{
2301 struct mbuf *m0;
2302
2303 /*
2304	 * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
2305 * so make sure pf.flags is clear.
2306 */
2307 if ((m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
2308 return;
2309
2310 m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
2311 m0->m_pkthdr.pf.flags = 0;
2312	/* XXX Recheck when upgrading to > 4.4 */
2313 m0->m_pkthdr.pf.statekey = NULL;
2314
2315 if (r->rtableid >= 0)
2316 m0->m_pkthdr.pf.rtableid = r->rtableid;
2317
2318#ifdef ALTQ
2319 if (r->qid) {
2320		m0->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE; /* tag the copy */
2321 m0->m_pkthdr.pf.qid = r->qid;
2322 m0->m_pkthdr.pf.ecn_af = af;
2323 m0->m_pkthdr.pf.hdr = mtod(m0, struct ip *);
2324 }
2325#endif /* ALTQ */
2326
2327 switch (af) {
2328#ifdef INET
2329 case AF_INET:
2330 icmp_error(m0, type, code, 0, 0);
2331 break;
2332#endif /* INET */
2333#ifdef INET6
2334 case AF_INET6:
2335 icmp6_error(m0, type, code, 0);
2336 break;
2337#endif /* INET6 */
2338 }
2339}
2340
2341/*
2342 * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2343 * If n is zero they match when they are equal; if n is non-zero the
2344 * sense is inverted and they match when they are different.
2345 */
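/*
 * For example (a sketch): with af = AF_INET, m = 255.255.255.0 and
 * n = 0, pf_match_addr() returns 1 whenever a and b lie in the same
 * /24 network.
 */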
2346int
2347pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2348 struct pf_addr *b, sa_family_t af)
2349{
2350 int match = 0;
2351
2352 switch (af) {
2353#ifdef INET
2354 case AF_INET:
2355 if ((a->addr32[0] & m->addr32[0]) ==
2356 (b->addr32[0] & m->addr32[0]))
2357 match++;
2358 break;
2359#endif /* INET */
2360#ifdef INET6
2361 case AF_INET6:
2362 if (((a->addr32[0] & m->addr32[0]) ==
2363 (b->addr32[0] & m->addr32[0])) &&
2364 ((a->addr32[1] & m->addr32[1]) ==
2365 (b->addr32[1] & m->addr32[1])) &&
2366 ((a->addr32[2] & m->addr32[2]) ==
2367 (b->addr32[2] & m->addr32[2])) &&
2368 ((a->addr32[3] & m->addr32[3]) ==
2369 (b->addr32[3] & m->addr32[3])))
2370 match++;
2371 break;
2372#endif /* INET6 */
2373 }
2374 if (match) {
2375 if (n)
2376 return (0);
2377 else
2378 return (1);
2379 } else {
2380 if (n)
2381 return (1);
2382 else
2383 return (0);
2384 }
2385}
2386
2387/*
2388 * Return 1 if b <= a <= e, otherwise return 0.
2389 */
2390int
2391pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2392 struct pf_addr *a, sa_family_t af)
2393{
2394 switch (af) {
2395#ifdef INET
2396 case AF_INET:
2397 if ((a->addr32[0] < b->addr32[0]) ||
2398 (a->addr32[0] > e->addr32[0]))
2399 return (0);
2400 break;
2401#endif /* INET */
2402#ifdef INET6
2403 case AF_INET6: {
2404 int i;
2405
2406 /* check a >= b */
2407 for (i = 0; i < 4; ++i)
2408 if (a->addr32[i] > b->addr32[i])
2409 break;
2410 else if (a->addr32[i] < b->addr32[i])
2411 return (0);
2412 /* check a <= e */
2413 for (i = 0; i < 4; ++i)
2414 if (a->addr32[i] < e->addr32[i])
2415 break;
2416 else if (a->addr32[i] > e->addr32[i])
2417 return (0);
2418 break;
2419 }
2420#endif /* INET6 */
2421 }
2422 return (1);
2423}
2424
2425int
2426pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2427{
2428 switch (op) {
2429 case PF_OP_IRG:
2430 return ((p > a1) && (p < a2));
2431 case PF_OP_XRG:
2432 return ((p < a1) || (p > a2));
2433 case PF_OP_RRG:
2434 return ((p >= a1) && (p <= a2));
2435 case PF_OP_EQ:
2436 return (p == a1);
2437 case PF_OP_NE:
2438 return (p != a1);
2439 case PF_OP_LT:
2440 return (p < a1);
2441 case PF_OP_LE:
2442 return (p <= a1);
2443 case PF_OP_GT:
2444 return (p > a1);
2445 case PF_OP_GE:
2446 return (p >= a1);
2447 }
2448 return (0); /* never reached */
2449}
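
/*
 * For example, pf_match(PF_OP_RRG, 1024, 65535, p) implements an
 * inclusive range check such as pf.conf's "port 1024:65535".
 */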
2450
2451int
2452pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2453{
2454 a1 = ntohs(a1);
2455 a2 = ntohs(a2);
2456 p = ntohs(p);
2457 return (pf_match(op, a1, a2, p));
2458}
2459
2460int
2461pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2462{
2463 if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2464 return (0);
2465 return (pf_match(op, a1, a2, u));
2466}
2467
2468int
2469pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2470{
2471 if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2472 return (0);
2473 return (pf_match(op, a1, a2, g));
2474}
2475
2476int
2477pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag)
2478{
2479 if (*tag == -1)
2480 *tag = m->m_pkthdr.pf.tag;
2481
2482 return ((!r->match_tag_not && r->match_tag == *tag) ||
2483 (r->match_tag_not && r->match_tag != *tag));
2484}
2485
2486int
2487pf_tag_packet(struct mbuf *m, int tag, int rtableid)
2488{
2489 if (tag <= 0 && rtableid < 0)
2490 return (0);
2491
2492 if (tag > 0)
2493 m->m_pkthdr.pf.tag = tag;
2494 if (rtableid >= 0)
2495 m->m_pkthdr.pf.rtableid = rtableid;
2496
2497 return (0);
2498}
2499
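/*
 * Descend into an anchor rule: push the current ruleset/rule onto
 * pf_anchor_stack and continue evaluation at the first rule of the
 * anchor's ruleset (or of its first child, for wildcard anchors).
 */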
2500void
2501pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
2502 struct pf_rule **r, struct pf_rule **a, int *match)
2503{
2504 struct pf_anchor_stackframe *f;
2505
2506 (*r)->anchor->match = 0;
2507 if (match)
2508 *match = 0;
2509 if (*depth >= NELEM(pf_anchor_stack)) {
2510 kprintf("pf_step_into_anchor: stack overflow\n");
2511 *r = TAILQ_NEXT(*r, entries);
2512 return;
2513 } else if (*depth == 0 && a != NULL)
2514 *a = *r;
2515 f = pf_anchor_stack + (*depth)++;
2516 f->rs = *rs;
2517 f->r = *r;
2518 if ((*r)->anchor_wildcard) {
2519 f->parent = &(*r)->anchor->children;
2520 if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
2521 NULL) {
2522 *r = NULL;
2523 return;
2524 }
2525 *rs = &f->child->ruleset;
2526 } else {
2527 f->parent = NULL;
2528 f->child = NULL;
2529 *rs = &(*r)->anchor->ruleset;
2530 }
2531 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2532}
2533
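/*
 * Return from an anchor: pop pf_anchor_stack, visiting any remaining
 * children of a wildcard anchor first.  Returns the 'quick' flag of
 * the anchor rule when the anchor matched.
 */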
2534int
2535pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
2536 struct pf_rule **r, struct pf_rule **a, int *match)
2537{
2538 struct pf_anchor_stackframe *f;
2539 int quick = 0;
2540
2541 do {
2542 if (*depth <= 0)
2543 break;
2544 f = pf_anchor_stack + *depth - 1;
2545 if (f->parent != NULL && f->child != NULL) {
2546 if (f->child->match ||
2547 (match != NULL && *match)) {
2548 f->r->anchor->match = 1;
2549 *match = 0;
2550 }
2551 f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
2552 if (f->child != NULL) {
2553 *rs = &f->child->ruleset;
2554 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2555 if (*r == NULL)
2556 continue;
2557 else
2558 break;
2559 }
2560 }
2561 (*depth)--;
2562 if (*depth == 0 && a != NULL)
2563 *a = NULL;
2564 *rs = f->rs;
2565 if (f->r->anchor->match || (match != NULL && *match))
2566 quick = f->r->quick;
2567 *r = TAILQ_NEXT(f->r, entries);
2568 } while (*r == NULL);
2569
2570 return (quick);
2571}
2572
2573#ifdef INET6
2574void
2575pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2576 struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2577{
2578 switch (af) {
2579#ifdef INET
2580 case AF_INET:
2581 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2582 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2583 break;
2584#endif /* INET */
2585 case AF_INET6:
2586 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2587 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2588 naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2589 ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2590 naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2591 ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2592 naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2593 ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2594 break;
2595 }
2596}
2597
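/*
 * Increment an address by one, treating it as a big number in network
 * byte order and propagating the carry across the four 32-bit words
 * for IPv6.
 */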
2598void
2599pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2600{
2601 switch (af) {
2602#ifdef INET
2603 case AF_INET:
2604 addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2605 break;
2606#endif /* INET */
2607 case AF_INET6:
2608 if (addr->addr32[3] == 0xffffffff) {
2609 addr->addr32[3] = 0;
2610 if (addr->addr32[2] == 0xffffffff) {
2611 addr->addr32[2] = 0;
2612 if (addr->addr32[1] == 0xffffffff) {
2613 addr->addr32[1] = 0;
2614 addr->addr32[0] =
2615 htonl(ntohl(addr->addr32[0]) + 1);
2616 } else
2617 addr->addr32[1] =
2618 htonl(ntohl(addr->addr32[1]) + 1);
2619 } else
2620 addr->addr32[2] =
2621 htonl(ntohl(addr->addr32[2]) + 1);
2622 } else
2623 addr->addr32[3] =
2624 htonl(ntohl(addr->addr32[3]) + 1);
2625 break;
2626 }
2627}
2628#endif /* INET6 */
2629
2630#define mix(a,b,c) \
2631 do { \
2632 a -= b; a -= c; a ^= (c >> 13); \
2633 b -= c; b -= a; b ^= (a << 8); \
2634 c -= a; c -= b; c ^= (b >> 13); \
2635 a -= b; a -= c; a ^= (c >> 12); \
2636 b -= c; b -= a; b ^= (a << 16); \
2637 c -= a; c -= b; c ^= (b >> 5); \
2638 a -= b; a -= c; a ^= (c >> 3); \
2639 b -= c; b -= a; b ^= (a << 10); \
2640 c -= a; c -= b; c ^= (b >> 15); \
2641 } while (0)
2642
2643/*
2644 * Hash function based on bridge_hash in if_bridge.c (a Bob Jenkins
 * lookup2-style mix; 0x9e3779b9 is the golden-ratio constant).
2645 */
2646void
2647pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
2648 struct pf_poolhashkey *key, sa_family_t af)
2649{
2650 u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
2651
2652 switch (af) {
2653#ifdef INET
2654 case AF_INET:
2655 a += inaddr->addr32[0];
2656 b += key->key32[1];
2657 mix(a, b, c);
2658 hash->addr32[0] = c + key->key32[2];
2659 break;
2660#endif /* INET */
2661#ifdef INET6
2662 case AF_INET6:
2663 a += inaddr->addr32[0];
2664 b += inaddr->addr32[2];
2665 mix(a, b, c);
2666 hash->addr32[0] = c;
2667 a += inaddr->addr32[1];
2668 b += inaddr->addr32[3];
2669 c += key->key32[1];
2670 mix(a, b, c);
2671 hash->addr32[1] = c;
2672 a += inaddr->addr32[2];
2673 b += inaddr->addr32[1];
2674 c += key->key32[2];
2675 mix(a, b, c);
2676 hash->addr32[2] = c;
2677 a += inaddr->addr32[3];
2678 b += inaddr->addr32[0];
2679 c += key->key32[3];
2680 mix(a, b, c);
2681 hash->addr32[3] = c;
2682 break;
2683#endif /* INET6 */
2684 }
2685}
2686
2687int
2688pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
2689 struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
2690{
2691 unsigned char hash[16];
2692 struct pf_pool *rpool = &r->rpool;
2693 struct pf_pooladdr *acur = rpool->cur;
2694 struct pf_pooladdr *cur;
2695 struct pf_addr *raddr;
2696 struct pf_addr *rmask;
2697 struct pf_addr counter;
2698 struct pf_src_node k;
2699 int cpu = mycpu->gd_cpuid;
2700 int tblidx;
2701
2702 bzero(hash, sizeof(hash)); /* avoid gcc warnings */
2703
2704 /*
2705 * NOTE! rpool->cur and rpool->tblidx can be iterators and thus
2706	 * may represent an SMP race due to the shared nature of the
2707 * rpool structure. We allow the race and ensure that updates
2708 * do not create a fatal condition.
2709 */
2710 cpu_ccfence();
2711 cur = acur;
2712 raddr = &cur->addr.v.a.addr;
2713 rmask = &cur->addr.v.a.mask;
2714
2715 if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
2716 (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2717 k.af = af;
2718 PF_ACPY(&k.addr, saddr, af);
2719 if (r->rule_flag & PFRULE_RULESRCTRACK ||
2720 r->rpool.opts & PF_POOL_STICKYADDR)
2721 k.rule.ptr = r;
2722 else
2723 k.rule.ptr = NULL;
2724 PF_INC_SCOUNTER(SCNT_SRC_NODE_SEARCH);
2725 *sn = RB_FIND(pf_src_tree, &tree_src_tracking[cpu], &k);
2726 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
2727 PF_ACPY(naddr, &(*sn)->raddr, af);
2728 if (pf_status.debug >= PF_DEBUG_MISC) {
2729 kprintf("pf_map_addr: src tracking maps ");
2730 pf_print_host(&k.addr, 0, af);
2731 kprintf(" to ");
2732 pf_print_host(naddr, 0, af);
2733 kprintf("\n");
2734 }
2735 return (0);
2736 }
2737 }
2738
2739 if (cur->addr.type == PF_ADDR_NOROUTE)
2740 return (1);
2741 if (cur->addr.type == PF_ADDR_DYNIFTL) {
2742 switch (af) {
2743#ifdef INET
2744 case AF_INET:
2745 if (cur->addr.p.dyn->pfid_acnt4 < 1 &&
2746 (rpool->opts & PF_POOL_TYPEMASK) !=
2747 PF_POOL_ROUNDROBIN)
2748 return (1);
2749 raddr = &cur->addr.p.dyn->pfid_addr4;
2750 rmask = &cur->addr.p.dyn->pfid_mask4;
2751 break;
2752#endif /* INET */
2753#ifdef INET6
2754 case AF_INET6:
2755 if (cur->addr.p.dyn->pfid_acnt6 < 1 &&
2756 (rpool->opts & PF_POOL_TYPEMASK) !=
2757 PF_POOL_ROUNDROBIN)
2758 return (1);
2759 raddr = &cur->addr.p.dyn->pfid_addr6;
2760 rmask = &cur->addr.p.dyn->pfid_mask6;
2761 break;
2762#endif /* INET6 */
2763 }
2764 } else if (cur->addr.type == PF_ADDR_TABLE) {
2765 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
2766 return (1); /* unsupported */
2767 } else {
2768 raddr = &cur->addr.v.a.addr;
2769 rmask = &cur->addr.v.a.mask;
2770 }
2771
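	/*
	 * Select the translation address according to the pool type
	 * configured in pf.conf, e.g. (illustrative rules only, not
	 * taken from this file):
	 *
	 *	nat on $ext_if from 10/8 -> ($ext_if)			(none)
	 *	nat on $ext_if from 10/8 -> 192.0.2.0/24 round-robin
	 *	nat on $ext_if from 10/8 -> 192.0.2.0/24 source-hash
	 */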
2772 switch (rpool->opts & PF_POOL_TYPEMASK) {
2773 case PF_POOL_NONE:
2774 PF_ACPY(naddr, raddr, af);
2775 break;
2776 case PF_POOL_BITMASK:
2777 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
2778 break;
2779 case PF_POOL_RANDOM:
2780 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
2781 switch (af) {
2782#ifdef INET
2783 case AF_INET:
2784 counter.addr32[0] = htonl(karc4random());
2785 break;
2786#endif /* INET */
2787#ifdef INET6
2788 case AF_INET6:
2789 if (rmask->addr32[3] != 0xffffffff)
2790 counter.addr32[3] =
2791 htonl(karc4random());
2792 else
2793 break;
2794 if (rmask->addr32[2] != 0xffffffff)
2795 counter.addr32[2] =
2796 htonl(karc4random());
2797 else
2798 break;
2799 if (rmask->addr32[1] != 0xffffffff)
2800 counter.addr32[1] =
2801 htonl(karc4random());
2802 else
2803 break;
2804 if (rmask->addr32[0] != 0xffffffff)
2805 counter.addr32[0] =
2806 htonl(karc4random());
2807 break;
2808#endif /* INET6 */
2809 }
2810 PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2811 PF_ACPY(init_addr, naddr, af);
2812
2813 } else {
2814 counter = rpool->counter;
2815 cpu_ccfence();
2816 PF_AINC(&counter, af);
2817 PF_POOLMASK(naddr, raddr, rmask, &counter, af);
2818 rpool->counter = counter;
2819 }
2820 break;
2821 case PF_POOL_SRCHASH:
2822 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
2823 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
2824 break;
2825 case PF_POOL_ROUNDROBIN:
2826 tblidx = rpool->tblidx;
2827 counter = rpool->counter;
2828 if (cur->addr.type == PF_ADDR_TABLE) {
2829 if (!pfr_pool_get(cur->addr.p.tbl,
2830 &tblidx, &counter,
2831 &raddr, &rmask, af)) {
2832 goto get_addr;
2833 }
2834 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2835 if (!pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2836 &tblidx, &counter,
2837 &raddr, &rmask, af)) {
2838 goto get_addr;
2839 }
2840 } else if (pf_match_addr(0, raddr, rmask,
2841 &counter, af)) {
2842 goto get_addr;
2843 }
2844
2845 try_next:
2846 if ((cur = TAILQ_NEXT(cur, entries)) == NULL)
2847 cur = TAILQ_FIRST(&rpool->list);
2848 if (cur->addr.type == PF_ADDR_TABLE) {
2849 tblidx = -1;
2850 if (pfr_pool_get(cur->addr.p.tbl,
2851 &tblidx, &counter,
2852 &raddr, &rmask, af)) {
2853 /* table contains no address of type 'af' */
2854 if (cur != acur)
2855 goto try_next;
2856 return (1);
2857 }
2858 } else if (cur->addr.type == PF_ADDR_DYNIFTL) {
2859 tblidx = -1;
2860 if (pfr_pool_get(cur->addr.p.dyn->pfid_kt,
2861 &tblidx, &counter,
2862 &raddr, &rmask, af)) {
2863 /* table contains no address of type 'af' */
2864 if (cur != acur)
2865 goto try_next;
2866 return (1);
2867 }
2868 } else {
2869 raddr = &cur->addr.v.a.addr;
2870 rmask = &cur->addr.v.a.mask;
2871 PF_ACPY(&counter, raddr, af);
2872 }
2873
2874 get_addr:
2875 rpool->cur = cur;
2876 rpool->tblidx = tblidx;
2877 PF_ACPY(naddr, &counter, af);
2878 if (init_addr != NULL && PF_AZERO(init_addr, af))
2879 PF_ACPY(init_addr, naddr, af);
2880 PF_AINC(&counter, af);
2881 rpool->counter = counter;
2882 break;
2883 }
2884 if (*sn != NULL)
2885 PF_ACPY(&(*sn)->raddr, naddr, af);
2886
2887 if (pf_status.debug >= PF_DEBUG_MISC &&
2888 (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
2889 kprintf("pf_map_addr: selected address ");
2890 pf_print_host(naddr, 0, af);
2891 kprintf("\n");
2892 }
2893
2894 return (0);
2895}
2896
2897int
2898pf_get_sport(struct pf_pdesc *pd, sa_family_t af,
2899 u_int8_t proto, struct pf_rule *r,
2900 struct pf_addr *saddr, struct pf_addr *daddr,
2901 u_int16_t sport, u_int16_t dport,
2902 struct pf_addr *naddr, u_int16_t *nport,
2903 u_int16_t low, u_int16_t high, struct pf_src_node **sn)
2904{
2905 struct pf_state_key_cmp key;
2906 struct pf_addr init_addr;
2907 u_int16_t cut;
2908 u_int32_t hash_base = 0;
2909 int do_hash = 0;
2910
2911 bzero(&init_addr, sizeof(init_addr));
2912 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
2913 return (1);
2914
2915 if (proto == IPPROTO_ICMP) {
2916 low = 1;
2917 high = 65535;
2918 }
2919
2920 bzero(&key, sizeof(key));
2921 key.af = af;
2922 key.proto = proto;
2923 key.port[0] = dport;
2924 PF_ACPY(&key.addr[0], daddr, key.af);
2925
2926 do {
2927 PF_ACPY(&key.addr[1], naddr, key.af);
2928
2929 /*
2930 * We want to select a port that calculates to a toeplitz hash
2931 * that masks to the same cpu, otherwise the response may
2932 * not see the new state.
2933 *
2934 * We can still do this even if the kernel is disregarding
2935 * the hash and vectoring the packets to a specific cpu,
2936 * but it will reduce the number of ports we can use.
2937 */
2938 switch(af) {
2939 case AF_INET:
2940 if (proto == IPPROTO_TCP) {
2941 do_hash = 1;
2942 hash_base = toeplitz_piecemeal_port(dport) ^
2943 toeplitz_piecemeal_addr(daddr->v4.s_addr) ^
2944 toeplitz_piecemeal_addr(naddr->v4.s_addr);
2945 }
2946 break;
2947 case AF_INET6:
2948 /* XXX TODO XXX */
2949 default:
2950 /* XXX TODO XXX */
2951 break;
2952 }
2953
2954		 * Port search: start at a random port and step through the
2955		 * range, similar to the port loop in in_pcbbind.
2956		 *
2957		 * WARNING! We try to match such that the kernel will
2958		 * dispatch the translated host/port to the same
2959		 * cpu, but this might not be possible.
2960		 *
2961		 * In the case where the port is fixed, or for the UDP case
2962		 * (whose toeplitz hash does not incorporate the port), we set
2963		 * not_cpu_localized, which ultimately causes the pf_state_tree
2964		 * element to be treated as not cpu-localized.
2965		 *
2966		 * XXX fixed ports present a problem for cpu localization.
2967 * XXX fixed ports present a problem for cpu localization.
2968 */
2969 if (!(proto == IPPROTO_TCP ||
2970 proto == IPPROTO_UDP ||
2971 proto == IPPROTO_ICMP)) {
2972 /*
2973 * non-specific protocol, leave port intact.
2974 */
2975 key.port[1] = sport;
2976 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2977 *nport = sport;
2978 pd->not_cpu_localized = 1;
2979 return (0);
2980 }
2981 } else if (low == 0 && high == 0) {
2982 /*
2983 * static-port same as originator.
2984 */
2985 key.port[1] = sport;
2986 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2987 *nport = sport;
2988 pd->not_cpu_localized = 1;
2989 return (0);
2990 }
2991 } else if (low == high) {
2992 /*
2993 * specific port as specified.
2994 */
2995 key.port[1] = htons(low);
2996 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
2997 *nport = htons(low);
2998 pd->not_cpu_localized = 1;
2999 return (0);
3000 }
3001 } else {
3002 /*
3003 * normal dynamic port
3004 */
3005 u_int16_t tmp;
3006
3007 if (low > high) {
3008 tmp = low;
3009 low = high;
3010 high = tmp;
3011 }
3012 /* low < high */
3013 cut = htonl(karc4random()) % (1 + high - low) + low;
3014 /* low <= cut <= high */
3015 for (tmp = cut; tmp <= high; ++(tmp)) {
3016 key.port[1] = htons(tmp);
3017 if (do_hash) {
3018 uint32_t hash;
3019
3020 hash = hash_base ^
3021 toeplitz_piecemeal_port(key.port[1]);
3022 if (netisr_hashcpu(hash) != mycpuid)
3023 continue;
3024 }
3025 if (pf_find_state_all(&key, PF_IN, NULL) ==
3026 NULL && !in_baddynamic(tmp, proto)) {
3027 if (proto == IPPROTO_UDP)
3028 pd->not_cpu_localized = 1;
3029 *nport = htons(tmp);
3030 return (0);
3031 }
3032 }
3033 for (tmp = cut - 1; tmp >= low; --(tmp)) {
3034 key.port[1] = htons(tmp);
3035 if (do_hash) {
3036 uint32_t hash;
3037
3038 hash = hash_base ^
3039 toeplitz_piecemeal_port(key.port[1]);
3040 if (netisr_hashcpu(hash) != mycpuid)
3041 continue;
3042 }
3043 if (pf_find_state_all(&key, PF_IN, NULL) ==
3044 NULL && !in_baddynamic(tmp, proto)) {
3045 if (proto == IPPROTO_UDP)
3046 pd->not_cpu_localized = 1;
3047 *nport = htons(tmp);
3048 return (0);
3049 }
3050 }
3051 }
3052
3053 /*
3054 * Next address
3055 */
3056 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
3057 case PF_POOL_RANDOM:
3058 case PF_POOL_ROUNDROBIN:
3059 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
3060 return (1);
3061 break;
3062 case PF_POOL_NONE:
3063 case PF_POOL_SRCHASH:
3064 case PF_POOL_BITMASK:
3065 default:
3066 return (1);
3067 }
3068 } while (! PF_AEQ(&init_addr, naddr, af) );
3069 return (1); /* none available */
3070}
3071
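/*
 * Walk one translation ruleset (nat, rdr or binat) and return the
 * first matching rule, descending into anchors as needed.  Returns
 * NULL when nothing matches, when tagging fails, or when the match is
 * a "no nat"/"no rdr"/"no binat" rule.
 */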
3072struct pf_rule *
3073pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
3074 int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
3075 struct pf_addr *daddr, u_int16_t dport, int rs_num)
3076{
3077 struct pf_rule *r, *rm = NULL;
3078 struct pf_ruleset *ruleset = NULL;
3079 int tag = -1;
3080 int rtableid = -1;
3081 int asd = 0;
3082
3083 r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
3084 while (r && rm == NULL) {
3085 struct pf_rule_addr *src = NULL, *dst = NULL;
3086 struct pf_addr_wrap *xdst = NULL;
3087 struct pf_pooladdr *cur;
3088
3089 if (r->action == PF_BINAT && direction == PF_IN) {
3090 src = &r->dst;
3091 cur = r->rpool.cur; /* SMP race possible */
3092 cpu_ccfence();
3093 if (cur)
3094 xdst = &cur->addr;
3095 } else {
3096 src = &r->src;
3097 dst = &r->dst;
3098 }
3099
3100 r->evaluations++;
3101 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3102 r = r->skip[PF_SKIP_IFP].ptr;
3103 else if (r->direction && r->direction != direction)
3104 r = r->skip[PF_SKIP_DIR].ptr;
3105 else if (r->af && r->af != pd->af)
3106 r = r->skip[PF_SKIP_AF].ptr;
3107 else if (r->proto && r->proto != pd->proto)
3108 r = r->skip[PF_SKIP_PROTO].ptr;
3109 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
3110 src->neg, kif))
3111 r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
3112 PF_SKIP_DST_ADDR].ptr;
3113 else if (src->port_op && !pf_match_port(src->port_op,
3114 src->port[0], src->port[1], sport))
3115 r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
3116 PF_SKIP_DST_PORT].ptr;
3117 else if (dst != NULL &&
3118 PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
3119 r = r->skip[PF_SKIP_DST_ADDR].ptr;
3120 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
3121 0, NULL))
3122 r = TAILQ_NEXT(r, entries);
3123 else if (dst != NULL && dst->port_op &&
3124 !pf_match_port(dst->port_op, dst->port[0],
3125 dst->port[1], dport))
3126 r = r->skip[PF_SKIP_DST_PORT].ptr;
3127 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3128 r = TAILQ_NEXT(r, entries);
3129 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
3130 IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
3131 off, pd->hdr.tcp), r->os_fingerprint)))
3132 r = TAILQ_NEXT(r, entries);
3133 else {
3134 if (r->tag)
3135 tag = r->tag;
3136 if (r->rtableid >= 0)
3137 rtableid = r->rtableid;
3138 if (r->anchor == NULL) {
3139 rm = r;
3140 } else
3141 pf_step_into_anchor(&asd, &ruleset, rs_num,
3142 &r, NULL, NULL);
3143 }
3144 if (r == NULL)
3145 pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
3146 NULL, NULL);
3147 }
3148 if (pf_tag_packet(m, tag, rtableid))
3149 return (NULL);
3150 if (rm != NULL && (rm->action == PF_NONAT ||
3151 rm->action == PF_NORDR || rm->action == PF_NOBINAT))
3152 return (NULL);
3153 return (rm);
3154}
3155
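/*
 * Find the translation rule, if any, that applies to the packet:
 * binat then nat for outbound packets, rdr then binat for inbound.
 * On a match, set up the state keys and compute the translated
 * address/port into *nkp.
 */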
3156struct pf_rule *
3157pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
3158 struct pfi_kif *kif, struct pf_src_node **sn,
3159 struct pf_state_key **skw, struct pf_state_key **sks,
3160 struct pf_state_key **skp, struct pf_state_key **nkp,
3161 struct pf_addr *saddr, struct pf_addr *daddr,
3162 u_int16_t sport, u_int16_t dport)
3163{
3164 struct pf_rule *r = NULL;
3165
3166 if (direction == PF_OUT) {
3167 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3168 sport, daddr, dport, PF_RULESET_BINAT);
3169 if (r == NULL)
3170 r = pf_match_translation(pd, m, off, direction, kif,
3171 saddr, sport, daddr, dport, PF_RULESET_NAT);
3172 } else {
3173 r = pf_match_translation(pd, m, off, direction, kif, saddr,
3174 sport, daddr, dport, PF_RULESET_RDR);
3175 if (r == NULL)
3176 r = pf_match_translation(pd, m, off, direction, kif,
3177 saddr, sport, daddr, dport, PF_RULESET_BINAT);
3178 }
3179
3180 if (r != NULL) {
3181 struct pf_addr *naddr;
3182 u_int16_t *nport;
3183
3184 if (pf_state_key_setup(pd, r, skw, sks, skp, nkp,
3185 saddr, daddr, sport, dport))
3186 return r;
3187
3188 /* XXX We only modify one side for now. */
3189 naddr = &(*nkp)->addr[1];
3190 nport = &(*nkp)->port[1];
3191
3192 /*
3193 * NOTE: Currently all translations will clear
3194 * BRIDGE_MBUF_TAGGED, telling the bridge to
3195 * ignore the original input encapsulation.
3196 */
3197 switch (r->action) {
3198 case PF_NONAT:
3199 case PF_NOBINAT:
3200 case PF_NORDR:
3201 return (NULL);
3202 case PF_NAT:
3203 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3204 if (pf_get_sport(pd, pd->af, pd->proto, r,
3205 saddr, daddr, sport, dport,
3206 naddr, nport, r->rpool.proxy_port[0],
3207 r->rpool.proxy_port[1], sn)) {
3208 DPFPRINTF(PF_DEBUG_MISC,
3209 ("pf: NAT proxy port allocation "
3210 "(%u-%u) failed\n",
3211 r->rpool.proxy_port[0],
3212 r->rpool.proxy_port[1]));
3213 return (NULL);
3214 }
3215 break;
3216 case PF_BINAT:
3217 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3218 switch (direction) {
3219 case PF_OUT:
3220 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
3221 switch (pd->af) {
3222#ifdef INET
3223 case AF_INET:
3224 if (r->rpool.cur->addr.p.dyn->
3225 pfid_acnt4 < 1)
3226 return (NULL);
3227 PF_POOLMASK(naddr,
3228 &r->rpool.cur->addr.p.dyn->
3229 pfid_addr4,
3230 &r->rpool.cur->addr.p.dyn->
3231 pfid_mask4,
3232 saddr, AF_INET);
3233 break;
3234#endif /* INET */
3235#ifdef INET6
3236 case AF_INET6:
3237 if (r->rpool.cur->addr.p.dyn->
3238 pfid_acnt6 < 1)
3239 return (NULL);
3240 PF_POOLMASK(naddr,
3241 &r->rpool.cur->addr.p.dyn->
3242 pfid_addr6,
3243 &r->rpool.cur->addr.p.dyn->
3244 pfid_mask6,
3245 saddr, AF_INET6);
3246 break;
3247#endif /* INET6 */
3248 }
3249 } else
3250 PF_POOLMASK(naddr,
3251 &r->rpool.cur->addr.v.a.addr,
3252 &r->rpool.cur->addr.v.a.mask,
3253 saddr, pd->af);
3254 break;
3255 case PF_IN:
3256 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
3257 switch (pd->af) {
3258#ifdef INET
3259 case AF_INET:
3260 if (r->src.addr.p.dyn->
3261 pfid_acnt4 < 1)
3262 return (NULL);
3263 PF_POOLMASK(naddr,
3264 &r->src.addr.p.dyn->
3265 pfid_addr4,
3266 &r->src.addr.p.dyn->
3267 pfid_mask4,
3268 daddr, AF_INET);
3269 break;
3270#endif /* INET */
3271#ifdef INET6
3272 case AF_INET6:
3273 if (r->src.addr.p.dyn->
3274 pfid_acnt6 < 1)
3275 return (NULL);
3276 PF_POOLMASK(naddr,
3277 &r->src.addr.p.dyn->
3278 pfid_addr6,
3279 &r->src.addr.p.dyn->
3280 pfid_mask6,
3281 daddr, AF_INET6);
3282 break;
3283#endif /* INET6 */
3284 }
3285 } else
3286 PF_POOLMASK(naddr,
3287 &r->src.addr.v.a.addr,
3288 &r->src.addr.v.a.mask, daddr,
3289 pd->af);
3290 break;
3291 }
3292 break;
3293 case PF_RDR: {
3294 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
3295 if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
3296 return (NULL);
3297 if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
3298 PF_POOL_BITMASK)
3299 PF_POOLMASK(naddr, naddr,
3300 &r->rpool.cur->addr.v.a.mask, daddr,
3301 pd->af);
3302
3303 if (r->rpool.proxy_port[1]) {
3304 u_int32_t tmp_nport;
3305
3306 tmp_nport = ((ntohs(dport) -
3307 ntohs(r->dst.port[0])) %
3308 (r->rpool.proxy_port[1] -
3309 r->rpool.proxy_port[0] + 1)) +
3310 r->rpool.proxy_port[0];
3311
3312 /* wrap around if necessary */
3313 if (tmp_nport > 65535)
3314 tmp_nport -= 65535;
3315 *nport = htons((u_int16_t)tmp_nport);
3316 } else if (r->rpool.proxy_port[0]) {
3317 *nport = htons(r->rpool.proxy_port[0]);
3318 }
3319 pd->not_cpu_localized = 1;
3320 break;
3321 }
3322 default:
3323 return (NULL);
3324 }
3325 }
3326
3327 return (r);
3328}
3329
3330struct netmsg_hashlookup {
3331 struct netmsg_base base;
3332 struct inpcb **nm_pinp;
3333 struct inpcbinfo *nm_pcbinfo;
3334 struct pf_addr *nm_saddr;
3335 struct pf_addr *nm_daddr;
3336 uint16_t nm_sport;
3337 uint16_t nm_dport;
3338 sa_family_t nm_af;
3339};
3340
3341#ifdef PF_SOCKET_LOOKUP_DOMSG
3342static void
3343in_pcblookup_hash_handler(netmsg_t msg)
3344{
3345 struct netmsg_hashlookup *rmsg = (struct netmsg_hashlookup *)msg;
3346
3347 if (rmsg->nm_af == AF_INET)
3348 *rmsg->nm_pinp = in_pcblookup_hash(rmsg->nm_pcbinfo,
3349 rmsg->nm_saddr->v4, rmsg->nm_sport, rmsg->nm_daddr->v4,
3350 rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
3351#ifdef INET6
3352 else
3353 *rmsg->nm_pinp = in6_pcblookup_hash(rmsg->nm_pcbinfo,
3354 &rmsg->nm_saddr->v6, rmsg->nm_sport, &rmsg->nm_daddr->v6,
3355 rmsg->nm_dport, INPLOOKUP_WILDCARD, NULL);
3356#endif /* INET6 */
3357 lwkt_replymsg(&rmsg->base.lmsg, 0);
3358}
3359#endif /* PF_SOCKET_LOOKUP_DOMSG */
3360
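/*
 * Find the local socket the packet belongs to and record its uid/gid
 * in pd->lookup.  Returns 1 on success and -1 when no matching pcb is
 * found or the lookup cannot be performed on this cpu.
 */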
3361int
3362pf_socket_lookup(int direction, struct pf_pdesc *pd)
3363{
3364 struct pf_addr *saddr, *daddr;
3365 u_int16_t sport, dport;
3366 struct inpcbinfo *pi;
3367 struct inpcb *inp;
3368 struct netmsg_hashlookup *msg = NULL;
3369#ifdef PF_SOCKET_LOOKUP_DOMSG
3370 struct netmsg_hashlookup msg0;
3371#endif
3372 int pi_cpu = 0;
3373
3374 if (pd == NULL)
3375 return (-1);
3376 pd->lookup.uid = UID_MAX;
3377 pd->lookup.gid = GID_MAX;
3378 pd->lookup.pid = NO_PID;
3379 if (direction == PF_IN) {
3380 saddr = pd->src;
3381 daddr = pd->dst;
3382 } else {
3383 saddr = pd->dst;
3384 daddr = pd->src;
3385 }
3386 switch (pd->proto) {
3387 case IPPROTO_TCP:
3388 if (pd->hdr.tcp == NULL)
3389 return (-1);
3390 sport = pd->hdr.tcp->th_sport;
3391 dport = pd->hdr.tcp->th_dport;
3392
3393 pi_cpu = tcp_addrcpu(saddr->v4.s_addr, sport, daddr->v4.s_addr, dport);
3394 pi = &tcbinfo[pi_cpu];
3395 /*
3396 * Our netstack runs lockless on MP systems
3397 * (only for TCP connections at the moment).
3398 *
3399 * As we are not allowed to read another CPU's tcbinfo,
3400 * we have to ask that CPU via remote call to search the
3401 * table for us.
3402 *
3403 * Prepare a msg iff data belongs to another CPU.
3404 */
3405 if (pi_cpu != mycpu->gd_cpuid) {
3406#ifdef PF_SOCKET_LOOKUP_DOMSG
3407 /*
3408 * NOTE:
3409 *
3410			 * The following lwkt_domsg() is dangerous and could
3411 * lockup the network system, e.g.
3412 *
3413 * On 2 CPU system:
3414 * netisr0 domsg to netisr1 (due to lookup)
3415 * netisr1 domsg to netisr0 (due to lookup)
3416 *
3417 * We simply return -1 here, since we are probably
3418 * called before NAT, so the TCP packet should
3419 * already be on the correct CPU.
3420 */
3421 msg = &msg0;
3422 netmsg_init(&msg->base, NULL, &curthread->td_msgport,
3423 0, in_pcblookup_hash_handler);
3424 msg->nm_pinp = &inp;
3425 msg->nm_pcbinfo = pi;
3426 msg->nm_saddr = saddr;
3427 msg->nm_sport = sport;
3428 msg->nm_daddr = daddr;
3429 msg->nm_dport = dport;
3430 msg->nm_af = pd->af;
3431#else /* !PF_SOCKET_LOOKUP_DOMSG */
3432 kprintf("pf_socket_lookup: tcp packet not on the "
3433 "correct cpu %d, cur cpu %d\n",
3434 pi_cpu, mycpuid);
3435 print_backtrace(-1);
3436 return -1;
3437#endif /* PF_SOCKET_LOOKUP_DOMSG */
3438 }
3439 break;
3440 case IPPROTO_UDP:
3441 if (pd->hdr.udp == NULL)
3442 return (-1);
3443 sport = pd->hdr.udp->uh_sport;
3444 dport = pd->hdr.udp->uh_dport;
3445 pi = &udbinfo[mycpuid];
3446 break;
3447 default:
3448 return (-1);
3449 }
3450 if (direction != PF_IN) {
3451 u_int16_t p;
3452
3453 p = sport;
3454 sport = dport;
3455 dport = p;
3456 }
3457 switch (pd->af) {
3458#ifdef INET6
3459 case AF_INET6:
3460 /*
3461 * Query other CPU, second part
3462 *
3463 * msg only gets initialized when:
3464 * 1) packet is TCP
3465 * 2) the info belongs to another CPU
3466 *
3467 * Use some switch/case magic to avoid code duplication.
3468 */
3469 if (msg == NULL) {
3470 inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
3471 &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
3472
3473 if (inp == NULL)
3474 return (-1);
3475 break;
3476 }
3477 /* FALLTHROUGH if SMP and on other CPU */
3478#endif /* INET6 */
3479 case AF_INET:
3480 if (msg != NULL) {
3481 lwkt_domsg(netisr_cpuport(pi_cpu),
3482 &msg->base.lmsg, 0);
3483		} else {
3485 inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
3486 dport, INPLOOKUP_WILDCARD, NULL);
3487 }
3488 if (inp == NULL)
3489 return (-1);
3490 break;
3491
3492 default:
3493 return (-1);
3494 }
3495 pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
3496 pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
3497 return (1);
3498}
3499
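/*
 * Walk the TCP options and return the window scale factor, if present,
 * with PF_WSCALE_FLAG set to distinguish "scale 0" from "no option".
 */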
3500u_int8_t
3501pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3502{
3503 int hlen;
3504 u_int8_t hdr[60];
3505 u_int8_t *opt, optlen;
3506 u_int8_t wscale = 0;
3507
3508 hlen = th_off << 2; /* hlen <= sizeof(hdr) */
3509 if (hlen <= sizeof(struct tcphdr))
3510 return (0);
3511 if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3512 return (0);
3513 opt = hdr + sizeof(struct tcphdr);
3514 hlen -= sizeof(struct tcphdr);
3515 while (hlen >= 3) {
3516 switch (*opt) {
3517 case TCPOPT_EOL:
3518 case TCPOPT_NOP:
3519 ++opt;
3520 --hlen;
3521 break;
3522 case TCPOPT_WINDOW:
3523 wscale = opt[2];
3524 if (wscale > TCP_MAX_WINSHIFT)
3525 wscale = TCP_MAX_WINSHIFT;
3526 wscale |= PF_WSCALE_FLAG;
3527 /* FALLTHROUGH */
3528 default:
3529 optlen = opt[1];
3530 if (optlen < 2)
3531 optlen = 2;
3532 hlen -= optlen;
3533 opt += optlen;
3534 break;
3535 }
3536 }
3537 return (wscale);
3538}
3539
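/*
 * Walk the TCP options and return the advertised MSS, falling back to
 * tcp_mssdflt when the option is absent.
 */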
3540u_int16_t
3541pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3542{
3543 int hlen;
3544 u_int8_t hdr[60];
3545 u_int8_t *opt, optlen;
3546 u_int16_t mss = tcp_mssdflt;
3547
3548 hlen = th_off << 2; /* hlen <= sizeof(hdr) */
3549 if (hlen <= sizeof(struct tcphdr))
3550 return (0);
3551 if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3552 return (0);
3553 opt = hdr + sizeof(struct tcphdr);
3554 hlen -= sizeof(struct tcphdr);
3555 while (hlen >= TCPOLEN_MAXSEG) {
3556 switch (*opt) {
3557 case TCPOPT_EOL:
3558 case TCPOPT_NOP:
3559 ++opt;
3560 --hlen;
3561 break;
3562 case TCPOPT_MAXSEG:
3563 bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
3564 /* FALLTHROUGH */
3565 default:
3566 optlen = opt[1];
3567 if (optlen < 2)
3568 optlen = 2;
3569 hlen -= optlen;
3570 opt += optlen;
3571 break;
3572 }
3573 }
3574 return (mss);
3575}
3576
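/*
 * Compute a usable MSS for a destination: look up the route, take the
 * interface MTU minus IP and TCP headers, and clamp the result to the
 * peer's offer (but never below 64).
 */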
3577u_int16_t
3578pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
3579{
3580#ifdef INET
3581 struct sockaddr_in *dst;
3582 struct route ro;
3583#endif /* INET */
3584#ifdef INET6
3585 struct sockaddr_in6 *dst6;
3586 struct route_in6 ro6;
3587#endif /* INET6 */
3588 struct rtentry *rt = NULL;
3589 int hlen = 0;
3590 u_int16_t mss = tcp_mssdflt;
3591
3592 switch (af) {
3593#ifdef INET
3594 case AF_INET:
3595 hlen = sizeof(struct ip);
3596 bzero(&ro, sizeof(ro));
3597 dst = (struct sockaddr_in *)&ro.ro_dst;
3598 dst->sin_family = AF_INET;
3599 dst->sin_len = sizeof(*dst);
3600 dst->sin_addr = addr->v4;
3601 rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING));
3602 rt = ro.ro_rt;
3603 break;
3604#endif /* INET */
3605#ifdef INET6
3606 case AF_INET6:
3607 hlen = sizeof(struct ip6_hdr);
3608 bzero(&ro6, sizeof(ro6));
3609 dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
3610 dst6->sin6_family = AF_INET6;
3611 dst6->sin6_len = sizeof(*dst6);
3612 dst6->sin6_addr = addr->v6;
3613 rtalloc_ign((struct route *)&ro6, (RTF_CLONING | RTF_PRCLONING));
3614 rt = ro6.ro_rt;
3615 break;
3616#endif /* INET6 */
3617 }
3618
3619 if (rt && rt->rt_ifp) {
3620 mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
3621 mss = max(tcp_mssdflt, mss);
3622 RTFREE(rt);
3623 }
3624 mss = min(mss, offer);
3625 mss = max(mss, 64); /* sanity - at least max opt space */
3626 return (mss);
3627}
3628
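/*
 * For rules with a route-to style target (other than fastroute),
 * resolve the routing address for the state and remember the
 * outgoing kif.
 */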
3629void
3630pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
3631{
3632 struct pf_rule *r = s->rule.ptr;
3633
3634 s->rt_kif = NULL;
3635 if (!r->rt || r->rt == PF_FASTROUTE)
3636 return;
3637 switch (s->key[PF_SK_WIRE]->af) {
3638#ifdef INET
3639 case AF_INET:
3640 pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL,
3641 &s->nat_src_node);
3642 s->rt_kif = r->rpool.cur->kif;
3643 break;
3644#endif /* INET */
3645#ifdef INET6
3646 case AF_INET6:
3647 pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL,
3648 &s->nat_src_node);
3649 s->rt_kif = r->rpool.cur->kif;
3650 break;
3651#endif /* INET6 */
3652 }
3653}
3654
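/*
 * Generate an initial sequence number in the spirit of RFC 6528:
 * MD5 over a lazily initialized boot-time secret and the connection
 * 4-tuple, plus the original sequence number and a stepping offset.
 */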
3655u_int32_t
3656pf_tcp_iss(struct pf_pdesc *pd)
3657{
3658 MD5_CTX ctx;
3659 u_int32_t digest[4];
3660
3661 if (pf_tcp_secret_init == 0) {
3662 lwkt_gettoken(&pf_gtoken);
3663 if (pf_tcp_secret_init == 0) {
3664 karc4random_buf(pf_tcp_secret, sizeof(pf_tcp_secret));
3665 MD5Init(&pf_tcp_secret_ctx);
3666 MD5Update(&pf_tcp_secret_ctx, pf_tcp_secret,
3667 sizeof(pf_tcp_secret));
3668 pf_tcp_secret_init = 1;
3669 }
3670 lwkt_reltoken(&pf_gtoken);
3671 }
3672 ctx = pf_tcp_secret_ctx;
3673
3674 MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3675 MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3676 if (pd->af == AF_INET6) {
3677 MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3678 MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3679 } else {
3680 MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3681 MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3682 }
3683 MD5Final((u_char *)digest, &ctx);
3684 pf_tcp_iss_off += 4096;
3685
3686 return (digest[0] + pd->hdr.tcp->th_seq + pf_tcp_iss_off);
3687}
3688
3689int
3690pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3691 struct pfi_kif *kif, struct mbuf *m, int off, void *h,
3692 struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
3693 struct ifqueue *ifq, struct inpcb *inp)
3694{
3695 struct pf_rule *nr = NULL;
3696 struct pf_addr *saddr = pd->src, *daddr = pd->dst;
3697 sa_family_t af = pd->af;
3698 struct pf_rule *r, *a = NULL;
3699 struct pf_ruleset *ruleset = NULL;
3700 struct pf_src_node *nsn = NULL;
3701 struct tcphdr *th = pd->hdr.tcp;
3702 struct pf_state_key *skw = NULL, *sks = NULL;
3703 struct pf_state_key *sk = NULL, *nk = NULL;
3704 u_short reason;
3705 int rewrite = 0, hdrlen = 0;
3706 int tag = -1, rtableid = -1;
3707 int asd = 0;
3708 int match = 0;
3709 int state_icmp = 0;
3710 u_int16_t sport = 0, dport = 0;
3711 u_int16_t bproto_sum = 0, bip_sum = 0;
3712 u_int8_t icmptype = 0, icmpcode = 0;
3713
3714
3715 if (direction == PF_IN && pf_check_congestion(ifq)) {
3716 REASON_SET(&reason, PFRES_CONGEST);
3717 return (PF_DROP);
3718 }
3719
3720 if (inp != NULL)
3721 pd->lookup.done = pf_socket_lookup(direction, pd);
3722 else if (debug_pfugidhack) {
3723 DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
3724 pd->lookup.done = pf_socket_lookup(direction, pd);
3725 }
3726
3727 switch (pd->proto) {
3728 case IPPROTO_TCP:
3729 sport = th->th_sport;
3730 dport = th->th_dport;
3731 hdrlen = sizeof(*th);
3732 break;
3733 case IPPROTO_UDP:
3734 sport = pd->hdr.udp->uh_sport;
3735 dport = pd->hdr.udp->uh_dport;
3736 hdrlen = sizeof(*pd->hdr.udp);
3737 break;
3738#ifdef INET
3739 case IPPROTO_ICMP:
3740 if (pd->af != AF_INET)
3741 break;
3742 sport = dport = pd->hdr.icmp->icmp_id;
3743 hdrlen = sizeof(*pd->hdr.icmp);
3744 icmptype = pd->hdr.icmp->icmp_type;
3745 icmpcode = pd->hdr.icmp->icmp_code;
3746
3747 if (icmptype == ICMP_UNREACH ||
3748 icmptype == ICMP_SOURCEQUENCH ||
3749 icmptype == ICMP_REDIRECT ||
3750 icmptype == ICMP_TIMXCEED ||
3751 icmptype == ICMP_PARAMPROB)
3752 state_icmp++;
3753 break;
3754#endif /* INET */
3755#ifdef INET6
3756 case IPPROTO_ICMPV6:
3757 if (af != AF_INET6)
3758 break;
3759 sport = dport = pd->hdr.icmp6->icmp6_id;
3760 hdrlen = sizeof(*pd->hdr.icmp6);
3761 icmptype = pd->hdr.icmp6->icmp6_type;
3762 icmpcode = pd->hdr.icmp6->icmp6_code;
3763
3764 if (icmptype == ICMP6_DST_UNREACH ||
3765 icmptype == ICMP6_PACKET_TOO_BIG ||
3766 icmptype == ICMP6_TIME_EXCEEDED ||
3767 icmptype == ICMP6_PARAM_PROB)
3768 state_icmp++;
3769 break;
3770#endif /* INET6 */
3771 default:
3772 sport = dport = hdrlen = 0;
3773 break;
3774 }
3775
3776 r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3777
3778 /* check packet for BINAT/NAT/RDR */
3779 if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn,
3780 &skw, &sks, &sk, &nk, saddr, daddr, sport, dport)) != NULL) {
3781 if (nk == NULL || sk == NULL) {
3782 REASON_SET(&reason, PFRES_MEMORY);
3783 goto cleanup;
3784 }
3785
3786 if (pd->ip_sum)
3787 bip_sum = *pd->ip_sum;
3788
3789 m->m_flags &= ~M_HASH;
3790 switch (pd->proto) {
3791 case IPPROTO_TCP:
3792 bproto_sum = th->th_sum;
3793 pd->proto_sum = &th->th_sum;
3794
3795 if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3796 nk->port[pd->sidx] != sport) {
3797 pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3798 &th->th_sum, &nk->addr[pd->sidx],
3799 nk->port[pd->sidx], 0, af);
3800 pd->sport = &th->th_sport;
3801 sport = th->th_sport;
3802 }
3803
3804 if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3805 nk->port[pd->didx] != dport) {
3806 pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3807 &th->th_sum, &nk->addr[pd->didx],
3808 nk->port[pd->didx], 0, af);
3809 dport = th->th_dport;
3810 pd->dport = &th->th_dport;
3811 }
3812 rewrite++;
3813 break;
3814 case IPPROTO_UDP:
3815 bproto_sum = pd->hdr.udp->uh_sum;
3816 pd->proto_sum = &pd->hdr.udp->uh_sum;
3817
3818 if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3819 nk->port[pd->sidx] != sport) {
3820 pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3821 pd->ip_sum, &pd->hdr.udp->uh_sum,
3822 &nk->addr[pd->sidx],
3823 nk->port[pd->sidx], 1, af);
3824 sport = pd->hdr.udp->uh_sport;
3825 pd->sport = &pd->hdr.udp->uh_sport;
3826 }
3827
3828 if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3829 nk->port[pd->didx] != dport) {
3830 pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3831 pd->ip_sum, &pd->hdr.udp->uh_sum,
3832 &nk->addr[pd->didx],
3833 nk->port[pd->didx], 1, af);
3834 dport = pd->hdr.udp->uh_dport;
3835 pd->dport = &pd->hdr.udp->uh_dport;
3836 }
3837 rewrite++;
3838 break;
3839#ifdef INET
3840 case IPPROTO_ICMP:
3841 nk->port[0] = nk->port[1];
3842 if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3843 pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3844 nk->addr[pd->sidx].v4.s_addr, 0);
3845
3846 if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3847 pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3848 nk->addr[pd->didx].v4.s_addr, 0);
3849
3850 if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3851 pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3852 pd->hdr.icmp->icmp_cksum, sport,
3853 nk->port[1], 0);
3854 pd->hdr.icmp->icmp_id = nk->port[1];
3855 pd->sport = &pd->hdr.icmp->icmp_id;
3856 }
3857 m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
3858 break;
3859#endif /* INET */
3860#ifdef INET6
3861 case IPPROTO_ICMPV6:
3862 nk->port[0] = nk->port[1];
3863 if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3864 pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3865 &nk->addr[pd->sidx], 0);
3866
3867 if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3868 pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3869 &nk->addr[pd->didx], 0);
3870 rewrite++;
3871 break;
3872#endif /* INET6 */
3873 default:
3874 switch (af) {
3875#ifdef INET
3876 case AF_INET:
3877 if (PF_ANEQ(saddr,
3878 &nk->addr[pd->sidx], AF_INET))
3879 pf_change_a(&saddr->v4.s_addr,
3880 pd->ip_sum,
3881 nk->addr[pd->sidx].v4.s_addr, 0);
3882
3883 if (PF_ANEQ(daddr,
3884 &nk->addr[pd->didx], AF_INET))
3885 pf_change_a(&daddr->v4.s_addr,
3886 pd->ip_sum,
3887 nk->addr[pd->didx].v4.s_addr, 0);
3888 break;
3889#endif /* INET */
3890#ifdef INET6
3891 case AF_INET6:
3892 if (PF_ANEQ(saddr,
3893 &nk->addr[pd->sidx], AF_INET6))
3894 PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3895
3896 if (PF_ANEQ(daddr,
3897 &nk->addr[pd->didx], AF_INET6))
3898					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3899 break;
3900#endif /* INET6 */
3901 }
3902 break;
3903 }
3904 if (nr->natpass)
3905 r = NULL;
3906 pd->nat_rule = nr;
3907 }
3908
3909 while (r != NULL) {
3910 r->evaluations++;
3911 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3912 r = r->skip[PF_SKIP_IFP].ptr;
3913 else if (r->direction && r->direction != direction)
3914 r = r->skip[PF_SKIP_DIR].ptr;
3915 else if (r->af && r->af != af)
3916 r = r->skip[PF_SKIP_AF].ptr;
3917 else if (r->proto && r->proto != pd->proto)
3918 r = r->skip[PF_SKIP_PROTO].ptr;
3919 else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3920 r->src.neg, kif))
3921 r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3922 /* tcp/udp only. port_op always 0 in other cases */
3923 else if (r->src.port_op && !pf_match_port(r->src.port_op,
3924 r->src.port[0], r->src.port[1], sport))
3925 r = r->skip[PF_SKIP_SRC_PORT].ptr;
3926 else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3927 r->dst.neg, NULL))
3928 r = r->skip[PF_SKIP_DST_ADDR].ptr;
3929 /* tcp/udp only. port_op always 0 in other cases */
3930 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3931 r->dst.port[0], r->dst.port[1], dport))
3932 r = r->skip[PF_SKIP_DST_PORT].ptr;
3933 /* icmp only. type always 0 in other cases */
3934 else if (r->type && r->type != icmptype + 1)
3935 r = TAILQ_NEXT(r, entries);
3936 /* icmp only. type always 0 in other cases */
3937 else if (r->code && r->code != icmpcode + 1)
3938 r = TAILQ_NEXT(r, entries);
3939 else if (r->tos && !(r->tos == pd->tos))
3940 r = TAILQ_NEXT(r, entries);
3941 else if (r->rule_flag & PFRULE_FRAGMENT)
3942 r = TAILQ_NEXT(r, entries);
3943 else if (pd->proto == IPPROTO_TCP &&
3944 (r->flagset & th->th_flags) != r->flags)
3945 r = TAILQ_NEXT(r, entries);
3946 /* tcp/udp only. uid.op always 0 in other cases */
3947 else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3948 pf_socket_lookup(direction, pd), 1)) &&
3949 !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3950 pd->lookup.uid))
3951 r = TAILQ_NEXT(r, entries);
3952 /* tcp/udp only. gid.op always 0 in other cases */
3953 else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3954 pf_socket_lookup(direction, pd), 1)) &&
3955 !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3956 pd->lookup.gid))
3957 r = TAILQ_NEXT(r, entries);
3958 else if (r->prob &&
3959 r->prob <= karc4random())
3960 r = TAILQ_NEXT(r, entries);
3961 else if (r->match_tag && !pf_match_tag(m, r, &tag))
3962 r = TAILQ_NEXT(r, entries);
3963 else if (r->os_fingerprint != PF_OSFP_ANY &&
3964 (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3965 pf_osfp_fingerprint(pd, m, off, th),
3966 r->os_fingerprint)))
3967 r = TAILQ_NEXT(r, entries);
3968 else {
3969 if (r->tag)
3970 tag = r->tag;
3971 if (r->rtableid >= 0)
3972 rtableid = r->rtableid;
3973 if (r->anchor == NULL) {
3974 match = 1;
3975 *rm = r;
3976 *am = a;
3977 *rsm = ruleset;
3978 if ((*rm)->quick)
3979 break;
3980 r = TAILQ_NEXT(r, entries);
3981 } else
3982 pf_step_into_anchor(&asd, &ruleset,
3983 PF_RULESET_FILTER, &r, &a, &match);
3984 }
3985 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3986 PF_RULESET_FILTER, &r, &a, &match))
3987 break;
3988 }
3989 r = *rm;
3990 a = *am;
3991 ruleset = *rsm;
3992
3993 REASON_SET(&reason, PFRES_MATCH);
3994
3995 if (r->log || (nr != NULL && nr->log)) {
3996 if (rewrite)
3997 m_copyback(m, off, hdrlen, pd->hdr.any);
3998 PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
3999 a, ruleset, pd);
4000 }
4001
4002 if ((r->action == PF_DROP) &&
4003 ((r->rule_flag & PFRULE_RETURNRST) ||
4004 (r->rule_flag & PFRULE_RETURNICMP) ||
4005 (r->rule_flag & PFRULE_RETURN))) {
4006 /* undo NAT changes, if they have taken place */
4007 if (nr != NULL) {
4008 PF_ACPY(saddr, &sk->addr[pd->sidx], af);
4009 PF_ACPY(daddr, &sk->addr[pd->didx], af);
4010 if (pd->sport)
4011 *pd->sport = sk->port[pd->sidx];
4012 if (pd->dport)
4013 *pd->dport = sk->port[pd->didx];
4014 if (pd->proto_sum)
4015 *pd->proto_sum = bproto_sum;
4016 if (pd->ip_sum)
4017 *pd->ip_sum = bip_sum;
4018 m_copyback(m, off, hdrlen, pd->hdr.any);
4019 }
4020 if (pd->proto == IPPROTO_TCP &&
4021 ((r->rule_flag & PFRULE_RETURNRST) ||
4022 (r->rule_flag & PFRULE_RETURN)) &&
4023 !(th->th_flags & TH_RST)) {
4024 u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
4025 int len = 0;
4026 struct ip *h4;
4027#ifdef INET6
4028 struct ip6_hdr *h6;
4029#endif
4030 switch (af) {
4031 case AF_INET:
4032 h4 = mtod(m, struct ip *);
4033 len = ntohs(h4->ip_len) - off;
4034 break;
4035#ifdef INET6
4036 case AF_INET6:
4037 h6 = mtod(m, struct ip6_hdr *);
4038 len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
4039 break;
4040#endif
4041 }
4042
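/*
 * Verify the TCP checksum before answering with a RST so that a
 * packet with a corrupt or forged checksum cannot trick us into
 * answering on behalf of the connection.
 */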
4043 if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
4044 REASON_SET(&reason, PFRES_PROTCKSUM);
4045 else {
4046 if (th->th_flags & TH_SYN)
4047 ack++;
4048 if (th->th_flags & TH_FIN)
4049 ack++;
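/*
 * Build the RST from the offending packet: its ACK becomes
 * our sequence number and we ACK everything it sent (the
 * increments above account for SYN and FIN each consuming
 * one sequence number).
 */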
4050 pf_send_tcp(r, af, pd->dst,
4051 pd->src, th->th_dport, th->th_sport,
4052 ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
4053 r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
4054 }
4055 } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
4056 r->return_icmp)
4057 pf_send_icmp(m, r->return_icmp >> 8,
4058 r->return_icmp & 255, af, r);
4059 else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
4060 r->return_icmp6)
4061 pf_send_icmp(m, r->return_icmp6 >> 8,
4062 r->return_icmp6 & 255, af, r);
4063 }
4064
4065 if (r->action == PF_DROP)
4066 goto cleanup;
4067
4068 if (pf_tag_packet(m, tag, rtableid)) {
4069 REASON_SET(&reason, PFRES_MEMORY);
4070 goto cleanup;
4071 }
4072
4073 if (!state_icmp && (r->keep_state || nr != NULL ||
4074 (pd->flags & PFDESC_TCP_NORM))) {
4075 int action;
4076 action = pf_create_state(r, nr, a, pd, nsn, skw, sks, nk, sk, m,
4077 off, sport, dport, &rewrite, kif, sm, tag, bproto_sum,
4078 bip_sum, hdrlen);
4079 if (action != PF_PASS)
4080 return (action);
4081 }
4082
4083 /* copy back packet headers if we performed NAT operations */
4084 if (rewrite)
4085 m_copyback(m, off, hdrlen, pd->hdr.any);
4086
4087 return (PF_PASS);
4088
4089cleanup:
4090 if (sk != NULL)
4091 kfree(sk, M_PFSTATEKEYPL);
4092 if (nk != NULL)
4093 kfree(nk, M_PFSTATEKEYPL);
4094 return (PF_DROP);
4095}
4096
4097static __inline int
4098pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
4099 struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *skw,
4100 struct pf_state_key *sks, struct pf_state_key *nk, struct pf_state_key *sk,
4101 struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite,
4102 struct pfi_kif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum,
4103 u_int16_t bip_sum, int hdrlen)
4104{
4105 struct pf_state *s = NULL;
4106 struct pf_src_node *sn = NULL;
4107 struct tcphdr *th = pd->hdr.tcp;
4108 u_int16_t mss = tcp_mssdflt;
4109 u_short reason;
4110 int cpu = mycpu->gd_cpuid;
4111
4112 /* check maximums */
4113 if (r->max_states && (r->states_cur >= r->max_states)) {
4114 PF_INC_LCOUNTER(LCNT_STATES);
4115 REASON_SET(&reason, PFRES_MAXSTATES);
4116 return (PF_DROP);
4117 }
4118 /* src node for filter rule */
4119 if ((r->rule_flag & PFRULE_SRCTRACK ||
4120 r->rpool.opts & PF_POOL_STICKYADDR) &&
4121 pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
4122 REASON_SET(&reason, PFRES_SRCLIMIT);
4123 goto csfailed;
4124 }
4125 /* src node for translation rule */
4126 if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
4127 pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
4128 REASON_SET(&reason, PFRES_SRCLIMIT);
4129 goto csfailed;
4130 }
4131 s = kmalloc(sizeof(struct pf_state), M_PFSTATEPL, M_NOWAIT|M_ZERO);
4132 if (s == NULL) {
4133 REASON_SET(&reason, PFRES_MEMORY);
4134 goto csfailed;
4135 }
4136 lockinit(&s->lk, "pfstlk", 0, 0);
4137 s->id = 0; /* XXX Do we really need that? not in OpenBSD */
4138 s->creatorid = 0;
4139 s->rule.ptr = r;
4140 s->nat_rule.ptr = nr;
4141 s->anchor.ptr = a;
4142 s->state_flags = PFSTATE_CREATEINPROG;
4143 STATE_INC_COUNTERS(s);
4144 if (r->allow_opts)
4145 s->state_flags |= PFSTATE_ALLOWOPTS;
4146 if (r->rule_flag & PFRULE_STATESLOPPY)
4147 s->state_flags |= PFSTATE_SLOPPY;
4148 if (pd->not_cpu_localized)
4149 s->state_flags |= PFSTATE_STACK_GLOBAL;
4150
4151 s->log = r->log & PF_LOG_ALL;
4152 if (nr != NULL)
4153 s->log |= nr->log & PF_LOG_ALL;
4154 switch (pd->proto) {
4155 case IPPROTO_TCP:
4156 s->src.seqlo = ntohl(th->th_seq);
4157 s->src.seqhi = s->src.seqlo + pd->p_len + 1;
4158 if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
4159 r->keep_state == PF_STATE_MODULATE) {
4160 /* Generate sequence number modulator */
4161 if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
4162 0)
4163 s->src.seqdiff = 1;
4164 pf_change_a(&th->th_seq, &th->th_sum,
4165 htonl(s->src.seqlo + s->src.seqdiff), 0);
4166 *rewrite = 1;
4167 } else
4168 s->src.seqdiff = 0;
4169 if (th->th_flags & TH_SYN) {
4170 s->src.seqhi++;
4171 s->src.wscale = pf_get_wscale(m, off,
4172 th->th_off, pd->af);
4173 }
4174 s->src.max_win = MAX(ntohs(th->th_win), 1);
4175 if (s->src.wscale & PF_WSCALE_MASK) {
4176 /* Remove scale factor from initial window */
4177 int win = s->src.max_win;
4178 win += 1 << (s->src.wscale & PF_WSCALE_MASK);
4179 s->src.max_win = (win - 1) >>
4180 (s->src.wscale & PF_WSCALE_MASK);
4181 }
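/*
 * The above computes ceil(win / 2^wscale); rounding up ensures
 * the unscaled max_win never understates what the peer may
 * legitimately send.
 */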
4182 if (th->th_flags & TH_FIN)
4183 s->src.seqhi++;
4184 s->dst.seqhi = 1;
4185 s->dst.max_win = 1;
4186 s->src.state = TCPS_SYN_SENT;
4187 s->dst.state = TCPS_CLOSED;
4188 s->timeout = PFTM_TCP_FIRST_PACKET;
4189 break;
4190 case IPPROTO_UDP:
4191 s->src.state = PFUDPS_SINGLE;
4192 s->dst.state = PFUDPS_NO_TRAFFIC;
4193 s->timeout = PFTM_UDP_FIRST_PACKET;
4194 break;
4195 case IPPROTO_ICMP:
4196#ifdef INET6
4197 case IPPROTO_ICMPV6:
4198#endif
4199 s->timeout = PFTM_ICMP_FIRST_PACKET;
4200 break;
4201 default:
4202 s->src.state = PFOTHERS_SINGLE;
4203 s->dst.state = PFOTHERS_NO_TRAFFIC;
4204 s->timeout = PFTM_OTHER_FIRST_PACKET;
4205 }
4206
4207 s->creation = time_second;
4208 s->expire = time_second;
4209
4210 if (sn != NULL) {
4211 s->src_node = sn;
4212 s->src_node->states++;
4213 }
4214 if (nsn != NULL) {
4215 /* XXX We only modify one side for now. */
4216 PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
4217 s->nat_src_node = nsn;
4218 s->nat_src_node->states++;
4219 }
4220 if (pd->proto == IPPROTO_TCP) {
4221 if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
4222 off, pd, th, &s->src, &s->dst)) {
4223 REASON_SET(&reason, PFRES_MEMORY);
4224 pf_src_tree_remove_state(s);
4225 STATE_DEC_COUNTERS(s);
4226 kfree(s, M_PFSTATEPL);
4227 return (PF_DROP);
4228 }
4229 if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
4230 pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
4231 &s->src, &s->dst, rewrite)) {
4232 /* This really shouldn't happen!!! */
4233 DPFPRINTF(PF_DEBUG_URGENT,
4234 ("pf_normalize_tcp_stateful failed on first pkt"));
4235 pf_normalize_tcp_cleanup(s);
4236 pf_src_tree_remove_state(s);
4237 STATE_DEC_COUNTERS(s);
4238 kfree(s, M_PFSTATEPL);
4239 return (PF_DROP);
4240 }
4241 }
4242 s->direction = pd->dir;
4243
4244 if (sk == NULL && pf_state_key_setup(pd, nr, &skw, &sks, &sk, &nk,
4245 pd->src, pd->dst, sport, dport)) {
4246 REASON_SET(&reason, PFRES_MEMORY);
4247 goto csfailed;
4248 }
4249
4250 if (pf_state_insert(BOUND_IFACE(r, kif), skw, sks, s)) {
4251 if (pd->proto == IPPROTO_TCP)
4252 pf_normalize_tcp_cleanup(s);
4253 REASON_SET(&reason, PFRES_STATEINS);
4254 pf_src_tree_remove_state(s);
4255 STATE_DEC_COUNTERS(s);
4256 kfree(s, M_PFSTATEPL);
4257 return (PF_DROP);
4258 } else
4259 *sm = s;
4260
4261 pf_set_rt_ifp(s, pd->src); /* needs s->state_key set */
4262 if (tag > 0) {
4263 pf_tag_ref(tag);
4264 s->tag = tag;
4265 }
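/*
 * SYN proxy: answer the client's SYN ourselves (below) and drop
 * the packet.  The real destination sees nothing until the client
 * completes the handshake; pf_test_state_tcp() then replays the
 * SYN to the server.
 */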
4266 if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
4267 TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
4268 s->src.state = PF_TCPS_PROXY_SRC;
4269 /* undo NAT changes, if they have taken place */
4270 if (nr != NULL) {
4271 struct pf_state_key *skt = s->key[PF_SK_WIRE];
4272 if (pd->dir == PF_OUT)
4273 skt = s->key[PF_SK_STACK];
4274 PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
4275 PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
4276 if (pd->sport)
4277 *pd->sport = skt->port[pd->sidx];
4278 if (pd->dport)
4279 *pd->dport = skt->port[pd->didx];
4280 if (pd->proto_sum)
4281 *pd->proto_sum = bproto_sum;
4282 if (pd->ip_sum)
4283 *pd->ip_sum = bip_sum;
4284 m->m_flags &= ~M_HASH;
4285 m_copyback(m, off, hdrlen, pd->hdr.any);
4286 }
4287 s->src.seqhi = htonl(karc4random());
4288 /* Find mss option */
4289 mss = pf_get_mss(m, off, th->th_off, pd->af);
4290 mss = pf_calc_mss(pd->src, pd->af, mss);
4291 mss = pf_calc_mss(pd->dst, pd->af, mss);
4292 s->src.mss = mss;
4293 s->state_flags &= ~PFSTATE_CREATEINPROG;
4294 pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
4295 th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
4296 TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
4297 REASON_SET(&reason, PFRES_SYNPROXY);
4298 return (PF_SYNPROXY_DROP);
4299 }
4300
4301 s->state_flags &= ~PFSTATE_CREATEINPROG;
4302 return (PF_PASS);
4303
4304csfailed:
4305 if (sk != NULL)
4306 kfree(sk, M_PFSTATEKEYPL);
4307 if (nk != NULL)
4308 kfree(nk, M_PFSTATEKEYPL);
4309
4310 if (sn != NULL && sn->states == 0 && sn->expire == 0) {
4311 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], sn);
4312 PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
4313 atomic_add_int(&pf_status.src_nodes, -1);
4314 kfree(sn, M_PFSRCTREEPL);
4315 }
4316 if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
4317 RB_REMOVE(pf_src_tree, &tree_src_tracking[cpu], nsn);
4318 PF_INC_SCOUNTER(SCNT_SRC_NODE_REMOVALS);
4319 atomic_add_int(&pf_status.src_nodes, -1);
4320 kfree(nsn, M_PFSRCTREEPL);
4321 }
4322 if (s) {
4323 pf_src_tree_remove_state(s);
4324 STATE_DEC_COUNTERS(s);
4325 kfree(s, M_PFSTATEPL);
4326 }
4327
4328 return (PF_DROP);
4329}
4330
4331int
4332pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
4333 struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
4334 struct pf_ruleset **rsm)
4335{
4336 struct pf_rule *r, *a = NULL;
4337 struct pf_ruleset *ruleset = NULL;
4338 sa_family_t af = pd->af;
4339 u_short reason;
4340 int tag = -1;
4341 int asd = 0;
4342 int match = 0;
4343
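/*
 * Non-first fragments carry no transport header, so port, TCP
 * flag, ICMP type/code and OS fingerprint criteria can never be
 * evaluated here; rules depending on them are stepped over below.
 */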
4344 r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
4345 while (r != NULL) {
4346 r->evaluations++;
4347 if (pfi_kif_match(r->kif, kif) == r->ifnot)
4348 r = r->skip[PF_SKIP_IFP].ptr;
4349 else if (r->direction && r->direction != direction)
4350 r = r->skip[PF_SKIP_DIR].ptr;
4351 else if (r->af && r->af != af)
4352 r = r->skip[PF_SKIP_AF].ptr;
4353 else if (r->proto && r->proto != pd->proto)
4354 r = r->skip[PF_SKIP_PROTO].ptr;
4355 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
4356 r->src.neg, kif))
4357 r = r->skip[PF_SKIP_SRC_ADDR].ptr;
4358 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
4359 r->dst.neg, NULL))
4360 r = r->skip[PF_SKIP_DST_ADDR].ptr;
4361 else if (r->tos && r->tos != pd->tos)
4362 r = TAILQ_NEXT(r, entries);
4363 else if (r->os_fingerprint != PF_OSFP_ANY)
4364 r = TAILQ_NEXT(r, entries);
4365 else if (pd->proto == IPPROTO_UDP &&
4366 (r->src.port_op || r->dst.port_op))
4367 r = TAILQ_NEXT(r, entries);
4368 else if (pd->proto == IPPROTO_TCP &&
4369 (r->src.port_op || r->dst.port_op || r->flagset))
4370 r = TAILQ_NEXT(r, entries);
4371 else if ((pd->proto == IPPROTO_ICMP ||
4372 pd->proto == IPPROTO_ICMPV6) &&
4373 (r->type || r->code))
4374 r = TAILQ_NEXT(r, entries);
4375 else if (r->prob && r->prob <= karc4random())
4376 r = TAILQ_NEXT(r, entries);
4377 else if (r->match_tag && !pf_match_tag(m, r, &tag))
4378 r = TAILQ_NEXT(r, entries);
4379 else {
4380 if (r->anchor == NULL) {
4381 match = 1;
4382 *rm = r;
4383 *am = a;
4384 *rsm = ruleset;
4385 if ((*rm)->quick)
4386 break;
4387 r = TAILQ_NEXT(r, entries);
4388 } else
4389 pf_step_into_anchor(&asd, &ruleset,
4390 PF_RULESET_FILTER, &r, &a, &match);
4391 }
4392 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
4393 PF_RULESET_FILTER, &r, &a, &match))
4394 break;
4395 }
4396 r = *rm;
4397 a = *am;
4398 ruleset = *rsm;
4399
4400 REASON_SET(&reason, PFRES_MATCH);
4401
4402 if (r->log)
4403 PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset,
4404 pd);
4405
4406 if (r->action != PF_PASS)
4407 return (PF_DROP);
4408
4409 if (pf_tag_packet(m, tag, -1)) {
4410 REASON_SET(&reason, PFRES_MEMORY);
4411 return (PF_DROP);
4412 }
4413
4414 return (PF_PASS);
4415}
4416
4417/*
4418 * Called with state locked
4419 */
4420int
4421pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
4422 struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
4423 struct pf_pdesc *pd, u_short *reason, int *copyback)
4424{
4425 struct tcphdr *th = pd->hdr.tcp;
4426 u_int16_t win = ntohs(th->th_win);
4427 u_int32_t ack, end, seq, orig_seq;
4428 u_int8_t sws, dws;
4429 int ackskew;
4430
4431 if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
4432 sws = src->wscale & PF_WSCALE_MASK;
4433 dws = dst->wscale & PF_WSCALE_MASK;
4434 } else {
4435 sws = dws = 0;
4436 }
4437
4438 /*
4439 * Sequence tracking algorithm from Guido van Rooij's paper:
4440 * http://www.madison-gurkha.com/publications/tcp_filtering/
4441 * tcp_filtering.ps
4442 */
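/*
 * For each peer we track seqlo (the highest sequence number it
 * has sent), seqhi (the upper edge of what its peer will accept)
 * and max_win; a packet passes only if its sequence range and
 * ACK value fall within these tracked windows.
 */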
4443
4444 orig_seq = seq = ntohl(th->th_seq);
4445 if (src->seqlo == 0) {
4446 /* First packet from this end. Set its state */
4447
4448 if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
4449 src->scrub == NULL) {
4450 if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
4451 REASON_SET(reason, PFRES_MEMORY);
4452 return (PF_DROP);
4453 }
4454 }
4455
4456 /* Deferred generation of sequence number modulator */
4457 if (dst->seqdiff && !src->seqdiff) {
4458 /* use random iss for the TCP server */
4459 while ((src->seqdiff = karc4random() - seq) == 0)
4460 ;
4461 ack = ntohl(th->th_ack) - dst->seqdiff;
4462 pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4463 src->seqdiff), 0);
4464 pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4465 *copyback = 1;
4466 } else {
4467 ack = ntohl(th->th_ack);
4468 }
4469
4470 end = seq + pd->p_len;
4471 if (th->th_flags & TH_SYN) {
4472 end++;
4473 (*state)->sync_flags |= PFSTATE_GOT_SYN2;
4474 if (dst->wscale & PF_WSCALE_FLAG) {
4475 src->wscale = pf_get_wscale(m, off, th->th_off,
4476 pd->af);
4477 if (src->wscale & PF_WSCALE_FLAG) {
4478 /* Remove scale factor from initial
4479 * window */
4480 sws = src->wscale & PF_WSCALE_MASK;
4481 win = ((u_int32_t)win + (1 << sws) - 1)
4482 >> sws;
4483 dws = dst->wscale & PF_WSCALE_MASK;
4484 } else {
4485 /* fixup other window */
4486 dst->max_win <<= dst->wscale &
4487 PF_WSCALE_MASK;
4488 /* in case of a retrans SYN|ACK */
4489 dst->wscale = 0;
4490 }
4491 }
4492 }
4493 if (th->th_flags & TH_FIN)
4494 end++;
4495
4496 src->seqlo = seq;
4497 if (src->state < TCPS_SYN_SENT)
4498 src->state = TCPS_SYN_SENT;
4499
4500 /*
4501 * May need to slide the window (seqhi may have been set by
4502 * the crappy stack check or if we picked up the connection
4503 * after establishment)
4504 */
4505 if (src->seqhi == 1 ||
4506 SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
4507 src->seqhi = end + MAX(1, dst->max_win << dws);
4508 if (win > src->max_win)
4509 src->max_win = win;
4510
4511 } else {
4512 ack = ntohl(th->th_ack) - dst->seqdiff;
4513 if (src->seqdiff) {
4514 /* Modulate sequence numbers */
4515 pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
4516 src->seqdiff), 0);
4517 pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
4518 *copyback = 1;
4519 }
4520 end = seq + pd->p_len;
4521 if (th->th_flags & TH_SYN)
4522 end++;
4523 if (th->th_flags & TH_FIN)
4524 end++;
4525 }
4526
4527 if ((th->th_flags & TH_ACK) == 0) {
4528 /* Let it pass through the ack skew check */
4529 ack = dst->seqlo;
4530 } else if ((ack == 0 &&
4531 (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
4532 /* broken tcp stacks do not set ack */
4533 (dst->state < TCPS_SYN_SENT)) {
4534 /*
4535 * Many stacks (ours included) will set the ACK number in a
4536 * FIN|ACK if the SYN times out -- no sequence to ACK.
4537 */
4538 ack = dst->seqlo;
4539 }
4540
4541 if (seq == end) {
4542 /* Ease sequencing restrictions on no data packets */
4543 seq = src->seqlo;
4544 end = seq;
4545 }
4546
4547 ackskew = dst->seqlo - ack;
4548
4549
4550 /*
4551 * Need to demodulate the sequence numbers in any TCP SACK options
4552 * (Selective ACK). We could optionally validate the SACK values
4553 * against the current ACK window, either forwards or backwards, but
4554 * I'm not confident that SACK has been implemented properly
4555 * everywhere. It wouldn't surprise me if several stacks accidentally
4556 * SACK too far backwards of previously ACKed data. There really aren't
4557 * any security implications of bad SACKing unless the target stack
4558 * doesn't validate the option length correctly. Someone trying to
4559 * spoof into a TCP connection won't bother blindly sending SACK
4560 * options anyway.
4561 */
4562 if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
4563 if (pf_modulate_sack(m, off, pd, th, dst))
4564 *copyback = 1;
4565 }
4566
4567
4568#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
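/*
 * The six checks below (four strict, two loose) correspond to the
 * digits 1-6 printed by the "State failure" diagnostic further down.
 */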
4569 if (SEQ_GEQ(src->seqhi, end) &&
4570 /* Last octet inside other's window space */
4571 SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
4572 /* Retrans: not more than one window back */
4573 (ackskew >= -MAXACKWINDOW) &&
4574 /* Acking not more than one reassembled fragment backwards */
4575 (ackskew <= (MAXACKWINDOW << sws)) &&
4576 /* Acking not more than one window forward */
4577 ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
4578 (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo)))
4579 {
4580 /*
4581 * Require an exact/+1 sequence match on resets
4582 * when possible
4583 */
4584 if (dst->scrub || src->scrub) {
4585 if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4586 *state, src, dst, copyback))
4587 return (PF_DROP);
4588 }
4589
4590 /* update max window */
4591 if (src->max_win < win)
4592 src->max_win = win;
4593 /* synchronize sequencing */
4594 if (SEQ_GT(end, src->seqlo))
4595 src->seqlo = end;
4596 /* slide the window of what the other end can send */
4597 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4598 dst->seqhi = ack + MAX((win << sws), 1);
4599
4600
4601 /* update states */
4602 if (th->th_flags & TH_SYN)
4603 if (src->state < TCPS_SYN_SENT)
4604 src->state = TCPS_SYN_SENT;
4605 if (th->th_flags & TH_FIN)
4606 if (src->state < TCPS_CLOSING)
4607 src->state = TCPS_CLOSING;
4608 if (th->th_flags & TH_ACK) {
4609 if (dst->state == TCPS_SYN_SENT) {
4610 dst->state = TCPS_ESTABLISHED;
4611 if (src->state == TCPS_ESTABLISHED &&
4612 (*state)->src_node != NULL &&
4613 pf_src_connlimit(*state)) {
4614 REASON_SET(reason, PFRES_SRCLIMIT);
4615 return (PF_DROP);
4616 }
4617 } else if (dst->state == TCPS_CLOSING)
4618 dst->state = TCPS_FIN_WAIT_2;
4619 }
4620 if (th->th_flags & TH_RST)
4621 src->state = dst->state = TCPS_TIME_WAIT;
4622
4623 /* update expire time */
4624 (*state)->expire = time_second;
4625 if (src->state >= TCPS_FIN_WAIT_2 &&
4626 dst->state >= TCPS_FIN_WAIT_2)
4627 (*state)->timeout = PFTM_TCP_CLOSED;
4628 else if (src->state >= TCPS_CLOSING &&
4629 dst->state >= TCPS_CLOSING)
4630 (*state)->timeout = PFTM_TCP_FIN_WAIT;
4631 else if (src->state < TCPS_ESTABLISHED ||
4632 dst->state < TCPS_ESTABLISHED)
4633 (*state)->timeout = PFTM_TCP_OPENING;
4634 else if (src->state >= TCPS_CLOSING ||
4635 dst->state >= TCPS_CLOSING)
4636 (*state)->timeout = PFTM_TCP_CLOSING;
4637 else if ((th->th_flags & TH_SYN) &&
4638 ((*state)->state_flags & PFSTATE_SLOPPY))
4639 (*state)->timeout = PFTM_TCP_FIRST_PACKET;
4640 else
4641 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4642
4643 /* Fall through to PASS packet */
4644
4645 } else if ((dst->state < TCPS_SYN_SENT ||
4646 dst->state >= TCPS_FIN_WAIT_2 ||
4647 src->state >= TCPS_FIN_WAIT_2) &&
4648 SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
4649 /* Within a window forward of the originating packet */
4650 SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
4651 /* Within a window backward of the originating packet */
4652
4653 /*
4654 * This currently handles three situations:
4655 * 1) Stupid stacks will shotgun SYNs before their peer
4656 * replies.
4657 * 2) When PF catches an already established stream (the
4658 * firewall rebooted, the state table was flushed, routes
4659 * changed...)
4660 * 3) Packets get funky immediately after the connection
4661 * closes (this should catch Solaris spurious ACK|FINs
4662 * that web servers like to spew after a close)
4663 *
4664 * This must be a little more careful than the above code
4665 * since packet floods will also be caught here. We don't
4666 * update the TTL here to mitigate the damage of a packet
4667 * flood and so the same code can handle awkward establishment
4668 * and a loosened connection close.
4669 * In the establishment case, a correct peer response will
4670 * validate the connection, go through the normal state code
4671 * and keep updating the state TTL.
4672 */
4673
4674 if (pf_status.debug >= PF_DEBUG_MISC) {
4675 kprintf("pf: loose state match: ");
4676 pf_print_state(*state);
4677 pf_print_flags(th->th_flags);
4678 kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4679 "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len,
4680 ackskew, (unsigned long long)(*state)->packets[0],
4681 (unsigned long long)(*state)->packets[1],
4682 pd->dir == PF_IN ? "in" : "out",
4683 pd->dir == (*state)->direction ? "fwd" : "rev");
4684 }
4685
4686 if (dst->scrub || src->scrub) {
4687 if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4688 *state, src, dst, copyback))
4689 return (PF_DROP);
4690 }
4691
4692 /* update max window */
4693 if (src->max_win < win)
4694 src->max_win = win;
4695 /* synchronize sequencing */
4696 if (SEQ_GT(end, src->seqlo))
4697 src->seqlo = end;
4698 /* slide the window of what the other end can send */
4699 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4700 dst->seqhi = ack + MAX((win << sws), 1);
4701
4702 /*
4703 * Cannot set dst->seqhi here since this could be a shotgunned
4704 * SYN and not an already established connection.
4705 */
4706
4707 if (th->th_flags & TH_FIN)
4708 if (src->state < TCPS_CLOSING)
4709 src->state = TCPS_CLOSING;
4710 if (th->th_flags & TH_RST)
4711 src->state = dst->state = TCPS_TIME_WAIT;
4712
4713 /* Fall through to PASS packet */
4714
4715 } else if ((*state)->pickup_mode == PF_PICKUPS_HASHONLY ||
4716 ((*state)->pickup_mode == PF_PICKUPS_ENABLED &&
4717 ((*state)->sync_flags & PFSTATE_GOT_SYN_MASK) !=
4718 PFSTATE_GOT_SYN_MASK)) {
4719 /*
4720 * If pickup mode is hash only, do not fail on sequence checks.
4721 *
4722 * If pickup mode is enabled and we did not see the SYN in
4723 * both directions, do not fail on sequence checks because
4724 * we do not have complete information on window scale.
4725 *
4726 * Adjust expiration and fall through to PASS packet.
4727 * XXX Add a FIN check to reduce timeout?
4728 */
4729 (*state)->expire = time_second;
4730 } else {
4731 /*
4732 * Failure processing
4733 */
4734 if ((*state)->dst.state == TCPS_SYN_SENT &&
4735 (*state)->src.state == TCPS_SYN_SENT) {
4736 /* Send RST for state mismatches during handshake */
4737 if (!(th->th_flags & TH_RST))
4738 pf_send_tcp((*state)->rule.ptr, pd->af,
4739 pd->dst, pd->src, th->th_dport,
4740 th->th_sport, ntohl(th->th_ack), 0,
4741 TH_RST, 0, 0,
4742 (*state)->rule.ptr->return_ttl, 1, 0,
4743 pd->eh, kif->pfik_ifp);
4744 src->seqlo = 0;
4745 src->seqhi = 1;
4746 src->max_win = 1;
4747 } else if (pf_status.debug >= PF_DEBUG_MISC) {
4748 kprintf("pf: BAD state: ");
4749 pf_print_state(*state);
4750 pf_print_flags(th->th_flags);
4751 kprintf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4752 "pkts=%llu:%llu dir=%s,%s\n",
4753 seq, orig_seq, ack, pd->p_len, ackskew,
4754 (unsigned long long)(*state)->packets[0],
4755 (unsigned long long)(*state)->packets[1],
4756 pd->dir == PF_IN ? "in" : "out",
4757 pd->dir == (*state)->direction ? "fwd" : "rev");
4758 kprintf("pf: State failure on: %c %c %c %c | %c %c\n",
4759 SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4760 SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4761 ' ': '2',
4762 (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4763 (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4764 SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4765 SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4766 }
4767 REASON_SET(reason, PFRES_BADSTATE);
4768 return (PF_DROP);
4769 }
4770
4771 return (PF_PASS);
4772}
4773
4774/*
4775 * Called with state locked
4776 */
4777int
4778pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4779 struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4780{
4781 struct tcphdr *th = pd->hdr.tcp;
4782
4783 if (th->th_flags & TH_SYN)
4784 if (src->state < TCPS_SYN_SENT)
4785 src->state = TCPS_SYN_SENT;
4786 if (th->th_flags & TH_FIN)
4787 if (src->state < TCPS_CLOSING)
4788 src->state = TCPS_CLOSING;
4789 if (th->th_flags & TH_ACK) {
4790 if (dst->state == TCPS_SYN_SENT) {
4791 dst->state = TCPS_ESTABLISHED;
4792 if (src->state == TCPS_ESTABLISHED &&
4793 (*state)->src_node != NULL &&
4794 pf_src_connlimit(*state)) {
4795 REASON_SET(reason, PFRES_SRCLIMIT);
4796 return (PF_DROP);
4797 }
4798 } else if (dst->state == TCPS_CLOSING) {
4799 dst->state = TCPS_FIN_WAIT_2;
4800 } else if (src->state == TCPS_SYN_SENT &&
4801 dst->state < TCPS_SYN_SENT) {
4802 /*
4803 * Handle a special sloppy case where we only see one
4804 * half of the connection. If there is an ACK after
4805 * the initial SYN without ever seeing a packet from
4806 * the destination, set the connection to established.
4807 */
4808 dst->state = src->state = TCPS_ESTABLISHED;
4809 if ((*state)->src_node != NULL &&
4810 pf_src_connlimit(*state)) {
4811 REASON_SET(reason, PFRES_SRCLIMIT);
4812 return (PF_DROP);
4813 }
4814 } else if (src->state == TCPS_CLOSING &&
4815 dst->state == TCPS_ESTABLISHED &&
4816 dst->seqlo == 0) {
4817 /*
4818 * Handle the closing of half connections where we
4819 * don't see the full bidirectional FIN/ACK+ACK
4820 * handshake.
4821 */
4822 dst->state = TCPS_CLOSING;
4823 }
4824 }
4825 if (th->th_flags & TH_RST)
4826 src->state = dst->state = TCPS_TIME_WAIT;
4827
4828 /* update expire time */
4829 (*state)->expire = time_second;
4830 if (src->state >= TCPS_FIN_WAIT_2 &&
4831 dst->state >= TCPS_FIN_WAIT_2)
4832 (*state)->timeout = PFTM_TCP_CLOSED;
4833 else if (src->state >= TCPS_CLOSING &&
4834 dst->state >= TCPS_CLOSING)
4835 (*state)->timeout = PFTM_TCP_FIN_WAIT;
4836 else if (src->state < TCPS_ESTABLISHED ||
4837 dst->state < TCPS_ESTABLISHED)
4838 (*state)->timeout = PFTM_TCP_OPENING;
4839 else if (src->state >= TCPS_CLOSING ||
4840 dst->state >= TCPS_CLOSING)
4841 (*state)->timeout = PFTM_TCP_CLOSING;
4842 else if ((th->th_flags & TH_SYN) &&
4843 ((*state)->state_flags & PFSTATE_SLOPPY))
4844 (*state)->timeout = PFTM_TCP_FIRST_PACKET;
4845 else
4846 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4847
4848 return (PF_PASS);
4849}
4850
4851/*
4852 * Test TCP connection state. Caller must hold the state locked.
4853 */
4854int
4855pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4856 struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4857 u_short *reason)
4858{
4859 struct pf_state_key_cmp key;
4860 struct tcphdr *th = pd->hdr.tcp;
4861 int copyback = 0;
4862 int error;
4863 struct pf_state_peer *src, *dst;
4864 struct pf_state_key *sk;
4865
4866 bzero(&key, sizeof(key));
4867 key.af = pd->af;
4868 key.proto = IPPROTO_TCP;
4869 if (direction == PF_IN) { /* wire side, straight */
4870 PF_ACPY(&key.addr[0], pd->src, key.af);
4871 PF_ACPY(&key.addr[1], pd->dst, key.af);
4872 key.port[0] = th->th_sport;
4873 key.port[1] = th->th_dport;
4874 if (pf_status.debug >= PF_DEBUG_MISC) {
4875 kprintf("test-tcp IN (%08x:%d) -> (%08x:%d)\n",
4876 ntohl(key.addr[0].addr32[0]),
4877 ntohs(key.port[0]),
4878 ntohl(key.addr[1].addr32[0]),
4879 ntohs(key.port[1]));
4880 }
4881 } else { /* stack side, reverse */
4882 PF_ACPY(&key.addr[1], pd->src, key.af);
4883 PF_ACPY(&key.addr[0], pd->dst, key.af);
4884 key.port[1] = th->th_sport;
4885 key.port[0] = th->th_dport;
4886 if (pf_status.debug >= PF_DEBUG_MISC) {
4887 kprintf("test-tcp OUT (%08x:%d) <- (%08x:%d)\n",
4888 ntohl(key.addr[0].addr32[0]),
4889 ntohs(key.port[0]),
4890 ntohl(key.addr[1].addr32[0]),
4891 ntohs(key.port[1]));
4892 }
4893 }
4894
4895 STATE_LOOKUP(kif, &key, direction, *state, m);
4896 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
4897
4898 if (direction == (*state)->direction) {
4899 src = &(*state)->src;
4900 dst = &(*state)->dst;
4901 } else {
4902 src = &(*state)->dst;
4903 dst = &(*state)->src;
4904 }
4905
4906 sk = (*state)->key[pd->didx];
4907
4908 if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4909 if (direction != (*state)->direction) {
4910 REASON_SET(reason, PFRES_SYNPROXY);
4911 FAIL (PF_SYNPROXY_DROP);
4912 }
4913 if (th->th_flags & TH_SYN) {
4914 if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4915 REASON_SET(reason, PFRES_SYNPROXY);
4916 FAIL (PF_DROP);
4917 }
4918 pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4919 pd->src, th->th_dport, th->th_sport,
4920 (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4921 TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1,
4922 0, NULL, NULL);
4923 REASON_SET(reason, PFRES_SYNPROXY);
4924 FAIL (PF_SYNPROXY_DROP);
4925 } else if (!(th->th_flags & TH_ACK) ||
4926 (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4927 (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4928 REASON_SET(reason, PFRES_SYNPROXY);
4929 FAIL (PF_DROP);
4930 } else if ((*state)->src_node != NULL &&
4931 pf_src_connlimit(*state)) {
4932 REASON_SET(reason, PFRES_SRCLIMIT);
4933 FAIL (PF_DROP);
4934 } else
4935 (*state)->src.state = PF_TCPS_PROXY_DST;
4936 }
4937 if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4938 if (direction == (*state)->direction) {
4939 if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4940 (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4941 (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4942 REASON_SET(reason, PFRES_SYNPROXY);
4943 FAIL (PF_DROP);
4944 }
4945 (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4946 if ((*state)->dst.seqhi == 1)
4947 (*state)->dst.seqhi = htonl(karc4random());
4948 pf_send_tcp((*state)->rule.ptr, pd->af,
4949 &sk->addr[pd->sidx], &sk->addr[pd->didx],
4950 sk->port[pd->sidx], sk->port[pd->didx],
4951 (*state)->dst.seqhi, 0, TH_SYN, 0,
4952 (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL);
4953 REASON_SET(reason, PFRES_SYNPROXY);
4954 FAIL (PF_SYNPROXY_DROP);
4955 } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4956 (TH_SYN|TH_ACK)) ||
4957 (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4958 REASON_SET(reason, PFRES_SYNPROXY);
4959 FAIL (PF_DROP);
4960 } else {
4961 (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4962 (*state)->dst.seqlo = ntohl(th->th_seq);
4963 pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
4964 pd->src, th->th_dport, th->th_sport,
4965 ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4966 TH_ACK, (*state)->src.max_win, 0, 0, 0,
4967 (*state)->tag, NULL, NULL);
4968 pf_send_tcp((*state)->rule.ptr, pd->af,
4969 &sk->addr[pd->sidx], &sk->addr[pd->didx],
4970 sk->port[pd->sidx], sk->port[pd->didx],
4971 (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4972 TH_ACK, (*state)->dst.max_win, 0, 0, 1,
4973 0, NULL, NULL);
4974 (*state)->src.seqdiff = (*state)->dst.seqhi -
4975 (*state)->src.seqlo;
4976 (*state)->dst.seqdiff = (*state)->src.seqhi -
4977 (*state)->dst.seqlo;
4978 (*state)->src.seqhi = (*state)->src.seqlo +
4979 (*state)->dst.max_win;
4980 (*state)->dst.seqhi = (*state)->dst.seqlo +
4981 (*state)->src.max_win;
4982 (*state)->src.wscale = (*state)->dst.wscale = 0;
4983 (*state)->src.state = (*state)->dst.state =
4984 TCPS_ESTABLISHED;
4985 REASON_SET(reason, PFRES_SYNPROXY);
4986 FAIL (PF_SYNPROXY_DROP);
4987 }
4988 }
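/*
 * At this point a proxied handshake, if any, is complete:
 * the client's SYN was answered with SYN|ACK (seq src.seqhi),
 * its ACK triggered our SYN to the server (seq dst.seqhi), and
 * the server's SYN|ACK was ACKed on both sides, with src/dst
 * seqdiff recorded above to bridge the two half-connections.
 */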
4989
4990 /*
4991 * Check for connection (addr+port pair) reuse. We can't actually
4992 * unlink the state if we don't own it.
4993 */
4994 if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4995 dst->state >= TCPS_FIN_WAIT_2 &&
4996 src->state >= TCPS_FIN_WAIT_2) {
4997 if (pf_status.debug >= PF_DEBUG_MISC) {
4998 kprintf("pf: state reuse ");
4999 pf_print_state(*state);
5000 pf_print_flags(th->th_flags);
5001 kprintf("\n");
5002 }
5003 /* XXX make sure it's the same direction ?? */
5004 (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
5005 if ((*state)->cpuid == mycpu->gd_cpuid) {
5006 pf_unlink_state(*state);
5007 *state = NULL;
5008 } else {
5009 (*state)->timeout = PFTM_PURGE;
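/* The owning cpu's purge scan will reap it. */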
5010 }
5011 FAIL (PF_DROP);
5012 }
5013
5014 if ((*state)->state_flags & PFSTATE_SLOPPY) {
5015 if (pf_tcp_track_sloppy(src, dst, state, pd,
5016 reason) == PF_DROP) {
5017 FAIL (PF_DROP);
5018 }
5019 } else {
5020 if (pf_tcp_track_full(src, dst, state, kif, m, off, pd,
5021 reason, &copyback) == PF_DROP) {
5022 FAIL (PF_DROP);
5023 }
5024 }
5025
5026 /* translate source/destination address, if necessary */
5027 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5028 struct pf_state_key *nk = (*state)->key[pd->didx];
5029
5030 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5031 nk->port[pd->sidx] != th->th_sport) {
5032 /*
5033 * The translated source address may be completely
5034 * unrelated to the saved link header, make sure
5035 * a bridge doesn't try to use it.
5036 */
5037 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
5038 pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
5039 &th->th_sum, &nk->addr[pd->sidx],
5040 nk->port[pd->sidx], 0, pd->af);
5041 }
5042
5043 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5044 nk->port[pd->didx] != th->th_dport) {
5045 /*
5046 * If we don't redispatch the packet will go into
5047 * the protocol stack on the wrong cpu for the
5048 * post-translated address.
5049 */
5050 pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
5051 &th->th_sum, &nk->addr[pd->didx],
5052 nk->port[pd->didx], 0, pd->af);
5053 }
5054 copyback = 1;
5055 }
5056
5057 /* Copyback sequence modulation or stateful scrub changes if needed */
5058 if (copyback) {
5059 m->m_flags &= ~M_HASH;
5060 m_copyback(m, off, sizeof(*th), th);
5061 }
5062
5063 pfsync_update_state(*state);
5064 error = PF_PASS;
5065done:
5066 if (*state)
5067 lockmgr(&(*state)->lk, LK_RELEASE);
5068 return (error);
5069}
5070
5071/*
5072 * Test UDP connection state. Caller must hold the state locked.
5073 */
5074int
5075pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
5076 struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
5077{
5078 struct pf_state_peer *src, *dst;
5079 struct pf_state_key_cmp key;
5080 struct udphdr *uh = pd->hdr.udp;
5081
5082 bzero(&key, sizeof(key));
5083 key.af = pd->af;
5084 key.proto = IPPROTO_UDP;
5085 if (direction == PF_IN) { /* wire side, straight */
5086 PF_ACPY(&key.addr[0], pd->src, key.af);
5087 PF_ACPY(&key.addr[1], pd->dst, key.af);
5088 key.port[0] = uh->uh_sport;
5089 key.port[1] = uh->uh_dport;
5090 } else { /* stack side, reverse */
5091 PF_ACPY(&key.addr[1], pd->src, key.af);
5092 PF_ACPY(&key.addr[0], pd->dst, key.af);
5093 key.port[1] = uh->uh_sport;
5094 key.port[0] = uh->uh_dport;
5095 }
5096
5097 STATE_LOOKUP(kif, &key, direction, *state, m);
5098 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5099
5100 if (direction == (*state)->direction) {
5101 src = &(*state)->src;
5102 dst = &(*state)->dst;
5103 } else {
5104 src = &(*state)->dst;
5105 dst = &(*state)->src;
5106 }
5107
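/*
 * UDP has no handshake, so a "connection" is inferred from
 * traffic: a state starts out SINGLE and is promoted to
 * MULTIPLE once the other side answers, which also earns it
 * the longer timeout below.
 */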
5108 /* update states */
5109 if (src->state < PFUDPS_SINGLE)
5110 src->state = PFUDPS_SINGLE;
5111 if (dst->state == PFUDPS_SINGLE)
5112 dst->state = PFUDPS_MULTIPLE;
5113
5114 /* update expire time */
5115 (*state)->expire = time_second;
5116 if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
5117 (*state)->timeout = PFTM_UDP_MULTIPLE;
5118 else
5119 (*state)->timeout = PFTM_UDP_SINGLE;
5120
5121 /* translate source/destination address, if necessary */
5122 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5123 struct pf_state_key *nk = (*state)->key[pd->didx];
5124
5125 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
5126 nk->port[pd->sidx] != uh->uh_sport) {
5127 /*
5128 * The translated source address may be completely
5129 * unrelated to the saved link header, make sure
5130 * a bridge doesn't try to use it.
5131 */
5132 m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
5133 m->m_flags &= ~M_HASH;
5134 pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
5135 &uh->uh_sum, &nk->addr[pd->sidx],
5136 nk->port[pd->sidx], 1, pd->af);
5137 }
5138
5139 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
5140 nk->port[pd->didx] != uh->uh_dport) {
5141 /*
5142 * If we don't redispatch the packet will go into
5143 * the protocol stack on the wrong cpu for the
5144 * post-translated address.
5145 */
5146 m->m_flags &= ~M_HASH;
5147 pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
5148 &uh->uh_sum, &nk->addr[pd->didx],
5149 nk->port[pd->didx], 1, pd->af);
5150 }
5151 m_copyback(m, off, sizeof(*uh), uh);
5152 }
5153
5154 pfsync_update_state(*state);
5155 lockmgr(&(*state)->lk, LK_RELEASE);
5156 return (PF_PASS);
5157}
5158
5159/*
5160 * Test ICMP connection state. Caller must hold the state locked.
5161 */
5162int
5163pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
5164 struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
5165 u_short *reason)
5166{
5167 struct pf_addr *saddr = pd->src, *daddr = pd->dst;
5168 u_int16_t icmpid = 0, *icmpsum = NULL;
5169 u_int8_t icmptype = 0;
5170 int state_icmp = 0;
5171 int error;
5172 struct pf_state_key_cmp key;
5173
5174 bzero(&key, sizeof(key));
5175
5176 switch (pd->proto) {
5177#ifdef INET
5178 case IPPROTO_ICMP:
5179 icmptype = pd->hdr.icmp->icmp_type;
5180 icmpid = pd->hdr.icmp->icmp_id;
5181 icmpsum = &pd->hdr.icmp->icmp_cksum;
5182
5183 if (icmptype == ICMP_UNREACH ||
5184 icmptype == ICMP_SOURCEQUENCH ||
5185 icmptype == ICMP_REDIRECT ||
5186 icmptype == ICMP_TIMXCEED ||
5187 icmptype == ICMP_PARAMPROB)
5188 state_icmp++;
5189 break;
5190#endif /* INET */
5191#ifdef INET6
5192 case IPPROTO_ICMPV6:
5193 icmptype = pd->hdr.icmp6->icmp6_type;
5194 icmpid = pd->hdr.icmp6->icmp6_id;
5195 icmpsum = &pd->hdr.icmp6->icmp6_cksum;
5196
5197 if (icmptype == ICMP6_DST_UNREACH ||
5198 icmptype == ICMP6_PACKET_TOO_BIG ||
5199 icmptype == ICMP6_TIME_EXCEEDED ||
5200 icmptype == ICMP6_PARAM_PROB)
5201 state_icmp++;
5202 break;
5203#endif /* INET6 */
5204 }
5205
5206 if (!state_icmp) {
5207
5208 /*
5209 * ICMP query/reply message not related to a TCP/UDP packet.
5210 * Search for an ICMP state.
5211 */
5212 key.af = pd->af;
5213 key.proto = pd->proto;
5214 key.port[0] = key.port[1] = icmpid;
5215 if (direction == PF_IN) { /* wire side, straight */
5216 PF_ACPY(&key.addr[0], pd->src, key.af);
5217 PF_ACPY(&key.addr[1], pd->dst, key.af);
5218 } else { /* stack side, reverse */
5219 PF_ACPY(&key.addr[1], pd->src, key.af);
5220 PF_ACPY(&key.addr[0], pd->dst, key.af);
5221 }
5222
5223 STATE_LOOKUP(kif, &key, direction, *state, m);
5224 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5225
5226 (*state)->expire = time_second;
5227 (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
5228
5229 /* translate source/destination address, if necessary */
5230 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5231 struct pf_state_key *nk = (*state)->key[pd->didx];
5232
5233 switch (pd->af) {
5234#ifdef INET
5235 case AF_INET:
5236 if (PF_ANEQ(pd->src,
5237 &nk->addr[pd->sidx], AF_INET))
5238 pf_change_a(&saddr->v4.s_addr,
5239 pd->ip_sum,
5240 nk->addr[pd->sidx].v4.s_addr, 0);
5241
5242 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
5243 AF_INET))
5244 pf_change_a(&daddr->v4.s_addr,
5245 pd->ip_sum,
5246 nk->addr[pd->didx].v4.s_addr, 0);
5247
5248 if (nk->port[0] !=
5249 pd->hdr.icmp->icmp_id) {
5250 pd->hdr.icmp->icmp_cksum =
5251 pf_cksum_fixup(
5252 pd->hdr.icmp->icmp_cksum, icmpid,
5253 nk->port[pd->sidx], 0);
5254 pd->hdr.icmp->icmp_id =
5255 nk->port[pd->sidx];
5256 }
5257
5258 m->m_flags &= ~M_HASH;
5259 m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
5260 break;
5261#endif /* INET */
5262#ifdef INET6
5263 case AF_INET6:
5264 if (PF_ANEQ(pd->src,
5265 &nk->addr[pd->sidx], AF_INET6))
5266 pf_change_a6(saddr,
5267 &pd->hdr.icmp6->icmp6_cksum,
5268 &nk->addr[pd->sidx], 0);
5269
5270 if (PF_ANEQ(pd->dst,
5271 &nk->addr[pd->didx], AF_INET6))
5272 pf_change_a6(daddr,
5273 &pd->hdr.icmp6->icmp6_cksum,
5274 &nk->addr[pd->didx], 0);
5275
5276 m->m_flags &= ~M_HASH;
5277 m_copyback(m, off, sizeof(struct icmp6_hdr),
5278 pd->hdr.icmp6);
5279 break;
5280#endif /* INET6 */
5281 }
5282 }
5283 } else {
5284 /*
5285 * ICMP error message in response to a TCP/UDP packet.
5286 * Extract the inner TCP/UDP header and search for that state.
5287 */
5288
5289 struct pf_pdesc pd2;
5290#ifdef INET
5291 struct ip h2;
5292#endif /* INET */
5293#ifdef INET6
5294 struct ip6_hdr h2_6;
5295 int terminal = 0;
5296#endif /* INET6 */
5297 int ipoff2;
5298 int off2;
5299
5300 pd2.not_cpu_localized = 1;
5301 pd2.af = pd->af;
5302 /* Payload packet is from the opposite direction. */
5303 pd2.sidx = (direction == PF_IN) ? 1 : 0;
5304 pd2.didx = (direction == PF_IN) ? 0 : 1;
5305 switch (pd->af) {
5306#ifdef INET
5307 case AF_INET:
5308 /* offset of h2 in mbuf chain */
5309 ipoff2 = off + ICMP_MINLEN;
5310
5311 if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
5312 NULL, reason, pd2.af)) {
5313 DPFPRINTF(PF_DEBUG_MISC,
5314 ("pf: ICMP error message too short "
5315 "(ip)\n"));
5316 FAIL (PF_DROP);
5317 }
5318 /*
5319 * ICMP error messages don't refer to non-first
5320 * fragments
5321 */
5322 if (h2.ip_off & htons(IP_OFFMASK)) {
5323 REASON_SET(reason, PFRES_FRAG);
5324 FAIL (PF_DROP);
5325 }
5326
5327 /* offset of protocol header that follows h2 */
5328 off2 = ipoff2 + (h2.ip_hl << 2);
5329
5330 pd2.proto = h2.ip_p;
5331 pd2.src = (struct pf_addr *)&h2.ip_src;
5332 pd2.dst = (struct pf_addr *)&h2.ip_dst;
5333 pd2.ip_sum = &h2.ip_sum;
5334 break;
5335#endif /* INET */
5336#ifdef INET6
5337 case AF_INET6:
5338 ipoff2 = off + sizeof(struct icmp6_hdr);
5339
5340 if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
5341 NULL, reason, pd2.af)) {
5342 DPFPRINTF(PF_DEBUG_MISC,
5343 ("pf: ICMP error message too short "
5344 "(ip6)\n"));
5345 FAIL (PF_DROP);
5346 }
5347 pd2.proto = h2_6.ip6_nxt;
5348 pd2.src = (struct pf_addr *)&h2_6.ip6_src;
5349 pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
5350 pd2.ip_sum = NULL;
5351 off2 = ipoff2 + sizeof(h2_6);
5352 do {
5353 switch (pd2.proto) {
5354 case IPPROTO_FRAGMENT:
5355 /*
5356 * ICMPv6 error messages for
5357 * non-first fragments
5358 */
5359 REASON_SET(reason, PFRES_FRAG);
5360 FAIL (PF_DROP);
5361 case IPPROTO_AH:
5362 case IPPROTO_HOPOPTS:
5363 case IPPROTO_ROUTING:
5364 case IPPROTO_DSTOPTS: {
5365 /* get next header and header length */
5366 struct ip6_ext opt6;
5367
5368 if (!pf_pull_hdr(m, off2, &opt6,
5369 sizeof(opt6), NULL, reason,
5370 pd2.af)) {
5371 DPFPRINTF(PF_DEBUG_MISC,
5372 ("pf: ICMPv6 short opt\n"));
5373 FAIL (PF_DROP);
5374 }
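/*
 * AH encodes its payload length in 32-bit words minus two
 * (RFC 4302); the other extension headers count 8-octet
 * units not including the first eight octets (RFC 2460).
 */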
5375 if (pd2.proto == IPPROTO_AH)
5376 off2 += (opt6.ip6e_len + 2) * 4;
5377 else
5378 off2 += (opt6.ip6e_len + 1) * 8;
5379 pd2.proto = opt6.ip6e_nxt;
5380 /* go to the next header */
5381 break;
5382 }
5383 default:
5384 terminal++;
5385 break;
5386 }
5387 } while (!terminal);
5388 break;
5389#endif /* INET6 */
5390 default:
5391 DPFPRINTF(PF_DEBUG_MISC,
5392 ("pf: ICMP AF %d unknown (ip6)\n", pd->af));
5393 FAIL (PF_DROP);
5394 break;
5395 }
5396
5397 switch (pd2.proto) {
5398 case IPPROTO_TCP: {
5399 struct tcphdr th;
5400 u_int32_t seq;
5401 struct pf_state_peer *src, *dst;
5402 u_int8_t dws;
5403 int copyback = 0;
5404
5405 /*
5406 * Only the first 8 bytes of the TCP header can be
5407 * expected. Don't access any TCP header fields after
5408 * th_seq, an ackskew test is not possible.
5409 */
5410 if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
5411 pd2.af)) {
5412 DPFPRINTF(PF_DEBUG_MISC,
5413 ("pf: ICMP error message too short "
5414 "(tcp)\n"));
5415 FAIL (PF_DROP);
5416 }
5417
5418 key.af = pd2.af;
5419 key.proto = IPPROTO_TCP;
5420 PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5421 PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5422 key.port[pd2.sidx] = th.th_sport;
5423 key.port[pd2.didx] = th.th_dport;
5424
5425 STATE_LOOKUP(kif, &key, direction, *state, m);
5426 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5427
5428 if (direction == (*state)->direction) {
5429 src = &(*state)->dst;
5430 dst = &(*state)->src;
5431 } else {
5432 src = &(*state)->src;
5433 dst = &(*state)->dst;
5434 }
5435
5436 if (src->wscale && dst->wscale)
5437 dws = dst->wscale & PF_WSCALE_MASK;
5438 else
5439 dws = 0;
5440
5441 /* Demodulate sequence number */
5442 seq = ntohl(th.th_seq) - src->seqdiff;
5443 if (src->seqdiff) {
5444 pf_change_a(&th.th_seq, icmpsum,
5445 htonl(seq), 0);
5446 copyback = 1;
5447 }
5448
5449 if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
5450 (!SEQ_GEQ(src->seqhi, seq) ||
5451 !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
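/*
 * The quoted sequence number lies outside the window we have
 * tracked for this peer, so the ICMP error was most likely
 * forged (a blind ICMP attack against the connection).
 */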
5452 if (pf_status.debug >= PF_DEBUG_MISC) {
5453 kprintf("pf: BAD ICMP %d:%d ",
5454 icmptype, pd->hdr.icmp->icmp_code);
5455 pf_print_host(pd->src, 0, pd->af);
5456 kprintf(" -> ");
5457 pf_print_host(pd->dst, 0, pd->af);
5458 kprintf(" state: ");
5459 pf_print_state(*state);
5460 kprintf(" seq=%u\n", seq);
5461 }
5462 REASON_SET(reason, PFRES_BADSTATE);
5463 FAIL (PF_DROP);
5464 } else {
5465 if (pf_status.debug >= PF_DEBUG_MISC) {
5466 kprintf("pf: OK ICMP %d:%d ",
5467 icmptype, pd->hdr.icmp->icmp_code);
5468 pf_print_host(pd->src, 0, pd->af);
5469 kprintf(" -> ");
5470 pf_print_host(pd->dst, 0, pd->af);
5471 kprintf(" state: ");
5472 pf_print_state(*state);
5473 kprintf(" seq=%u\n", seq);
5474 }
5475 }
5476
5477 /* translate source/destination address, if necessary */
5478 if ((*state)->key[PF_SK_WIRE] !=
5479 (*state)->key[PF_SK_STACK]) {
5480 struct pf_state_key *nk =
5481 (*state)->key[pd->didx];
5482
5483 if (PF_ANEQ(pd2.src,
5484 &nk->addr[pd2.sidx], pd2.af) ||
5485 nk->port[pd2.sidx] != th.th_sport)
5486 pf_change_icmp(pd2.src, &th.th_sport,
5487 daddr, &nk->addr[pd2.sidx],
5488 nk->port[pd2.sidx], NULL,
5489 pd2.ip_sum, icmpsum,
5490 pd->ip_sum, 0, pd2.af);
5491
5492 if (PF_ANEQ(pd2.dst,
5493 &nk->addr[pd2.didx], pd2.af) ||
5494 nk->port[pd2.didx] != th.th_dport)
5495 pf_change_icmp(pd2.dst, &th.th_dport,
5496 NULL, /* XXX Inbound NAT? */
5497 &nk->addr[pd2.didx],
5498 nk->port[pd2.didx], NULL,
5499 pd2.ip_sum, icmpsum,
5500 pd->ip_sum, 0, pd2.af);
5501 copyback = 1;
5502 }
5503
5504 if (copyback) {
5505 switch (pd2.af) {
5506#ifdef INET
5507 case AF_INET:
5508 m_copyback(m, off, ICMP_MINLEN,
5509 pd->hdr.icmp);
5510 m_copyback(m, ipoff2, sizeof(h2),
5511 &h2);
5512 break;
5513#endif /* INET */
5514#ifdef INET6
5515 case AF_INET6:
5516 m_copyback(m, off,
5517 sizeof(struct icmp6_hdr),
5518 pd->hdr.icmp6);
5519 m_copyback(m, ipoff2, sizeof(h2_6),
5520 &h2_6);
5521 break;
5522#endif /* INET6 */
5523 }
5524 m->m_flags &= ~M_HASH;
5525 m_copyback(m, off2, 8, &th);
5526 }
5527 break;
5528 }
5529 case IPPROTO_UDP: {
5530 struct udphdr uh;
5531
5532 if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
5533 NULL, reason, pd2.af)) {
5534 DPFPRINTF(PF_DEBUG_MISC,
5535 ("pf: ICMP error message too short "
5536 "(udp)\n"));
5537 FAIL (PF_DROP);
5538 }
5539
5540 key.af = pd2.af;
5541 key.proto = IPPROTO_UDP;
5542 PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5543 PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5544 key.port[pd2.sidx] = uh.uh_sport;
5545 key.port[pd2.didx] = uh.uh_dport;
5546
5547 STATE_LOOKUP(kif, &key, direction, *state, m);
5548 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5549
5550 /* translate source/destination address, if necessary */
5551 if ((*state)->key[PF_SK_WIRE] !=
5552 (*state)->key[PF_SK_STACK]) {
5553 struct pf_state_key *nk =
5554 (*state)->key[pd->didx];
5555
5556 if (PF_ANEQ(pd2.src,
5557 &nk->addr[pd2.sidx], pd2.af) ||
5558 nk->port[pd2.sidx] != uh.uh_sport)
5559 pf_change_icmp(pd2.src, &uh.uh_sport,
5560 daddr, &nk->addr[pd2.sidx],
5561 nk->port[pd2.sidx], &uh.uh_sum,
5562 pd2.ip_sum, icmpsum,
5563 pd->ip_sum, 1, pd2.af);
5564
5565 if (PF_ANEQ(pd2.dst,
5566 &nk->addr[pd2.didx], pd2.af) ||
5567 nk->port[pd2.didx] != uh.uh_dport)
5568 pf_change_icmp(pd2.dst, &uh.uh_dport,
5569 NULL, /* XXX Inbound NAT? */
5570 &nk->addr[pd2.didx],
5571 nk->port[pd2.didx], &uh.uh_sum,
5572 pd2.ip_sum, icmpsum,
5573 pd->ip_sum, 1, pd2.af);
5574
5575 switch (pd2.af) {
5576#ifdef INET
5577 case AF_INET:
5578 m_copyback(m, off, ICMP_MINLEN,
5579 pd->hdr.icmp);
5580 m_copyback(m, ipoff2, sizeof(h2),
5581 &h2);
5582 break;
5583#endif /* INET */
5584#ifdef INET6
5585 case AF_INET6:
5586 m_copyback(m, off,
5587 sizeof(struct icmp6_hdr),
5588 pd->hdr.icmp6);
5589 m_copyback(m, ipoff2, sizeof(h2_6),
5590 &h2_6);
5591 break;
5592#endif /* INET6 */
5593 }
5594 m->m_flags &= ~M_HASH;
5595 m_copyback(m, off2, sizeof(uh), &uh);
5596 }
5597 break;
5598 }
5599#ifdef INET
5600 case IPPROTO_ICMP: {
5601 struct icmp iih;
5602
5603 if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
5604 NULL, reason, pd2.af)) {
5605 DPFPRINTF(PF_DEBUG_MISC,
5606 ("pf: ICMP error message too short i"
5607 "(icmp)\n"));
5608 FAIL (PF_DROP);
5609 }
5610
5611 key.af = pd2.af;
5612 key.proto = IPPROTO_ICMP;
5613 PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5614 PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5615 key.port[0] = key.port[1] = iih.icmp_id;
5616
5617 STATE_LOOKUP(kif, &key, direction, *state, m);
5618 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5619
5620 /* translate source/destination address, if necessary */
5621 if ((*state)->key[PF_SK_WIRE] !=
5622 (*state)->key[PF_SK_STACK]) {
5623 struct pf_state_key *nk =
5624 (*state)->key[pd->didx];
5625
5626 if (PF_ANEQ(pd2.src,
5627 &nk->addr[pd2.sidx], pd2.af) ||
5628 nk->port[pd2.sidx] != iih.icmp_id)
5629 pf_change_icmp(pd2.src, &iih.icmp_id,
5630 daddr, &nk->addr[pd2.sidx],
5631 nk->port[pd2.sidx], NULL,
5632 pd2.ip_sum, icmpsum,
5633 pd->ip_sum, 0, AF_INET);
5634
5635 if (PF_ANEQ(pd2.dst,
5636 &nk->addr[pd2.didx], pd2.af) ||
5637 nk->port[pd2.didx] != iih.icmp_id)
5638 pf_change_icmp(pd2.dst, &iih.icmp_id,
5639 NULL, /* XXX Inbound NAT? */
5640 &nk->addr[pd2.didx],
5641 nk->port[pd2.didx], NULL,
5642 pd2.ip_sum, icmpsum,
5643 pd->ip_sum, 0, AF_INET);
5644
5645 m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp);
5646 m_copyback(m, ipoff2, sizeof(h2), &h2);
5647 m_copyback(m, off2, ICMP_MINLEN, &iih);
5648 m->m_flags &= ~M_HASH;
5649 }
5650 break;
5651 }
5652#endif /* INET */
5653#ifdef INET6
5654 case IPPROTO_ICMPV6: {
5655 struct icmp6_hdr iih;
5656
5657 if (!pf_pull_hdr(m, off2, &iih,
5658 sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
5659 DPFPRINTF(PF_DEBUG_MISC,
5660 ("pf: ICMP error message too short "
5661 "(icmp6)\n"));
5662 FAIL (PF_DROP);
5663 }
5664
5665 key.af = pd2.af;
5666 key.proto = IPPROTO_ICMPV6;
5667 PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5668 PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5669 key.port[0] = key.port[1] = iih.icmp6_id;
5670
5671 STATE_LOOKUP(kif, &key, direction, *state, m);
5672 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5673
5674 /* translate source/destination address, if necessary */
5675 if ((*state)->key[PF_SK_WIRE] !=
5676 (*state)->key[PF_SK_STACK]) {
5677 struct pf_state_key *nk =
5678 (*state)->key[pd->didx];
5679
5680 if (PF_ANEQ(pd2.src,
5681 &nk->addr[pd2.sidx], pd2.af) ||
5682 nk->port[pd2.sidx] != iih.icmp6_id)
5683 pf_change_icmp(pd2.src, &iih.icmp6_id,
5684 daddr, &nk->addr[pd2.sidx],
5685 nk->port[pd2.sidx], NULL,
5686 pd2.ip_sum, icmpsum,
5687 pd->ip_sum, 0, AF_INET6);
5688
5689 if (PF_ANEQ(pd2.dst,
5690 &nk->addr[pd2.didx], pd2.af) ||
5691 nk->port[pd2.didx] != iih.icmp6_id)
5692 pf_change_icmp(pd2.dst, &iih.icmp6_id,
5693 NULL, /* XXX Inbound NAT? */
5694 &nk->addr[pd2.didx],
5695 nk->port[pd2.didx], NULL,
5696 pd2.ip_sum, icmpsum,
5697 pd->ip_sum, 0, AF_INET6);
5698
5699 m_copyback(m, off, sizeof(struct icmp6_hdr),
5700 pd->hdr.icmp6);
5701 m_copyback(m, ipoff2, sizeof(h2_6), &h2_6);
5702 m_copyback(m, off2, sizeof(struct icmp6_hdr),
5703 &iih);
5704 m->m_flags &= ~M_HASH;
5705 }
5706 break;
5707 }
5708#endif /* INET6 */
5709 default: {
5710 key.af = pd2.af;
5711 key.proto = pd2.proto;
5712 PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5713 PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5714 key.port[0] = key.port[1] = 0;
5715
5716 STATE_LOOKUP(kif, &key, direction, *state, m);
5717 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5718
5719 /* translate source/destination address, if necessary */
5720 if ((*state)->key[PF_SK_WIRE] !=
5721 (*state)->key[PF_SK_STACK]) {
5722 struct pf_state_key *nk =
5723 (*state)->key[pd->didx];
5724
5725 if (PF_ANEQ(pd2.src,
5726 &nk->addr[pd2.sidx], pd2.af))
5727 pf_change_icmp(pd2.src, NULL, daddr,
5728 &nk->addr[pd2.sidx], 0, NULL,
5729 pd2.ip_sum, icmpsum,
5730 pd->ip_sum, 0, pd2.af);
5731
5732 if (PF_ANEQ(pd2.dst,
5733 &nk->addr[pd2.didx], pd2.af))
5734 pf_change_icmp(pd2.dst, NULL,
5735 NULL, /* XXX Inbound NAT? */
5736 &nk->addr[pd2.didx], 0, NULL,
5737 pd2.ip_sum, icmpsum,
5738 pd->ip_sum, 0, pd2.af);
5739
5740 switch (pd2.af) {
5741#ifdef INET
5742 case AF_INET:
5743 m_copyback(m, off, ICMP_MINLEN,
5744 pd->hdr.icmp);
5745 m_copyback(m, ipoff2, sizeof(h2),
5746 &h2);
5747 m->m_flags &= ~M_HASH;
5748 break;
5749#endif /* INET */
5750#ifdef INET6
5751 case AF_INET6:
5752 m_copyback(m, off,
5753 sizeof(struct icmp6_hdr),
5754 pd->hdr.icmp6);
5755 m_copyback(m, ipoff2, sizeof(h2_6),
5756 &h2_6);
5757 m->m_flags &= ~M_HASH;
5758 break;
5759#endif /* INET6 */
5760 }
5761 }
5762 break;
5763 }
5764 }
5765 }
5766
5767 pfsync_update_state(*state);
5768 error = PF_PASS;
5769done:
5770 if (*state)
5771 lockmgr(&(*state)->lk, LK_RELEASE);
5772 return (error);
5773}
5774
5775/*
5776 * Test other connection state. Caller must hold the state locked.
5777 */
5778int
5779pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
5780 struct mbuf *m, struct pf_pdesc *pd)
5781{
5782 struct pf_state_peer *src, *dst;
5783 struct pf_state_key_cmp key;
5784
5785 bzero(&key, sizeof(key));
5786 key.af = pd->af;
5787 key.proto = pd->proto;
5788 if (direction == PF_IN) {
5789 PF_ACPY(&key.addr[0], pd->src, key.af);
5790 PF_ACPY(&key.addr[1], pd->dst, key.af);
5791 key.port[0] = key.port[1] = 0;
5792 } else {
5793 PF_ACPY(&key.addr[1], pd->src, key.af);
5794 PF_ACPY(&key.addr[0], pd->dst, key.af);
5795 key.port[1] = key.port[0] = 0;
5796 }
5797
5798 STATE_LOOKUP(kif, &key, direction, *state, m);
5799 lockmgr(&(*state)->lk, LK_EXCLUSIVE);
5800
5801 if (direction == (*state)->direction) {
5802 src = &(*state)->src;
5803 dst = &(*state)->dst;
5804 } else {
5805 src = &(*state)->dst;
5806 dst = &(*state)->src;
5807 }
5808
5809 /* update states */
5810 if (src->state < PFOTHERS_SINGLE)
5811 src->state = PFOTHERS_SINGLE;
5812 if (dst->state == PFOTHERS_SINGLE)
5813 dst->state = PFOTHERS_MULTIPLE;
5814
5815 /* update expire time */
5816 (*state)->expire = time_second;
5817 if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5818 (*state)->timeout = PFTM_OTHER_MULTIPLE;
5819 else
5820 (*state)->timeout = PFTM_OTHER_SINGLE;
5821
5822 /* translate source/destination address, if necessary */
5823 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5824 struct pf_state_key *nk = (*state)->key[pd->didx];
5825
5826 KKASSERT(nk);
5827 KKASSERT(pd);
5828 KKASSERT(pd->src);
5829 KKASSERT(pd->dst);
5830 switch (pd->af) {
5831#ifdef INET
5832 case AF_INET:
5833 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5834 pf_change_a(&pd->src->v4.s_addr,
5835 pd->ip_sum,
5836 nk->addr[pd->sidx].v4.s_addr,
5837 0);
5838
5839
5840 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5841 pf_change_a(&pd->dst->v4.s_addr,
5842 pd->ip_sum,
5843 nk->addr[pd->didx].v4.s_addr,
5844 0);
5845
5846 break;
5847#endif /* INET */
5848#ifdef INET6
5849 case AF_INET6:
5850 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
5851 PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5852
5853 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
5854 PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
break;
5855#endif /* INET6 */
5856 }
5857 }
5858
5859 pfsync_update_state(*state);
5860 lockmgr(&(*state)->lk, LK_RELEASE);
5861 return (PF_PASS);
5862}
5863
5864/*
5865 * ipoff and off are measured from the start of the mbuf chain.
5866 * h must be at "ipoff" on the mbuf chain.
5867 */
5868void *
5869pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5870 u_short *actionp, u_short *reasonp, sa_family_t af)
5871{
5872 switch (af) {
5873#ifdef INET
5874 case AF_INET: {
5875 struct ip *h = mtod(m, struct ip *);
5876 u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
5877
5878 if (fragoff) {
5879 if (fragoff >= len)
5880 ACTION_SET(actionp, PF_PASS);
5881 else {
5882 ACTION_SET(actionp, PF_DROP);
5883 REASON_SET(reasonp, PFRES_FRAG);
5884 }
5885 return (NULL);
5886 }
5887 if (m->m_pkthdr.len < off + len ||
5888 ntohs(h->ip_len) < off + len) {
5889 ACTION_SET(actionp, PF_DROP);
5890 REASON_SET(reasonp, PFRES_SHORT);
5891 return (NULL);
5892 }
5893 break;
5894 }
5895#endif /* INET */
5896#ifdef INET6
5897 case AF_INET6: {
5898 struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
5899
5900 if (m->m_pkthdr.len < off + len ||
5901 (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5902 (unsigned)(off + len)) {
5903 ACTION_SET(actionp, PF_DROP);
5904 REASON_SET(reasonp, PFRES_SHORT);
5905 return (NULL);
5906 }
5907 break;
5908 }
5909#endif /* INET6 */
5910 }
5911 m_copydata(m, off, len, p);
5912 return (p);
5913}
5914
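/*
 * Check whether addr is routable and, when kif is given, that the best
 * route leaves via kif's interface (a uRPF-style check).  Returns 1 on
 * success, 0 on failure.  Scope-embedded IPv6 addresses and ipsec
 * (IFT_ENC) interfaces always pass; a NULL kif turns this into a plain
 * no-route check.
 */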
5915int
5916pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif)
5917{
5918 struct sockaddr_in *dst;
5919 int ret = 1;
5920 int check_mpath;
5921#ifdef INET6
5922 struct sockaddr_in6 *dst6;
5923 struct route_in6 ro;
5924#else
5925 struct route ro;
5926#endif
5927 struct radix_node *rn;
5928 struct rtentry *rt;
5929 struct ifnet *ifp;
5930
5931 check_mpath = 0;
5932 bzero(&ro, sizeof(ro));
5933 switch (af) {
5934 case AF_INET:
5935 dst = satosin(&ro.ro_dst);
5936 dst->sin_family = AF_INET;
5937 dst->sin_len = sizeof(*dst);
5938 dst->sin_addr = addr->v4;
5939 break;
5940#ifdef INET6
5941 case AF_INET6:
5942 /*
5943 * Skip check for addresses with embedded interface scope,
5944 * as they would always match anyway.
5945 */
5946 if (IN6_IS_SCOPE_EMBED(&addr->v6))
5947 goto out;
5948 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5949 dst6->sin6_family = AF_INET6;
5950 dst6->sin6_len = sizeof(*dst6);
5951 dst6->sin6_addr = addr->v6;
5952 break;
5953#endif /* INET6 */
5954 default:
5955 return (0);
5956 }
5957
5958 /* Skip checks for ipsec interfaces */
5959 if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5960 goto out;
5961
5962 rtalloc_ign((struct route *)&ro, 0);
5963
5964 if (ro.ro_rt != NULL) {
5965 /* No interface given, this is a no-route check */
5966 if (kif == NULL)
5967 goto out;
5968
5969 if (kif->pfik_ifp == NULL) {
5970 ret = 0;
5971 goto out;
5972 }
5973
5974 /* Perform uRPF check if passed input interface */
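		/* check_mpath stays 0, so only the first route is examined */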
5975 ret = 0;
5976 rn = (struct radix_node *)ro.ro_rt;
5977 do {
5978 rt = (struct rtentry *)rn;
5979 ifp = rt->rt_ifp;
5980
5981 if (kif->pfik_ifp == ifp)
5982 ret = 1;
5983 rn = NULL;
5984 } while (check_mpath == 1 && rn != NULL && ret == 0);
5985 } else
5986 ret = 0;
5987out:
5988 if (ro.ro_rt != NULL)
5989 RTFREE(ro.ro_rt);
5990 return (ret);
5991}
5992
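/*
 * Match against a route label.  Route labels are not supported on
 * DragonFly, so the lookup result is discarded and this always
 * returns 0 (no match).
 */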
5993int
5994pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw)
5995{
5996 struct sockaddr_in *dst;
5997#ifdef INET6
5998 struct sockaddr_in6 *dst6;
5999 struct route_in6 ro;
6000#else
6001 struct route ro;
6002#endif
6003 int ret = 0;
6004
6005 ASSERT_LWKT_TOKEN_HELD(&pf_token);
6006
6007 bzero(&ro, sizeof(ro));
6008 switch (af) {
6009 case AF_INET:
6010 dst = satosin(&ro.ro_dst);
6011 dst->sin_family = AF_INET;
6012 dst->sin_len = sizeof(*dst);
6013 dst->sin_addr = addr->v4;
6014 break;
6015#ifdef INET6
6016 case AF_INET6:
6017 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
6018 dst6->sin6_family = AF_INET6;
6019 dst6->sin6_len = sizeof(*dst6);
6020 dst6->sin6_addr = addr->v6;
6021 break;
6022#endif /* INET6 */
6023 default:
6024 return (0);
6025 }
6026
6027	rtalloc_ign((struct route *)&ro, (RTF_CLONING | RTF_PRCLONING));
6028
6029 if (ro.ro_rt != NULL) {
6030 RTFREE(ro.ro_rt);
6031 }
6032
6033 return (ret);
6034}
6035
6036#ifdef INET
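/*
 * Handle route-to/reply-to/dup-to for IPv4.  Select the outgoing
 * interface from the rule's address pool (or the routing table for
 * fastroute), re-run pf_test() if the packet leaves via a different
 * interface, and fragment when it exceeds the MTU and IP_DF is not
 * set.  May consume the mbuf and set *m to NULL.
 */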
6037void
6038pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
6039 struct pf_state *s, struct pf_pdesc *pd)
6040{
6041 struct mbuf *m0, *m1;
6042 struct route iproute;
6043 struct route *ro = NULL;
6044 struct sockaddr_in *dst;
6045 struct ip *ip;
6046 struct ifnet *ifp = NULL;
6047 struct pf_addr naddr;
6048 struct pf_src_node *sn = NULL;
6049 int error = 0;
6050 int sw_csum;
6051
6052 ASSERT_LWKT_TOKEN_HELD(&pf_token);
6053
6054 if (m == NULL || *m == NULL || r == NULL ||
6055 (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6056 panic("pf_route: invalid parameters");
6057
6058 if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6059 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6060 (*m)->m_pkthdr.pf.routed = 1;
6061 } else {
6062 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6063 m0 = *m;
6064 *m = NULL;
6065 goto bad;
6066 }
6067 }
6068
6069 if (r->rt == PF_DUPTO) {
6070 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
6071 return;
6072 }
6073 } else {
6074 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
6075 return;
6076 }
6077 m0 = *m;
6078 }
6079
6080 if (m0->m_len < sizeof(struct ip)) {
6081 DPFPRINTF(PF_DEBUG_URGENT,
6082 ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6083 goto bad;
6084 }
6085
6086 ip = mtod(m0, struct ip *);
6087
6088 ro = &iproute;
6089 bzero((caddr_t)ro, sizeof(*ro));
6090 dst = satosin(&ro->ro_dst);
6091 dst->sin_family = AF_INET;
6092 dst->sin_len = sizeof(*dst);
6093 dst->sin_addr = ip->ip_dst;
6094
6095 if (r->rt == PF_FASTROUTE) {
6096 rtalloc(ro);
6097		if (ro->ro_rt == NULL) {
6098 ipstat.ips_noroute++;
6099 goto bad;
6100 }
6101
6102 ifp = ro->ro_rt->rt_ifp;
6103 ro->ro_rt->rt_use++;
6104
6105 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
6106 dst = satosin(ro->ro_rt->rt_gateway);
6107 } else {
6108 if (TAILQ_EMPTY(&r->rpool.list)) {
6109 DPFPRINTF(PF_DEBUG_URGENT,
6110 ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n"));
6111 goto bad;
6112 }
6113 if (s == NULL) {
6114 pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
6115 &naddr, NULL, &sn);
6116 if (!PF_AZERO(&naddr, AF_INET))
6117 dst->sin_addr.s_addr = naddr.v4.s_addr;
6118 ifp = r->rpool.cur->kif ?
6119 r->rpool.cur->kif->pfik_ifp : NULL;
6120 } else {
6121 if (!PF_AZERO(&s->rt_addr, AF_INET))
6122 dst->sin_addr.s_addr =
6123 s->rt_addr.v4.s_addr;
6124 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6125 }
6126 }
6127 if (ifp == NULL)
6128 goto bad;
6129
6130 if (oifp != ifp) {
6131 if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6132 goto bad;
6133 } else if (m0 == NULL) {
6134 goto done;
6135 }
6136 if (m0->m_len < sizeof(struct ip)) {
6137 DPFPRINTF(PF_DEBUG_URGENT,
6138 ("pf_route: m0->m_len < sizeof(struct ip)\n"));
6139 goto bad;
6140 }
6141 ip = mtod(m0, struct ip *);
6142 }
6143
6144 /* Copied from FreeBSD 5.1-CURRENT ip_output. */
6145 m0->m_pkthdr.csum_flags |= CSUM_IP;
6146 sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
6147 if (sw_csum & CSUM_DELAY_DATA) {
6148 in_delayed_cksum(m0);
6149 sw_csum &= ~CSUM_DELAY_DATA;
6150 }
6151 m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
6152 m0->m_pkthdr.csum_iphlen = (ip->ip_hl << 2);
6153
6154 /*
6155 * WARNING! We cannot fragment if the packet was modified from an
6156 * original which expected to be using TSO. In this
6157 * situation we pray that the target interface is
6158 * compatible with the originating interface.
6159 */
6160 if (ntohs(ip->ip_len) <= ifp->if_mtu ||
6161 (m0->m_pkthdr.csum_flags & CSUM_TSO) ||
6162 ((ifp->if_hwassist & CSUM_FRAGMENT) &&
6163 (ip->ip_off & htons(IP_DF)) == 0)) {
6164 ip->ip_sum = 0;
6165 if (sw_csum & CSUM_DELAY_IP) {
6166 /* From KAME */
6167 if (ip->ip_v == IPVERSION &&
6168 (ip->ip_hl << 2) == sizeof(*ip)) {
6169 ip->ip_sum = in_cksum_hdr(ip);
6170 } else {
6171 ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
6172 }
6173 }
6174 lwkt_reltoken(&pf_token);
6175 error = ifp->if_output(ifp, m0, sintosa(dst), ro->ro_rt);
6176 lwkt_gettoken(&pf_token);
6177 goto done;
6178 }
6179
6180 /*
6181 * Too large for interface; fragment if possible.
6182 * Must be able to put at least 8 bytes per fragment.
6183 */
6184 if (ip->ip_off & htons(IP_DF)) {
6185 ipstat.ips_cantfrag++;
6186 if (r->rt != PF_DUPTO) {
6187 icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
6188 ifp->if_mtu);
6189 goto done;
6190 } else
6191 goto bad;
6192 }
6193
6194 m1 = m0;
6195 error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
6196 if (error) {
6197 goto bad;
6198 }
6199
6200 for (m0 = m1; m0; m0 = m1) {
6201 m1 = m0->m_nextpkt;
6202		m0->m_nextpkt = NULL;
6203 if (error == 0) {
6204 lwkt_reltoken(&pf_token);
6205 error = (*ifp->if_output)(ifp, m0, sintosa(dst),
6206 NULL);
6207 lwkt_gettoken(&pf_token);
6208 } else
6209 m_freem(m0);
6210 }
6211
6212 if (error == 0)
6213 ipstat.ips_fragmented++;
6214
6215done:
6216 if (r->rt != PF_DUPTO)
6217 *m = NULL;
6218 if (ro == &iproute && ro->ro_rt)
6219 RTFREE(ro->ro_rt);
6220 return;
6221
6222bad:
6223 m_freem(m0);
6224 goto done;
6225}
6226#endif /* INET */
6227
6228#ifdef INET6
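/*
 * IPv6 counterpart of pf_route().  No fragmentation is done here:
 * oversized packets trigger an ICMP6 packet-too-big error, except for
 * dup-to copies, which are simply dropped.
 */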
6229void
6230pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
6231 struct pf_state *s, struct pf_pdesc *pd)
6232{
6233 struct mbuf *m0;
6234 struct route_in6 ip6route;
6235 struct route_in6 *ro;
6236 struct sockaddr_in6 *dst;
6237 struct ip6_hdr *ip6;
6238 struct ifnet *ifp = NULL;
6239 struct pf_addr naddr;
6240 struct pf_src_node *sn = NULL;
6241
6242 if (m == NULL || *m == NULL || r == NULL ||
6243 (dir != PF_IN && dir != PF_OUT) || oifp == NULL)
6244 panic("pf_route6: invalid parameters");
6245
6246 if (((*m)->m_pkthdr.fw_flags & PF_MBUF_ROUTED) == 0) {
6247 (*m)->m_pkthdr.fw_flags |= PF_MBUF_ROUTED;
6248 (*m)->m_pkthdr.pf.routed = 1;
6249 } else {
6250 if ((*m)->m_pkthdr.pf.routed++ > 3) {
6251 m0 = *m;
6252 *m = NULL;
6253 goto bad;
6254 }
6255 }
6256
6257 if (r->rt == PF_DUPTO) {
6258 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL)
6259 return;
6260 } else {
6261 if ((r->rt == PF_REPLYTO) == (r->direction == dir))
6262 return;
6263 m0 = *m;
6264 }
6265
6266 if (m0->m_len < sizeof(struct ip6_hdr)) {
6267 DPFPRINTF(PF_DEBUG_URGENT,
6268 ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6269 goto bad;
6270 }
6271 ip6 = mtod(m0, struct ip6_hdr *);
6272
6273 ro = &ip6route;
6274 bzero((caddr_t)ro, sizeof(*ro));
6275 dst = (struct sockaddr_in6 *)&ro->ro_dst;
6276 dst->sin6_family = AF_INET6;
6277 dst->sin6_len = sizeof(*dst);
6278 dst->sin6_addr = ip6->ip6_dst;
6279
6280 /*
6281	 * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
6282 * so make sure pf.flags is clear.
6283 *
6284 * Cheat. XXX why only in the v6 case???
6285 */
6286 if (r->rt == PF_FASTROUTE) {
6287 m0->m_pkthdr.fw_flags |= PF_MBUF_TAGGED;
6288 m0->m_pkthdr.pf.flags = 0;
6289		/* XXX Re-check when upgrading to > 4.4 */
6290 m0->m_pkthdr.pf.statekey = NULL;
6291 ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
6292 return;
6293 }
6294
6295 if (TAILQ_EMPTY(&r->rpool.list)) {
6296 DPFPRINTF(PF_DEBUG_URGENT,
6297 ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n"));
6298 goto bad;
6299 }
6300 if (s == NULL) {
6301 pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
6302 &naddr, NULL, &sn);
6303 if (!PF_AZERO(&naddr, AF_INET6))
6304 PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6305 &naddr, AF_INET6);
6306 ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
6307 } else {
6308 if (!PF_AZERO(&s->rt_addr, AF_INET6))
6309 PF_ACPY((struct pf_addr *)&dst->sin6_addr,
6310 &s->rt_addr, AF_INET6);
6311 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
6312 }
6313 if (ifp == NULL)
6314 goto bad;
6315
6316 if (oifp != ifp) {
6317 if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
6318 goto bad;
6319 } else if (m0 == NULL) {
6320 goto done;
6321 }
6322 if (m0->m_len < sizeof(struct ip6_hdr)) {
6323 DPFPRINTF(PF_DEBUG_URGENT,
6324 ("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
6325 goto bad;
6326 }
6327 ip6 = mtod(m0, struct ip6_hdr *);
6328 }
6329
6330 /*
6331 * If the packet is too large for the outgoing interface,
6332 * send back an icmp6 error.
6333 */
6334 if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr))
6335 dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
6336 if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
6337 nd6_output(ifp, ifp, m0, dst, NULL);
6338 } else {
6339 in6_ifstat_inc(ifp, ifs6_in_toobig);
6340 if (r->rt != PF_DUPTO)
6341 icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
6342 else
6343 goto bad;
6344 }
6345
6346done:
6347 if (r->rt != PF_DUPTO)
6348 *m = NULL;
6349 return;
6350
6351bad:
6352 m_freem(m0);
6353 goto done;
6354}
6355#endif /* INET6 */
6356
6358/*
6359 * check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag
6360 * off is the offset where the protocol header starts
6361 * len is the total length of protocol header plus payload
6362 * returns 0 when the checksum is valid, otherwise returns 1.
6363 */
6364/*
6365 * XXX
6366 * FreeBSD supports cksum offload for the following drivers:
6367 * em(4), gx(4), lge(4), nge(4), ti(4), xl(4).
6368 * If we could make full use of it, we would outperform ipfw/ipfilter in
6369 * very heavy traffic.
6370 * I have not tested this because I don't have NICs that support cksum
 * offload.
6371 * (There might be problems. Typical phenomena would be
6372 * 1. No route message for UDP packet.
6373 * 2. No connection acceptance from external hosts regardless of rule set.)
6374 */
6375int
6376pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p,
6377 sa_family_t af)
6378{
6379 u_int16_t sum = 0;
6380 int hw_assist = 0;
6381 struct ip *ip;
6382
6383 if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
6384 return (1);
6385 if (m->m_pkthdr.len < off + len)
6386 return (1);
6387
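	/*
	 * If the hardware already checksummed the payload
	 * (CSUM_DATA_VALID), fold the stored sum with the pseudo-header
	 * when the NIC did not cover it; a resulting sum of 0 means the
	 * checksum is good and the mbuf chain is not walked again.
	 */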
6388 switch (p) {
6389 case IPPROTO_TCP:
6390 case IPPROTO_UDP:
6391 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
6392 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6393 sum = m->m_pkthdr.csum_data;
6394 } else {
6395 ip = mtod(m, struct ip *);
6396 sum = in_pseudo(ip->ip_src.s_addr,
6397 ip->ip_dst.s_addr, htonl((u_short)len +
6398 m->m_pkthdr.csum_data + p));
6399 }
6400 sum ^= 0xffff;
6401 ++hw_assist;
6402 }
6403 break;
6404 case IPPROTO_ICMP:
6405#ifdef INET6
6406 case IPPROTO_ICMPV6:
6407#endif /* INET6 */
6408 break;
6409 default:
6410 return (1);
6411 }
6412
6413 if (!hw_assist) {
6414 switch (af) {
6415 case AF_INET:
6416 if (p == IPPROTO_ICMP) {
6417 if (m->m_len < off)
6418 return (1);
6419 m->m_data += off;
6420 m->m_len -= off;
6421 sum = in_cksum(m, len);
6422 m->m_data -= off;
6423 m->m_len += off;
6424 } else {
6425 if (m->m_len < sizeof(struct ip))
6426 return (1);
6427 sum = in_cksum_range(m, p, off, len);
6428 if (sum == 0) {
6429 m->m_pkthdr.csum_flags |=
6430 (CSUM_DATA_VALID |
6431 CSUM_PSEUDO_HDR);
6432 m->m_pkthdr.csum_data = 0xffff;
6433 }
6434 }
6435 break;
6436#ifdef INET6
6437 case AF_INET6:
6438 if (m->m_len < sizeof(struct ip6_hdr))
6439 return (1);
6440 sum = in6_cksum(m, p, off, len);
6441 /*
6442 * XXX
6443 * IPv6 H/W cksum off-load not supported yet!
6444 *
6445 * if (sum == 0) {
6446 * m->m_pkthdr.csum_flags |=
6447 * (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
6448 * m->m_pkthdr.csum_data = 0xffff;
6449 *}
6450 */
6451 break;
6452#endif /* INET6 */
6453 default:
6454 return (1);
6455 }
6456 }
6457 if (sum) {
6458 switch (p) {
6459 case IPPROTO_TCP:
6460 tcpstat.tcps_rcvbadsum++;
6461 break;
6462 case IPPROTO_UDP:
6463 udp_stat.udps_badsum++;
6464 break;
6465 case IPPROTO_ICMP:
6466 icmpstat.icps_checksum++;
6467 break;
6468#ifdef INET6
6469 case IPPROTO_ICMPV6:
6470 icmp6stat.icp6s_checksum++;
6471 break;
6472#endif /* INET6 */
6473 }
6474 return (1);
6475 }
6476 return (0);
6477}
6478
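/*
 * Return the pf_divert tag attached to m, or NULL if none is present.
 */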
6479struct pf_divert *
6480pf_find_divert(struct mbuf *m)
6481{
6482 struct m_tag *mtag;
6483
6484 if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL)
6485 return (NULL);
6486
6487 return ((struct pf_divert *)(mtag + 1));
6488}
6489
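/*
 * Like pf_find_divert(), but allocate, zero, and attach a new divert
 * tag when the mbuf does not already carry one.  Returns NULL only if
 * the tag allocation fails.
 */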
6490struct pf_divert *
6491pf_get_divert(struct mbuf *m)
6492{
6493 struct m_tag *mtag;
6494
6495 if ((mtag = m_tag_find(m, PACKET_TAG_PF_DIVERT, NULL)) == NULL) {
6496 mtag = m_tag_get(PACKET_TAG_PF_DIVERT, sizeof(struct pf_divert),
6497 M_NOWAIT);
6498 if (mtag == NULL)
6499 return (NULL);
6500 bzero(mtag + 1, sizeof(struct pf_divert));
6501 m_tag_prepend(m, mtag);
6502 }
6503
6504 return ((struct pf_divert *)(mtag + 1));
6505}
6506
6507#ifdef INET
6508
6509/*
6510 * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6511 */
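/*
 * IPv4 main test entry point: normalize/reassemble the packet, match
 * an existing state or evaluate the ruleset, then handle logging,
 * ALTQ tagging, divert, statistics, and route-to.  Returns a PF_*
 * action; *m0 may be consumed and set to NULL (synproxy, route-to).
 */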
6512int
6513pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
6514 struct ether_header *eh, struct inpcb *inp)
6515{
6516 struct pfi_kif *kif;
6517 u_short action, reason = 0, log = 0;
6518 struct mbuf *m = *m0;
6519 struct ip *h = NULL;
6520 struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr;
6521 struct pf_state *s = NULL;
6522 struct pf_ruleset *ruleset = NULL;
6523 struct pf_pdesc pd;
6524 int off, dirndx;
6525#ifdef ALTQ
6526 int pqid = 0;
6527#endif
6528
6529 if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6530 /* Skip us; continue in ipfw. */
6531 return (PF_PASS);
6532 }
6533
6534 if (!pf_status.running)
6535 return (PF_PASS);
6536
6537 memset(&pd, 0, sizeof(pd));
6538#ifdef foo
6539 if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6540 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6541 else
6542#endif
6543 kif = (struct pfi_kif *)ifp->if_pf_kif;
6544
6545 if (kif == NULL) {
6546 DPFPRINTF(PF_DEBUG_URGENT,
6547 ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
6548 return (PF_DROP);
6549 }
6550 if (kif->pfik_flags & PFI_IFLAG_SKIP)
6551 return (PF_PASS);
6552
6553#ifdef DIAGNOSTIC
6554 if ((m->m_flags & M_PKTHDR) == 0)
6555 panic("non-M_PKTHDR is passed to pf_test");
6556#endif /* DIAGNOSTIC */
6557
6558 if (m->m_pkthdr.len < (int)sizeof(*h)) {
6559 action = PF_DROP;
6560 REASON_SET(&reason, PFRES_SHORT);
6561 log = 1;
6562 goto done;
6563 }
6564
6565 /*
6566	 * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
6567 * so make sure pf.flags is clear.
6568 */
6569 if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6570 return (PF_PASS);
6571 m->m_pkthdr.pf.flags = 0;
6572	/* Re-check when updating to > 4.4 */
6573 m->m_pkthdr.pf.statekey = NULL;
6574
6575 /* We do IP header normalization and packet reassembly here */
6576 if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
6577 action = PF_DROP;
6578 goto done;
6579 }
6580 m = *m0; /* pf_normalize messes with m0 */
6581 h = mtod(m, struct ip *);
6582
6583 off = h->ip_hl << 2;
6584 if (off < (int)sizeof(*h)) {
6585 action = PF_DROP;
6586 REASON_SET(&reason, PFRES_SHORT);
6587 log = 1;
6588 goto done;
6589 }
6590
6591 pd.src = (struct pf_addr *)&h->ip_src;
6592 pd.dst = (struct pf_addr *)&h->ip_dst;
6593 pd.sport = pd.dport = NULL;
6594 pd.ip_sum = &h->ip_sum;
6595 pd.proto_sum = NULL;
6596 pd.proto = h->ip_p;
6597 pd.dir = dir;
6598 pd.sidx = (dir == PF_IN) ? 0 : 1;
6599 pd.didx = (dir == PF_IN) ? 1 : 0;
6600 pd.af = AF_INET;
6601 pd.tos = h->ip_tos;
6602 pd.tot_len = ntohs(h->ip_len);
6603 pd.eh = eh;
6604
6605 /* handle fragments that didn't get reassembled by normalization */
6606 if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
6607 action = pf_test_fragment(&r, dir, kif, m, h,
6608 &pd, &a, &ruleset);
6609 goto done;
6610 }
6611
6612 switch (h->ip_p) {
6613
6614 case IPPROTO_TCP: {
6615 struct tcphdr th;
6616
6617 pd.hdr.tcp = &th;
6618 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6619 &action, &reason, AF_INET)) {
6620 log = action != PF_PASS;
6621 goto done;
6622 }
6623 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6624#ifdef ALTQ
6625 if ((th.th_flags & TH_ACK) && pd.p_len == 0)
6626 pqid = 1;
6627#endif
6628 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6629 if (action == PF_DROP)
6630 goto done;
6631 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6632 &reason);
6633 if (action == PF_PASS) {
6634 r = s->rule.ptr;
6635 a = s->anchor.ptr;
6636 log = s->log;
6637 } else if (s == NULL) {
6638 action = pf_test_rule(&r, &s, dir, kif,
6639 m, off, h, &pd, &a,
6640 &ruleset, NULL, inp);
6641 }
6642 break;
6643 }
6644
6645 case IPPROTO_UDP: {
6646 struct udphdr uh;
6647
6648 pd.hdr.udp = &uh;
6649 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6650 &action, &reason, AF_INET)) {
6651 log = action != PF_PASS;
6652 goto done;
6653 }
6654 if (uh.uh_dport == 0 ||
6655 ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6656 ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6657 action = PF_DROP;
6658 REASON_SET(&reason, PFRES_SHORT);
6659 goto done;
6660 }
6661 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6662 if (action == PF_PASS) {
6663 r = s->rule.ptr;
6664 a = s->anchor.ptr;
6665 log = s->log;
6666 } else if (s == NULL) {
6667 action = pf_test_rule(&r, &s, dir, kif,
6668 m, off, h, &pd, &a,
6669 &ruleset, NULL, inp);
6670 }
6671 break;
6672 }
6673
6674 case IPPROTO_ICMP: {
6675 struct icmp ih;
6676
6677 pd.hdr.icmp = &ih;
6678 if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
6679 &action, &reason, AF_INET)) {
6680 log = action != PF_PASS;
6681 goto done;
6682 }
6683 action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
6684 &reason);
6685 if (action == PF_PASS) {
6686 r = s->rule.ptr;
6687 a = s->anchor.ptr;
6688 log = s->log;
6689 } else if (s == NULL) {
6690 action = pf_test_rule(&r, &s, dir, kif,
6691 m, off, h, &pd, &a,
6692 &ruleset, NULL, inp);
6693 }
6694 break;
6695 }
6696
6697 default:
6698 action = pf_test_state_other(&s, dir, kif, m, &pd);
6699 if (action == PF_PASS) {
6700 r = s->rule.ptr;
6701 a = s->anchor.ptr;
6702 log = s->log;
6703 } else if (s == NULL) {
6704 action = pf_test_rule(&r, &s, dir, kif, m, off, h,
6705 &pd, &a, &ruleset, NULL, inp);
6706 }
6707 break;
6708 }
6709
6710done:
6711 if (action == PF_PASS && h->ip_hl > 5 &&
6712 !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6713 action = PF_DROP;
6714 REASON_SET(&reason, PFRES_IPOPTIONS);
6715 log = 1;
6716 DPFPRINTF(PF_DEBUG_MISC,
6717 ("pf: dropping packet with ip options\n"));
6718 }
6719
6720 if ((s && s->tag) || r->rtableid)
6721 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
6722
6723#if 0
6724 if (dir == PF_IN && s && s->key[PF_SK_STACK])
6725 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
6726#endif
6727
6728#ifdef ALTQ
6729 /*
6730 * Generate a hash code and qid request for ALTQ. A qid of 0
6731 * is allowed and will cause altq to select the default queue.
6732 */
6733 if (action == PF_PASS) {
6734 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
6735 if (pqid || (pd.tos & IPTOS_LOWDELAY))
6736 m->m_pkthdr.pf.qid = r->pqid;
6737 else
6738 m->m_pkthdr.pf.qid = r->qid;
6739 m->m_pkthdr.pf.ecn_af = AF_INET;
6740 m->m_pkthdr.pf.hdr = h;
6741 /* add connection hash for fairq */
6742 if (s) {
6743 /* for fairq */
6744 m->m_pkthdr.pf.state_hash = s->hash;
6745 m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
6746 }
6747 }
6748#endif /* ALTQ */
6749
6750 /*
6751 * connections redirected to loopback should not match sockets
6752 * bound specifically to loopback due to security implications,
6753 * see tcp_input() and in_pcblookup_listen().
6754 */
6755 if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6756 pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6757 (s->nat_rule.ptr->action == PF_RDR ||
6758 s->nat_rule.ptr->action == PF_BINAT) &&
6759 (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
6760 {
6761 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
6762 }
6763
6764 if (dir == PF_IN && action == PF_PASS && r->divert.port) {
6765 struct pf_divert *divert;
6766
6767 if ((divert = pf_get_divert(m))) {
6768 m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
6769 divert->port = r->divert.port;
6770 divert->addr.ipv4 = r->divert.addr.v4;
6771 }
6772 }
6773
6774 if (log) {
6775 struct pf_rule *lr;
6776
6777 if (s != NULL && s->nat_rule.ptr != NULL &&
6778 s->nat_rule.ptr->log & PF_LOG_ALL)
6779 lr = s->nat_rule.ptr;
6780 else
6781 lr = r;
6782 PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset,
6783 &pd);
6784 }
6785
6786 kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6787 kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
6788
6789 if (action == PF_PASS || r->action == PF_DROP) {
6790 dirndx = (dir == PF_OUT);
6791 r->packets[dirndx]++;
6792 r->bytes[dirndx] += pd.tot_len;
6793 if (a != NULL) {
6794 a->packets[dirndx]++;
6795 a->bytes[dirndx] += pd.tot_len;
6796 }
6797 if (s != NULL) {
6798 if (s->nat_rule.ptr != NULL) {
6799 s->nat_rule.ptr->packets[dirndx]++;
6800 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6801 }
6802 if (s->src_node != NULL) {
6803 s->src_node->packets[dirndx]++;
6804 s->src_node->bytes[dirndx] += pd.tot_len;
6805 }
6806 if (s->nat_src_node != NULL) {
6807 s->nat_src_node->packets[dirndx]++;
6808 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6809 }
6810 dirndx = (dir == s->direction) ? 0 : 1;
6811 s->packets[dirndx]++;
6812 s->bytes[dirndx] += pd.tot_len;
6813 }
6814 tr = r;
6815 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6816 if (nr != NULL && r == &pf_default_rule)
6817 tr = nr;
6818 if (tr->src.addr.type == PF_ADDR_TABLE)
6819 pfr_update_stats(tr->src.addr.p.tbl,
6820 (s == NULL) ? pd.src :
6821 &s->key[(s->direction == PF_IN)]->
6822 addr[(s->direction == PF_OUT)],
6823 pd.af, pd.tot_len, dir == PF_OUT,
6824 r->action == PF_PASS, tr->src.neg);
6825 if (tr->dst.addr.type == PF_ADDR_TABLE)
6826 pfr_update_stats(tr->dst.addr.p.tbl,
6827 (s == NULL) ? pd.dst :
6828 &s->key[(s->direction == PF_IN)]->
6829 addr[(s->direction == PF_IN)],
6830 pd.af, pd.tot_len, dir == PF_OUT,
6831 r->action == PF_PASS, tr->dst.neg);
6832 }
6833
6835 if (action == PF_SYNPROXY_DROP) {
6836 m_freem(*m0);
6837 *m0 = NULL;
6838 action = PF_PASS;
6839 } else if (r->rt) {
6840 /* pf_route can free the mbuf causing *m0 to become NULL */
6841 pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
6842 }
6843
6844 return (action);
6845}
6846#endif /* INET */
6847
6848#ifdef INET6
6849
6850/*
6851 * WARNING: pf_token held shared on entry, THIS IS CPU LOCALIZED CODE
6852 */
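/*
 * IPv6 main test entry point; mirrors pf_test() but also walks the
 * extension header chain and rejects type 0 routing headers.
 */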
6853int
6854pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
6855 struct ether_header *eh, struct inpcb *inp)
6856{
6857 struct pfi_kif *kif;
6858 u_short action, reason = 0, log = 0;
6859 struct mbuf *m = *m0, *n = NULL;
6860 struct ip6_hdr *h = NULL;
6861 struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr;
6862 struct pf_state *s = NULL;
6863 struct pf_ruleset *ruleset = NULL;
6864 struct pf_pdesc pd;
6865 int off, terminal = 0, dirndx, rh_cnt = 0;
6866
6867 if (!pf_status.running)
6868 return (PF_PASS);
6869
6870 memset(&pd, 0, sizeof(pd));
6871#ifdef foo
6872 if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
6873 kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
6874 else
6875#endif
6876 kif = (struct pfi_kif *)ifp->if_pf_kif;
6877
6878 if (kif == NULL) {
6879 DPFPRINTF(PF_DEBUG_URGENT,
6880 ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6881 return (PF_DROP);
6882 }
6883 if (kif->pfik_flags & PFI_IFLAG_SKIP)
6884 return (PF_PASS);
6885
6886#ifdef DIAGNOSTIC
6887 if ((m->m_flags & M_PKTHDR) == 0)
6888 panic("non-M_PKTHDR is passed to pf_test6");
6889#endif /* DIAGNOSTIC */
6890
6891 if (m->m_pkthdr.len < (int)sizeof(*h)) {
6892 action = PF_DROP;
6893 REASON_SET(&reason, PFRES_SHORT);
6894 log = 1;
6895 goto done;
6896 }
6897
6898 /*
6899	 * DragonFly doesn't zero the auxiliary pkthdr fields, only fw_flags,
6900 * so make sure pf.flags is clear.
6901 */
6902 if (m->m_pkthdr.fw_flags & PF_MBUF_TAGGED)
6903 return (PF_PASS);
6904 m->m_pkthdr.pf.flags = 0;
6905	/* Re-check when updating to > 4.4 */
6906 m->m_pkthdr.pf.statekey = NULL;
6907
6908 /* We do IP header normalization and packet reassembly here */
6909 if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6910 action = PF_DROP;
6911 goto done;
6912 }
6913 m = *m0; /* pf_normalize messes with m0 */
6914 h = mtod(m, struct ip6_hdr *);
6915
6916#if 1
6917 /*
6918	 * We do not support jumbograms yet.  If we keep going, a zero
6919	 * ip6_plen will do something bad, so drop the packet for now.
6920 */
6921	if (ntohs(h->ip6_plen) == 0) {
6922 action = PF_DROP;
6923 REASON_SET(&reason, PFRES_NORM); /*XXX*/
6924 goto done;
6925 }
6926#endif
6927
6928 pd.src = (struct pf_addr *)&h->ip6_src;
6929 pd.dst = (struct pf_addr *)&h->ip6_dst;
6930 pd.sport = pd.dport = NULL;
6931 pd.ip_sum = NULL;
6932 pd.proto_sum = NULL;
6933 pd.dir = dir;
6934 pd.sidx = (dir == PF_IN) ? 0 : 1;
6935 pd.didx = (dir == PF_IN) ? 1 : 0;
6936 pd.af = AF_INET6;
6937 pd.tos = 0;
6938 pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6939 pd.eh = eh;
6940
6941 off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6942 pd.proto = h->ip6_nxt;
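	/*
	 * Walk the extension header chain until a terminal (transport)
	 * header is reached.  AH lengths count in 32-bit units, all
	 * other extension headers in 64-bit units.
	 */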
6943 do {
6944 switch (pd.proto) {
6945 case IPPROTO_FRAGMENT:
6946 action = pf_test_fragment(&r, dir, kif, m, h,
6947 &pd, &a, &ruleset);
6948 if (action == PF_DROP)
6949 REASON_SET(&reason, PFRES_FRAG);
6950 goto done;
6951 case IPPROTO_ROUTING: {
6952 struct ip6_rthdr rthdr;
6953
6954 if (rh_cnt++) {
6955 DPFPRINTF(PF_DEBUG_MISC,
6956 ("pf: IPv6 more than one rthdr\n"));
6957 action = PF_DROP;
6958 REASON_SET(&reason, PFRES_IPOPTIONS);
6959 log = 1;
6960 goto done;
6961 }
6962 if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6963 &reason, pd.af)) {
6964 DPFPRINTF(PF_DEBUG_MISC,
6965 ("pf: IPv6 short rthdr\n"));
6966 action = PF_DROP;
6967 REASON_SET(&reason, PFRES_SHORT);
6968 log = 1;
6969 goto done;
6970 }
6971 if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6972 DPFPRINTF(PF_DEBUG_MISC,
6973 ("pf: IPv6 rthdr0\n"));
6974 action = PF_DROP;
6975 REASON_SET(&reason, PFRES_IPOPTIONS);
6976 log = 1;
6977 goto done;
6978 }
6979 /* FALLTHROUGH */
6980 }
6981 case IPPROTO_AH:
6982 case IPPROTO_HOPOPTS:
6983 case IPPROTO_DSTOPTS: {
6984 /* get next header and header length */
6985 struct ip6_ext opt6;
6986
6987 if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6988 NULL, &reason, pd.af)) {
6989 DPFPRINTF(PF_DEBUG_MISC,
6990 ("pf: IPv6 short opt\n"));
6991 action = PF_DROP;
6992 log = 1;
6993 goto done;
6994 }
6995 if (pd.proto == IPPROTO_AH)
6996 off += (opt6.ip6e_len + 2) * 4;
6997 else
6998 off += (opt6.ip6e_len + 1) * 8;
6999 pd.proto = opt6.ip6e_nxt;
7000			/* go to the next header */
7001 break;
7002 }
7003 default:
7004 terminal++;
7005 break;
7006 }
7007 } while (!terminal);
7008
7009 /* if there's no routing header, use unmodified mbuf for checksumming */
7010 if (!n)
7011 n = m;
7012
7013 switch (pd.proto) {
7014
7015 case IPPROTO_TCP: {
7016 struct tcphdr th;
7017
7018 pd.hdr.tcp = &th;
7019 if (!pf_pull_hdr(m, off, &th, sizeof(th),
7020 &action, &reason, AF_INET6)) {
7021 log = action != PF_PASS;
7022 goto done;
7023 }
7024 pd.p_len = pd.tot_len - off - (th.th_off << 2);
7025 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
7026 if (action == PF_DROP)
7027 goto done;
7028 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
7029 &reason);
7030 if (action == PF_PASS) {
7031 r = s->rule.ptr;
7032 a = s->anchor.ptr;
7033 log = s->log;
7034 } else if (s == NULL) {
7035 action = pf_test_rule(&r, &s, dir, kif,
7036 m, off, h, &pd, &a,
7037 &ruleset, NULL, inp);
7038 }
7039 break;
7040 }
7041
7042 case IPPROTO_UDP: {
7043 struct udphdr uh;
7044
7045 pd.hdr.udp = &uh;
7046 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
7047 &action, &reason, AF_INET6)) {
7048 log = action != PF_PASS;
7049 goto done;
7050 }
7051 if (uh.uh_dport == 0 ||
7052 ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
7053 ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
7054 action = PF_DROP;
7055 REASON_SET(&reason, PFRES_SHORT);
7056 goto done;
7057 }
7058 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
7059 if (action == PF_PASS) {
7060 r = s->rule.ptr;
7061 a = s->anchor.ptr;
7062 log = s->log;
7063 } else if (s == NULL) {
7064 action = pf_test_rule(&r, &s, dir, kif,
7065 m, off, h, &pd, &a,
7066 &ruleset, NULL, inp);
7067 }
7068 break;
7069 }
7070
7071 case IPPROTO_ICMPV6: {
7072 struct icmp6_hdr ih;
7073
7074 pd.hdr.icmp6 = &ih;
7075 if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
7076 &action, &reason, AF_INET6)) {
7077 log = action != PF_PASS;
7078 goto done;
7079 }
7080 action = pf_test_state_icmp(&s, dir, kif,
7081 m, off, h, &pd, &reason);
7082 if (action == PF_PASS) {
7083 r = s->rule.ptr;
7084 a = s->anchor.ptr;
7085 log = s->log;
7086 } else if (s == NULL) {
7087 action = pf_test_rule(&r, &s, dir, kif,
7088 m, off, h, &pd, &a,
7089 &ruleset, NULL, inp);
7090 }
7091 break;
7092 }
7093
7094 default:
7095 action = pf_test_state_other(&s, dir, kif, m, &pd);
7096 if (action == PF_PASS) {
7097 r = s->rule.ptr;
7098 a = s->anchor.ptr;
7099 log = s->log;
7100 } else if (s == NULL) {
7101 action = pf_test_rule(&r, &s, dir, kif, m, off, h,
7102 &pd, &a, &ruleset, NULL, inp);
7103 }
7104 break;
7105 }
7106
7107done:
7108 if (n != m) {
7109 m_freem(n);
7110 n = NULL;
7111 }
7112
7113 /* handle dangerous IPv6 extension headers. */
7114 if (action == PF_PASS && rh_cnt &&
7115 !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
7116 action = PF_DROP;
7117 REASON_SET(&reason, PFRES_IPOPTIONS);
7118 log = 1;
7119 DPFPRINTF(PF_DEBUG_MISC,
7120 ("pf: dropping packet with dangerous v6 headers\n"));
7121 }
7122
7123 if ((s && s->tag) || r->rtableid)
7124 pf_tag_packet(m, s ? s->tag : 0, r->rtableid);
7125
7126#if 0
7127 if (dir == PF_IN && s && s->key[PF_SK_STACK])
7128 m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
7129#endif
7130
7131#ifdef ALTQ
7132 /*
7133 * Generate a hash code and qid request for ALTQ. A qid of 0
7134 * is allowed and will cause altq to select the default queue.
7135 */
7136 if (action == PF_PASS) {
7137 m->m_pkthdr.fw_flags |= PF_MBUF_STRUCTURE;
7138 if (pd.tos & IPTOS_LOWDELAY)
7139 m->m_pkthdr.pf.qid = r->pqid;
7140 else
7141 m->m_pkthdr.pf.qid = r->qid;
7142 m->m_pkthdr.pf.ecn_af = AF_INET6;
7143 m->m_pkthdr.pf.hdr = h;
7144 if (s) {
7145 /* for fairq */
7146 m->m_pkthdr.pf.state_hash = s->hash;
7147 m->m_pkthdr.pf.flags |= PF_TAG_STATE_HASHED;
7148 }
7149 }
7150#endif /* ALTQ */
7151
7152 if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
7153 pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
7154 (s->nat_rule.ptr->action == PF_RDR ||
7155 s->nat_rule.ptr->action == PF_BINAT) &&
7156 IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
7157 {
7158 m->m_pkthdr.pf.flags |= PF_TAG_TRANSLATE_LOCALHOST;
7159 }
7160
7161 if (dir == PF_IN && action == PF_PASS && r->divert.port) {
7162 struct pf_divert *divert;
7163
7164 if ((divert = pf_get_divert(m))) {
7165 m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED;
7166 divert->port = r->divert.port;
7167 divert->addr.ipv6 = r->divert.addr.v6;
7168 }
7169 }
7170
7171 if (log) {
7172 struct pf_rule *lr;
7173
7174 if (s != NULL && s->nat_rule.ptr != NULL &&
7175 s->nat_rule.ptr->log & PF_LOG_ALL)
7176 lr = s->nat_rule.ptr;
7177 else
7178 lr = r;
7179 PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset,
7180 &pd);
7181 }
7182
7183 kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
7184 kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
7185
7186 if (action == PF_PASS || r->action == PF_DROP) {
7187 dirndx = (dir == PF_OUT);
7188 r->packets[dirndx]++;
7189 r->bytes[dirndx] += pd.tot_len;
7190 if (a != NULL) {
7191 a->packets[dirndx]++;
7192 a->bytes[dirndx] += pd.tot_len;
7193 }
7194 if (s != NULL) {
7195 if (s->nat_rule.ptr != NULL) {
7196 s->nat_rule.ptr->packets[dirndx]++;
7197 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
7198 }
7199 if (s->src_node != NULL) {
7200 s->src_node->packets[dirndx]++;
7201 s->src_node->bytes[dirndx] += pd.tot_len;
7202 }
7203 if (s->nat_src_node != NULL) {
7204 s->nat_src_node->packets[dirndx]++;
7205 s->nat_src_node->bytes[dirndx] += pd.tot_len;
7206 }
7207 dirndx = (dir == s->direction) ? 0 : 1;
7208 s->packets[dirndx]++;
7209 s->bytes[dirndx] += pd.tot_len;
7210 }
7211 tr = r;
7212 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
7213 if (nr != NULL && r == &pf_default_rule)
7214 tr = nr;
7215 if (tr->src.addr.type == PF_ADDR_TABLE)
7216 pfr_update_stats(tr->src.addr.p.tbl,
7217 (s == NULL) ? pd.src :
7218 &s->key[(s->direction == PF_IN)]->addr[0],
7219 pd.af, pd.tot_len, dir == PF_OUT,
7220 r->action == PF_PASS, tr->src.neg);
7221 if (tr->dst.addr.type == PF_ADDR_TABLE)
7222 pfr_update_stats(tr->dst.addr.p.tbl,
7223 (s == NULL) ? pd.dst :
7224 &s->key[(s->direction == PF_IN)]->addr[1],
7225 pd.af, pd.tot_len, dir == PF_OUT,
7226 r->action == PF_PASS, tr->dst.neg);
7227 }
7228
7230 if (action == PF_SYNPROXY_DROP) {
7231 m_freem(*m0);
7232 *m0 = NULL;
7233 action = PF_PASS;
7234 } else if (r->rt)
7235 /* pf_route6 can free the mbuf causing *m0 to become NULL */
7236 pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
7237
7238 return (action);
7239}
7240#endif /* INET6 */
7241
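/*
 * Congestion feedback is not implemented on DragonFly; always report
 * no congestion.
 */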
7242int
7243pf_check_congestion(struct ifqueue *ifq)
7244{
7245 return (0);
7246}