fdaae265fe19482b8f1b80507d4ac8429fa36e14
[dragonfly.git] / sys / net / pf / pf_norm.c
1 /*      $FreeBSD: src/sys/contrib/pf/net/pf_norm.c,v 1.10 2004/08/14 15:32:40 dwmalone Exp $    */
2 /*      $OpenBSD: pf_norm.c,v 1.80.2.1 2004/04/30 21:46:33 brad Exp $ */
3 /* add  $OpenBSD: pf_norm.c,v 1.87 2004/05/11 07:34:11 dhartmei Exp $ */
4 /*      $DragonFly: src/sys/net/pf/pf_norm.c,v 1.10 2008/09/04 09:08:22 hasso Exp $ */
5 /*      $OpenBSD: pf_norm.c,v 1.107 2006/04/16 00:59:52 pascoe Exp $ */
6
7 /*
8  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
9  *
10  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/mbuf.h>
40 #include <sys/filio.h>
41 #include <sys/fcntl.h>
42 #include <sys/socket.h>
43 #include <sys/kernel.h>
44 #include <sys/time.h>
45 #include <vm/vm_zone.h>
46
47 #include <net/if.h>
48 #include <net/if_types.h>
49 #include <net/bpf.h>
50 #include <net/route.h>
51 #include <net/pf/if_pflog.h>
52
53 #include <netinet/in.h>
54 #include <netinet/in_var.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip_var.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/udp.h>
61 #include <netinet/ip_icmp.h>
62
63 #ifdef INET6
64 #include <netinet/ip6.h>
65 #endif /* INET6 */
66
67 #include <net/pf/pfvar.h>
68
69 #define PFFRAG_SEENLAST 0x0001          /* Seen the last fragment for this */
70 #define PFFRAG_NOBUFFER 0x0002          /* Non-buffering fragment cache */
71 #define PFFRAG_DROP     0x0004          /* Drop all fragments */
72 #define BUFFER_FRAGMENTS(fr)    (!((fr)->fr_flags & PFFRAG_NOBUFFER))
73
74
75 TAILQ_HEAD(pf_fragqueue, pf_fragment)   pf_fragqueue;
76 TAILQ_HEAD(pf_cachequeue, pf_fragment)  pf_cachequeue;
77
78 static __inline int      pf_frag_compare(struct pf_fragment *,
79                             struct pf_fragment *);
80 RB_HEAD(pf_frag_tree, pf_fragment)      pf_frag_tree, pf_cache_tree;
81 RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
82 RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
83
/*
 * Prototypes for the helpers defined in this file.
 */
void                     pf_ip2key(struct pf_fragment *key, struct ip *ip);
void                     pf_remove_fragment(struct pf_fragment *frag);
void                     pf_flush_fragments(void);
void                     pf_free_fragment(struct pf_fragment *frag);
struct pf_fragment      *pf_find_fragment(struct ip *ip,
                            struct pf_frag_tree *tree);
struct mbuf             *pf_reassemble(struct mbuf **m0,
                            struct pf_fragment **frag,
                            struct pf_frent *frent, int mff);
struct mbuf             *pf_fragcache(struct mbuf **m0, struct ip *h,
                            struct pf_fragment **frag, int mff, int drop,
                            int *nomem);
int                      pf_normalize_tcpopt(struct pf_rule *r,
                            struct mbuf *m, struct tcphdr *th, int off);
96
/*
 * Debug output helper.  The argument is a complete, parenthesised
 * kprintf() argument list: DPFPRINTF(("fmt %d\n", value));
 * Nothing is printed unless pf_status.debug is at least PF_DEBUG_MISC.
 * Every message is prefixed with the name of the calling function.
 */
#define DPFPRINTF(x) do {                               \
        if (pf_status.debug < PF_DEBUG_MISC)            \
                break;                                  \
        kprintf("%s: ", __func__);                      \
        kprintf x ;                                     \
} while (0)
103
104 /* Globals */
105 vm_zone_t                pf_frent_pl, pf_frag_pl, pf_cache_pl, pf_cent_pl;
106 vm_zone_t                pf_state_scrub_pl;
107 int                      pf_nfrents, pf_ncache;
108
109 void
110 pf_normalize_init(void)
111 {
112         /* XXX
113         pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
114         pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
115         pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
116         pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);
117         */
118
119         TAILQ_INIT(&pf_fragqueue);
120         TAILQ_INIT(&pf_cachequeue);
121 }
122
123 static __inline int
124 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
125 {
126         int     diff;
127
128         if ((diff = a->fr_id - b->fr_id))
129                 return (diff);
130         else if ((diff = a->fr_p - b->fr_p))
131                 return (diff);
132         else if (a->fr_src.s_addr < b->fr_src.s_addr)
133                 return (-1);
134         else if (a->fr_src.s_addr > b->fr_src.s_addr)
135                 return (1);
136         else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
137                 return (-1);
138         else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
139                 return (1);
140         return (0);
141 }
142
143 void
144 pf_purge_expired_fragments(void)
145 {
146         struct pf_fragment      *frag;
147         u_int32_t                expire = time_second -
148                                     pf_default_rule.timeout[PFTM_FRAG];
149
150         while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
151                 KASSERT((BUFFER_FRAGMENTS(frag)),
152                         ("BUFFER_FRAGMENTS(frag) == 0: %s", __func__));
153                 if (frag->fr_timeout > expire)
154                         break;
155
156                 DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
157                 pf_free_fragment(frag);
158         }
159
160         while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
161                 KASSERT((!BUFFER_FRAGMENTS(frag)),
162                         ("BUFFER_FRAGMENTS(frag) != 0: %s", __func__));
163                 if (frag->fr_timeout > expire)
164                         break;
165
166                 DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
167                 pf_free_fragment(frag);
168                 KASSERT((TAILQ_EMPTY(&pf_cachequeue) ||
169                     TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag),
170                     ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s",
171                     __func__));
172         }
173 }
174
175 /*
176  * Try to flush old fragments to make space for new ones
177  */
178
179 void
180 pf_flush_fragments(void)
181 {
182         struct pf_fragment      *frag;
183         int                      goal;
184
185         goal = pf_nfrents * 9 / 10;
186         DPFPRINTF(("trying to free > %d frents\n",
187             pf_nfrents - goal));
188         while (goal < pf_nfrents) {
189                 frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
190                 if (frag == NULL)
191                         break;
192                 pf_free_fragment(frag);
193         }
194
195
196         goal = pf_ncache * 9 / 10;
197         DPFPRINTF(("trying to free > %d cache entries\n",
198             pf_ncache - goal));
199         while (goal < pf_ncache) {
200                 frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
201                 if (frag == NULL)
202                         break;
203                 pf_free_fragment(frag);
204         }
205 }
206
207 /* Frees the fragments and all associated entries */
208
209 void
210 pf_free_fragment(struct pf_fragment *frag)
211 {
212         struct pf_frent         *frent;
213         struct pf_frcache       *frcache;
214
215         /* Free all fragments */
216         if (BUFFER_FRAGMENTS(frag)) {
217                 for (frent = LIST_FIRST(&frag->fr_queue); frent;
218                     frent = LIST_FIRST(&frag->fr_queue)) {
219                         LIST_REMOVE(frent, fr_next);
220
221                         m_freem(frent->fr_m);
222                         pool_put(&pf_frent_pl, frent);
223                         pf_nfrents--;
224                 }
225         } else {
226                 for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
227                     frcache = LIST_FIRST(&frag->fr_cache)) {
228                         LIST_REMOVE(frcache, fr_next);
229
230                         KASSERT((LIST_EMPTY(&frag->fr_cache) ||
231                             LIST_FIRST(&frag->fr_cache)->fr_off >
232                             frcache->fr_end),
233                             ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
234                              " frcache->fr_end): %s", __func__));
235
236                         pool_put(&pf_cent_pl, frcache);
237                         pf_ncache--;
238                 }
239         }
240
241         pf_remove_fragment(frag);
242 }
243
244 void
245 pf_ip2key(struct pf_fragment *key, struct ip *ip)
246 {
247         key->fr_p = ip->ip_p;
248         key->fr_id = ip->ip_id;
249         key->fr_src.s_addr = ip->ip_src.s_addr;
250         key->fr_dst.s_addr = ip->ip_dst.s_addr;
251 }
252
253 struct pf_fragment *
254 pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
255 {
256         struct pf_fragment       key;
257         struct pf_fragment      *frag;
258
259         pf_ip2key(&key, ip);
260
261         frag = RB_FIND(pf_frag_tree, tree, &key);
262         if (frag != NULL) {
263                 /* XXX Are we sure we want to update the timeout? */
264                 frag->fr_timeout = time_second;
265                 if (BUFFER_FRAGMENTS(frag)) {
266                         TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
267                         TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
268                 } else {
269                         TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
270                         TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
271                 }
272         }
273
274         return (frag);
275 }
276
277 /* Removes a fragment from the fragment queue and frees the fragment */
278
279 void
280 pf_remove_fragment(struct pf_fragment *frag)
281 {
282         if (BUFFER_FRAGMENTS(frag)) {
283                 RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
284                 TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
285                 pool_put(&pf_frag_pl, frag);
286         } else {
287                 RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
288                 TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
289                 pool_put(&pf_cache_pl, frag);
290         }
291 }
292
293 #define FR_IP_OFF(fr)   ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
294 struct mbuf *
295 pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
296     struct pf_frent *frent, int mff)
297 {
298         struct mbuf     *m = *m0, *m2;
299         struct pf_frent *frea, *next;
300         struct pf_frent *frep = NULL;
301         struct ip       *ip = frent->fr_ip;
302         int              hlen = ip->ip_hl << 2;
303         u_int16_t        off = (ip->ip_off & IP_OFFMASK) << 3;
304         u_int16_t        ip_len = ip->ip_len - ip->ip_hl * 4;
305         u_int16_t        max = ip_len + off;
306
307         KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
308             ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __func__));
309
310         /* Strip off ip header */
311         m->m_data += hlen;
312         m->m_len -= hlen;
313
314         /* Create a new reassembly queue for this packet */
315         if (*frag == NULL) {
316                 *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
317                 if (*frag == NULL) {
318                         pf_flush_fragments();
319                         *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
320                         if (*frag == NULL)
321                                 goto drop_fragment;
322                 }
323
324                 (*frag)->fr_flags = 0;
325                 (*frag)->fr_max = 0;
326                 (*frag)->fr_src = frent->fr_ip->ip_src;
327                 (*frag)->fr_dst = frent->fr_ip->ip_dst;
328                 (*frag)->fr_p = frent->fr_ip->ip_p;
329                 (*frag)->fr_id = frent->fr_ip->ip_id;
330                 (*frag)->fr_timeout = time_second;
331                 LIST_INIT(&(*frag)->fr_queue);
332
333                 RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
334                 TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);
335
336                 /* We do not have a previous fragment */
337                 frep = NULL;
338                 goto insert;
339         }
340
341         /*
342          * Find a fragment after the current one:
343          *  - off contains the real shifted offset.
344          */
345         LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
346                 if (FR_IP_OFF(frea) > off)
347                         break;
348                 frep = frea;
349         }
350
351         KASSERT((frep != NULL || frea != NULL),
352             ("!(frep != NULL || frea != NULL): %s", __func__));
353
354         if (frep != NULL &&
355             FR_IP_OFF(frep) + frep->fr_ip->ip_len - frep->fr_ip->ip_hl *
356             4 > off)
357         {
358                 u_int16_t       precut;
359
360                 precut = FR_IP_OFF(frep) + frep->fr_ip->ip_len -
361                     frep->fr_ip->ip_hl * 4 - off;
362                 if (precut >= ip_len)
363                         goto drop_fragment;
364                 m_adj(frent->fr_m, precut);
365                 DPFPRINTF(("overlap -%d\n", precut));
366                 /* Enforce 8 byte boundaries */
367                 ip->ip_off = ip->ip_off + (precut >> 3);
368                 off = (ip->ip_off & IP_OFFMASK) << 3;
369                 ip_len -= precut;
370                 ip->ip_len = ip_len;
371         }
372
373         for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
374             frea = next)
375         {
376                 u_int16_t       aftercut;
377
378                 aftercut = ip_len + off - FR_IP_OFF(frea);
379                 DPFPRINTF(("adjust overlap %d\n", aftercut));
380                 if (aftercut < frea->fr_ip->ip_len - frea->fr_ip->ip_hl
381                     * 4)
382                 {
383                         frea->fr_ip->ip_len =
384                             frea->fr_ip->ip_len - aftercut;
385                         frea->fr_ip->ip_off = frea->fr_ip->ip_off +
386                             (aftercut >> 3);
387                         m_adj(frea->fr_m, aftercut);
388                         break;
389                 }
390
391                 /* This fragment is completely overlapped, lose it */
392                 next = LIST_NEXT(frea, fr_next);
393                 m_freem(frea->fr_m);
394                 LIST_REMOVE(frea, fr_next);
395                 pool_put(&pf_frent_pl, frea);
396                 pf_nfrents--;
397         }
398
399  insert:
400         /* Update maximum data size */
401         if ((*frag)->fr_max < max)
402                 (*frag)->fr_max = max;
403         /* This is the last segment */
404         if (!mff)
405                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
406
407         if (frep == NULL)
408                 LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
409         else
410                 LIST_INSERT_AFTER(frep, frent, fr_next);
411
412         /* Check if we are completely reassembled */
413         if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
414                 return (NULL);
415
416         /* Check if we have all the data */
417         off = 0;
418         for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
419                 next = LIST_NEXT(frep, fr_next);
420
421                 off += frep->fr_ip->ip_len - frep->fr_ip->ip_hl * 4;
422                 if (off < (*frag)->fr_max &&
423                     (next == NULL || FR_IP_OFF(next) != off))
424                 {
425                         DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
426                             off, next == NULL ? -1 : FR_IP_OFF(next),
427                             (*frag)->fr_max));
428                         return (NULL);
429                 }
430         }
431         DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
432         if (off < (*frag)->fr_max)
433                 return (NULL);
434
435         /* We have all the data */
436         frent = LIST_FIRST(&(*frag)->fr_queue);
437         KASSERT((frent != NULL), ("frent == NULL: %s", __func__));
438         if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
439                 DPFPRINTF(("drop: too big: %d\n", off));
440                 pf_free_fragment(*frag);
441                 *frag = NULL;
442                 return (NULL);
443         }
444         next = LIST_NEXT(frent, fr_next);
445
446         /* Magic from ip_input */
447         ip = frent->fr_ip;
448         m = frent->fr_m;
449         m2 = m->m_next;
450         m->m_next = NULL;
451         m_cat(m, m2);
452         pool_put(&pf_frent_pl, frent);
453         pf_nfrents--;
454         for (frent = next; frent != NULL; frent = next) {
455                 next = LIST_NEXT(frent, fr_next);
456
457                 m2 = frent->fr_m;
458                 pool_put(&pf_frent_pl, frent);
459                 pf_nfrents--;
460                 m_cat(m, m2);
461         }
462
463         ip->ip_src = (*frag)->fr_src;
464         ip->ip_dst = (*frag)->fr_dst;
465
466         /* Remove from fragment queue */
467         pf_remove_fragment(*frag);
468         *frag = NULL;
469
470         hlen = ip->ip_hl << 2;
471         ip->ip_len = off + hlen;
472         m->m_len += hlen;
473         m->m_data -= hlen;
474
475         /* some debugging cruft by sklower, below, will go away soon */
476         /* XXX this should be done elsewhere */
477         if (m->m_flags & M_PKTHDR) {
478                 int plen = 0;
479                 for (m2 = m; m2; m2 = m2->m_next)
480                         plen += m2->m_len;
481                 m->m_pkthdr.len = plen;
482         }
483
484         DPFPRINTF(("complete: %p(%d)\n", m, ip->ip_len));
485         return (m);
486
487  drop_fragment:
488         /* Oops - fail safe - drop packet */
489         pool_put(&pf_frent_pl, frent);
490         pf_nfrents--;
491         m_freem(m);
492         return (NULL);
493 }
494
495 struct mbuf *
496 pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
497     int drop, int *nomem)
498 {
499         struct mbuf             *m = *m0;
500         struct pf_frcache       *frp, *fra, *cur = NULL;
501         int                      ip_len = h->ip_len - (h->ip_hl << 2);
502         u_int16_t                off = h->ip_off << 3;
503         u_int16_t                max = ip_len + off;
504         int                      hosed = 0;
505
506         KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
507             ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __func__));
508
509         /* Create a new range queue for this packet */
510         if (*frag == NULL) {
511                 *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
512                 if (*frag == NULL) {
513                         pf_flush_fragments();
514                         *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
515                         if (*frag == NULL)
516                                 goto no_mem;
517                 }
518
519                 /* Get an entry for the queue */
520                 cur = pool_get(&pf_cent_pl, PR_NOWAIT);
521                 if (cur == NULL) {
522                         pool_put(&pf_cache_pl, *frag);
523                         *frag = NULL;
524                         goto no_mem;
525                 }
526                 pf_ncache++;
527
528                 (*frag)->fr_flags = PFFRAG_NOBUFFER;
529                 (*frag)->fr_max = 0;
530                 (*frag)->fr_src = h->ip_src;
531                 (*frag)->fr_dst = h->ip_dst;
532                 (*frag)->fr_p = h->ip_p;
533                 (*frag)->fr_id = h->ip_id;
534                 (*frag)->fr_timeout = time_second;
535
536                 cur->fr_off = off;
537                 cur->fr_end = max;
538                 LIST_INIT(&(*frag)->fr_cache);
539                 LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);
540
541                 RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
542                 TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);
543
544                 DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
545
546                 goto pass;
547         }
548
549         /*
550          * Find a fragment after the current one:
551          *  - off contains the real shifted offset.
552          */
553         frp = NULL;
554         LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
555                 if (fra->fr_off > off)
556                         break;
557                 frp = fra;
558         }
559
560         KASSERT((frp != NULL || fra != NULL),
561             ("!(frp != NULL || fra != NULL): %s", __func__));
562
563         if (frp != NULL) {
564                 int     precut;
565
566                 precut = frp->fr_end - off;
567                 if (precut >= ip_len) {
568                         /* Fragment is entirely a duplicate */
569                         DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
570                             h->ip_id, frp->fr_off, frp->fr_end, off, max));
571                         goto drop_fragment;
572                 }
573                 if (precut == 0) {
574                         /* They are adjacent.  Fixup cache entry */
575                         DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
576                             h->ip_id, frp->fr_off, frp->fr_end, off, max));
577                         frp->fr_end = max;
578                 } else if (precut > 0) {
579                         /* The first part of this payload overlaps with a
580                          * fragment that has already been passed.
581                          * Need to trim off the first part of the payload.
582                          * But to do so easily, we need to create another
583                          * mbuf to throw the original header into.
584                          */
585
586                         DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
587                             h->ip_id, precut, frp->fr_off, frp->fr_end, off,
588                             max));
589
590                         off += precut;
591                         max -= precut;
592                         /* Update the previous frag to encompass this one */
593                         frp->fr_end = max;
594
595                         if (!drop) {
596                                 /* XXX Optimization opportunity
597                                  * This is a very heavy way to trim the payload.
598                                  * we could do it much faster by diddling mbuf
599                                  * internals but that would be even less legible
600                                  * than this mbuf magic.  For my next trick,
601                                  * I'll pull a rabbit out of my laptop.
602                                  */
603                                 *m0 = m_dup(m, MB_DONTWAIT);
604                                 /* From KAME Project : We have missed this! */
605                                 m_adj(*m0, (h->ip_hl << 2) -
606                                     (*m0)->m_pkthdr.len);
607                                 if (*m0 == NULL)
608                                         goto no_mem;
609                                 KASSERT(((*m0)->m_next == NULL), 
610                                     ("(*m0)->m_next != NULL: %s", 
611                                     __func__));
612                                 m_adj(m, precut + (h->ip_hl << 2));
613                                 m_cat(*m0, m);
614                                 m = *m0;
615                                 if (m->m_flags & M_PKTHDR) {
616                                         int plen = 0;
617                                         struct mbuf *t;
618                                         for (t = m; t; t = t->m_next)
619                                                 plen += t->m_len;
620                                         m->m_pkthdr.len = plen;
621                                 }
622
623
624                                 h = mtod(m, struct ip *);
625
626                                 KASSERT(((int)m->m_len ==
627                                     h->ip_len - precut),
628                                     ("m->m_len != h->ip_len - precut: %s",
629                                     __func__));
630                                 h->ip_off = h->ip_off +
631                                     (precut >> 3);
632                                 h->ip_len = h->ip_len - precut;
633                         } else {
634                                 hosed++;
635                         }
636                 } else {
637                         /* There is a gap between fragments */
638
639                         DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
640                             h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
641                             max));
642
643                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
644                         if (cur == NULL)
645                                 goto no_mem;
646                         pf_ncache++;
647
648                         cur->fr_off = off;
649                         cur->fr_end = max;
650                         LIST_INSERT_AFTER(frp, cur, fr_next);
651                 }
652         }
653
654         if (fra != NULL) {
655                 int     aftercut;
656                 int     merge = 0;
657
658                 aftercut = max - fra->fr_off;
659                 if (aftercut == 0) {
660                         /* Adjacent fragments */
661                         DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
662                             h->ip_id, off, max, fra->fr_off, fra->fr_end));
663                         fra->fr_off = off;
664                         merge = 1;
665                 } else if (aftercut > 0) {
666                         /* Need to chop off the tail of this fragment */
667                         DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
668                             h->ip_id, aftercut, off, max, fra->fr_off,
669                             fra->fr_end));
670                         fra->fr_off = off;
671                         max -= aftercut;
672
673                         merge = 1;
674
675                         if (!drop) {
676                                 m_adj(m, -aftercut);
677                                 if (m->m_flags & M_PKTHDR) {
678                                         int plen = 0;
679                                         struct mbuf *t;
680                                         for (t = m; t; t = t->m_next)
681                                                 plen += t->m_len;
682                                         m->m_pkthdr.len = plen;
683                                 }
684                                 h = mtod(m, struct ip *);
685                                 KASSERT(((int)m->m_len == h->ip_len - aftercut),
686                                     ("m->m_len != h->ip_len - aftercut: %s",
687                                     __func__));
688                                 h->ip_len = h->ip_len - aftercut;
689                         } else {
690                                 hosed++;
691                         }
692                 } else if (frp == NULL) {
693                         /* There is a gap between fragments */
694                         DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
695                             h->ip_id, -aftercut, off, max, fra->fr_off,
696                             fra->fr_end));
697
698                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
699                         if (cur == NULL)
700                                 goto no_mem;
701                         pf_ncache++;
702
703                         cur->fr_off = off;
704                         cur->fr_end = max;
705                         LIST_INSERT_BEFORE(fra, cur, fr_next);
706                 }
707
708
709                 /* Need to glue together two separate fragment descriptors */
710                 if (merge) {
711                         if (cur && fra->fr_off <= cur->fr_end) {
712                                 /* Need to merge in a previous 'cur' */
713                                 DPFPRINTF(("fragcache[%d]: adjacent(merge "
714                                     "%d-%d) %d-%d (%d-%d)\n",
715                                     h->ip_id, cur->fr_off, cur->fr_end, off,
716                                     max, fra->fr_off, fra->fr_end));
717                                 fra->fr_off = cur->fr_off;
718                                 LIST_REMOVE(cur, fr_next);
719                                 pool_put(&pf_cent_pl, cur);
720                                 pf_ncache--;
721                                 cur = NULL;
722
723                         } else if (frp && fra->fr_off <= frp->fr_end) {
724                                 /* Need to merge in a modified 'frp' */
725                                 KASSERT((cur == NULL), ("cur != NULL: %s",
726                                     __func__));
727                                 DPFPRINTF(("fragcache[%d]: adjacent(merge "
728                                     "%d-%d) %d-%d (%d-%d)\n",
729                                     h->ip_id, frp->fr_off, frp->fr_end, off,
730                                     max, fra->fr_off, fra->fr_end));
731                                 fra->fr_off = frp->fr_off;
732                                 LIST_REMOVE(frp, fr_next);
733                                 pool_put(&pf_cent_pl, frp);
734                                 pf_ncache--;
735                                 frp = NULL;
736
737                         }
738                 }
739         }
740
741         if (hosed) {
742                 /*
743                  * We must keep tracking the overall fragment even when
744                  * we're going to drop it anyway so that we know when to
745                  * free the overall descriptor.  Thus we drop the frag late.
746                  */
747                 goto drop_fragment;
748         }
749
750
751  pass:
752         /* Update maximum data size */
753         if ((*frag)->fr_max < max)
754                 (*frag)->fr_max = max;
755
756         /* This is the last segment */
757         if (!mff)
758                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
759
760         /* Check if we are completely reassembled */
761         if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
762             LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
763             LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
764                 /* Remove from fragment queue */
765                 DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
766                     (*frag)->fr_max));
767                 pf_free_fragment(*frag);
768                 *frag = NULL;
769         }
770
771         return (m);
772
773  no_mem:
774         *nomem = 1;
775
776         /* Still need to pay attention to !IP_MF */
777         if (!mff && *frag != NULL)
778                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
779
780         m_freem(m);
781         return (NULL);
782
783  drop_fragment:
784
785         /* Still need to pay attention to !IP_MF */
786         if (!mff && *frag != NULL)
787                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
788
789         if (drop) {
790                 /* This fragment has been deemed bad.  Don't reass */
791                 if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
792                         DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
793                             h->ip_id));
794                 (*frag)->fr_flags |= PFFRAG_DROP;
795         }
796
797         m_freem(m);
798         return (NULL);
799 }
800
801 int
802 pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
803     struct pf_pdesc *pd)
804 {
805         struct mbuf             *m = *m0;
806         struct pf_rule          *r;
807         struct pf_frent         *frent;
808         struct pf_fragment      *frag = NULL;
809         struct ip               *h = mtod(m, struct ip *);
810         int                      mff = (h->ip_off & IP_MF);
811         int                      hlen = h->ip_hl << 2;
812         u_int16_t                fragoff = (h->ip_off & IP_OFFMASK) << 3;
813         u_int16_t                max;
814         int                      ip_len;
815         int                      ip_off;
816
817         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
818         while (r != NULL) {
819                 r->evaluations++;
820                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
821                         r = r->skip[PF_SKIP_IFP].ptr;
822                 else if (r->direction && r->direction != dir)
823                         r = r->skip[PF_SKIP_DIR].ptr;
824                 else if (r->af && r->af != AF_INET)
825                         r = r->skip[PF_SKIP_AF].ptr;
826                 else if (r->proto && r->proto != h->ip_p)
827                         r = r->skip[PF_SKIP_PROTO].ptr;
828                 else if (PF_MISMATCHAW(&r->src.addr,
829                     (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
830                     r->src.neg, kif))
831                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
832                 else if (PF_MISMATCHAW(&r->dst.addr,
833                     (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
834                     r->dst.neg, NULL))
835                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
836                 else
837                         break;
838         }
839
840         if (r == NULL || r->action == PF_NOSCRUB)
841                 return (PF_PASS);
842         else {
843                 r->packets[dir == PF_OUT]++;
844                 r->bytes[dir == PF_OUT] += pd->tot_len;
845         }
846
847         /* Check for illegal packets */
848         if (hlen < (int)sizeof(struct ip))
849                 goto drop;
850
851         if (hlen > h->ip_len)
852                 goto drop;
853
854         /* Clear IP_DF if the rule uses the no-df option */
855         if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
856                 u_int16_t ip_off = h->ip_off;
857
858                 h->ip_off &= ~IP_DF;
859                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
860         }
861
862         /* We will need other tests here */
863         if (!fragoff && !mff)
864                 goto no_fragment;
865
866         /* We're dealing with a fragment now. Don't allow fragments
867          * with IP_DF to enter the cache. If the flag was cleared by
868          * no-df above, fine. Otherwise drop it.
869          */
870         if (h->ip_off & IP_DF) {
871                 DPFPRINTF(("IP_DF\n"));
872                 goto bad;
873         }
874
875         ip_len = h->ip_len - hlen;
876         ip_off = (h->ip_off & IP_OFFMASK) << 3;
877
878         /* All fragments are 8 byte aligned */
879         if (mff && (ip_len & 0x7)) {
880                 DPFPRINTF(("mff and %d\n", ip_len));
881                 goto bad;
882         }
883
884         /* Respect maximum length */
885         if (fragoff + ip_len > IP_MAXPACKET) {
886                 DPFPRINTF(("max packet %d\n", fragoff + ip_len));
887                 goto bad;
888         }
889         max = fragoff + ip_len;
890
891         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
892                 /* Fully buffer all of the fragments */
893
894                 frag = pf_find_fragment(h, &pf_frag_tree);
895
896                 /* Check if we saw the last fragment already */
897                 if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
898                     max > frag->fr_max)
899                         goto bad;
900
901                 /* Get an entry for the fragment queue */
902                 frent = pool_get(&pf_frent_pl, PR_NOWAIT);
903                 if (frent == NULL) {
904                         REASON_SET(reason, PFRES_MEMORY);
905                         return (PF_DROP);
906                 }
907                 pf_nfrents++;
908                 frent->fr_ip = h;
909                 frent->fr_m = m;
910
911                 /* Might return a completely reassembled mbuf, or NULL */
912                 DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
913                 *m0 = m = pf_reassemble(m0, &frag, frent, mff);
914
915                 if (m == NULL)
916                         return (PF_DROP);
917
918                 /* use mtag from concatenated mbuf chain */
919                 pd->pf_mtag = pf_find_mtag(m);
920 #ifdef DIAGNOSTIC
921                 if (pd->pf_mtag == NULL) {
922                         kprintf("%s: pf_find_mtag returned NULL(1)\n", __func__);
923                         if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
924                                 m_freem(m);
925                                 *m0 = NULL;
926                                 goto no_mem;
927                         }
928                 }
929 #endif
930                 if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
931                         goto drop;
932
933                 h = mtod(m, struct ip *);
934         } else {
935                 /* non-buffering fragment cache (drops or masks overlaps) */
936                 int     nomem = 0;
937
938                 if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) {
939                         /*
940                          * Already passed the fragment cache in the
941                          * input direction.  If we continued, it would
942                          * appear to be a dup and would be dropped.
943                          */
944                         goto fragment_pass;
945                 }
946
947                 frag = pf_find_fragment(h, &pf_cache_tree);
948
949                 /* Check if we saw the last fragment already */
950                 if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
951                     max > frag->fr_max) {
952                         if (r->rule_flag & PFRULE_FRAGDROP)
953                                 frag->fr_flags |= PFFRAG_DROP;
954                         goto bad;
955                 }
956
957                 *m0 = m = pf_fragcache(m0, h, &frag, mff,
958                     (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
959                 if (m == NULL) {
960                         if (nomem)
961                                 goto no_mem;
962                         goto drop;
963                 }
964
965                 /* use mtag from copied and trimmed mbuf chain */
966                 pd->pf_mtag = pf_find_mtag(m);
967 #ifdef DIAGNOSTIC
968                 if (pd->pf_mtag == NULL) {
969                         kprintf("%s: pf_find_mtag returned NULL(2)\n", __func__);
970                         if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
971                                 m_freem(m);
972                                 *m0 = NULL;
973                                 goto no_mem;
974                         }
975                 }
976 #endif
977                 if (dir == PF_IN)
978                         pd->pf_mtag->flags |= PF_TAG_FRAGCACHE;
979
980                 if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
981                         goto drop;
982                 goto fragment_pass;
983         }
984
985  no_fragment:
986         /* At this point, only IP_DF is allowed in ip_off */
987         if (h->ip_off & IP_DF) {
988                 u_int16_t ip_off = h->ip_off;
989
990                 h->ip_off &= IP_DF;
991                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
992         }
993
994         /* Enforce a minimum ttl, may cause endless packet loops */
995         if (r->min_ttl && h->ip_ttl < r->min_ttl) {
996                 u_int16_t ip_ttl = h->ip_ttl;
997
998                 h->ip_ttl = r->min_ttl;
999                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1000         }
1001
1002         if (r->rule_flag & PFRULE_RANDOMID) {
1003                 u_int16_t ip_id = h->ip_id;
1004
1005                 h->ip_id = ip_randomid();
1006                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
1007         }
1008         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1009                 pd->flags |= PFDESC_IP_REAS;
1010
1011         return (PF_PASS);
1012
1013  fragment_pass:
1014         /* Enforce a minimum ttl, may cause endless packet loops */
1015         if (r->min_ttl && h->ip_ttl < r->min_ttl) {
1016                 u_int16_t ip_ttl = h->ip_ttl;
1017
1018                 h->ip_ttl = r->min_ttl;
1019                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1020         }
1021         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1022                 pd->flags |= PFDESC_IP_REAS;
1023         return (PF_PASS);
1024
1025  no_mem:
1026         REASON_SET(reason, PFRES_MEMORY);
1027         if (r != NULL && r->log)
1028                 PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1029         return (PF_DROP);
1030
1031  drop:
1032         REASON_SET(reason, PFRES_NORM);
1033         if (r != NULL && r->log)
1034                 PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1035         return (PF_DROP);
1036
1037  bad:
1038         DPFPRINTF(("dropping bad fragment\n"));
1039
1040         /* Free associated fragments */
1041         if (frag != NULL)
1042                 pf_free_fragment(frag);
1043
1044         REASON_SET(reason, PFRES_FRAG);
1045         if (r != NULL && r->log)
1046                 PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1047
1048         return (PF_DROP);
1049 }
1050
1051 #ifdef INET6
1052 int
1053 pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
1054     u_short *reason, struct pf_pdesc *pd)
1055 {
1056         struct mbuf             *m = *m0;
1057         struct pf_rule          *r;
1058         struct ip6_hdr          *h = mtod(m, struct ip6_hdr *);
1059         int                      off;
1060         struct ip6_ext           ext;
1061         struct ip6_opt           opt;
1062         struct ip6_opt_jumbo     jumbo;
1063         struct ip6_frag          frag;
1064         u_int32_t                jumbolen = 0, plen;
1065         u_int16_t                fragoff = 0;
1066         int                      optend;
1067         int                      ooff;
1068         u_int8_t                 proto;
1069         int                      terminal;
1070
1071         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1072         while (r != NULL) {
1073                 r->evaluations++;
1074                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
1075                         r = r->skip[PF_SKIP_IFP].ptr;
1076                 else if (r->direction && r->direction != dir)
1077                         r = r->skip[PF_SKIP_DIR].ptr;
1078                 else if (r->af && r->af != AF_INET6)
1079                         r = r->skip[PF_SKIP_AF].ptr;
1080 #if 0 /* header chain! */
1081                 else if (r->proto && r->proto != h->ip6_nxt)
1082                         r = r->skip[PF_SKIP_PROTO].ptr;
1083 #endif
1084                 else if (PF_MISMATCHAW(&r->src.addr,
1085                     (struct pf_addr *)&h->ip6_src, AF_INET6,
1086                     r->src.neg, kif))
1087                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1088                 else if (PF_MISMATCHAW(&r->dst.addr,
1089                     (struct pf_addr *)&h->ip6_dst, AF_INET6,
1090                     r->dst.neg, NULL))
1091                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
1092                 else
1093                         break;
1094         }
1095
1096         if (r == NULL || r->action == PF_NOSCRUB)
1097                 return (PF_PASS);
1098         else {
1099                 r->packets[dir == PF_OUT]++;
1100                 r->bytes[dir == PF_OUT] += pd->tot_len;
1101         }
1102
1103         /* Check for illegal packets */
1104         if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
1105                 goto drop;
1106
1107         off = sizeof(struct ip6_hdr);
1108         proto = h->ip6_nxt;
1109         terminal = 0;
1110         do {
1111                 switch (proto) {
1112                 case IPPROTO_FRAGMENT:
1113                         goto fragment;
1114                         break;
1115                 case IPPROTO_AH:
1116                 case IPPROTO_ROUTING:
1117                 case IPPROTO_DSTOPTS:
1118                         if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1119                             NULL, AF_INET6))
1120                                 goto shortpkt;
1121                         if (proto == IPPROTO_AH)
1122                                 off += (ext.ip6e_len + 2) * 4;
1123                         else
1124                                 off += (ext.ip6e_len + 1) * 8;
1125                         proto = ext.ip6e_nxt;
1126                         break;
1127                 case IPPROTO_HOPOPTS:
1128                         if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1129                             NULL, AF_INET6))
1130                                 goto shortpkt;
1131                         optend = off + (ext.ip6e_len + 1) * 8;
1132                         ooff = off + sizeof(ext);
1133                         do {
1134                                 if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
1135                                     sizeof(opt.ip6o_type), NULL, NULL,
1136                                     AF_INET6))
1137                                         goto shortpkt;
1138                                 if (opt.ip6o_type == IP6OPT_PAD1) {
1139                                         ooff++;
1140                                         continue;
1141                                 }
1142                                 if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
1143                                     NULL, NULL, AF_INET6))
1144                                         goto shortpkt;
1145                                 if (ooff + sizeof(opt) + opt.ip6o_len > optend)
1146                                         goto drop;
1147                                 switch (opt.ip6o_type) {
1148                                 case IP6OPT_JUMBO:
1149                                         if (h->ip6_plen != 0)
1150                                                 goto drop;
1151                                         if (!pf_pull_hdr(m, ooff, &jumbo,
1152                                             sizeof(jumbo), NULL, NULL,
1153                                             AF_INET6))
1154                                                 goto shortpkt;
1155                                         memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
1156                                             sizeof(jumbolen));
1157                                         jumbolen = ntohl(jumbolen);
1158                                         if (jumbolen <= IPV6_MAXPACKET)
1159                                                 goto drop;
1160                                         if (sizeof(struct ip6_hdr) + jumbolen !=
1161                                             m->m_pkthdr.len)
1162                                                 goto drop;
1163                                         break;
1164                                 default:
1165                                         break;
1166                                 }
1167                                 ooff += sizeof(opt) + opt.ip6o_len;
1168                         } while (ooff < optend);
1169
1170                         off = optend;
1171                         proto = ext.ip6e_nxt;
1172                         break;
1173                 default:
1174                         terminal = 1;
1175                         break;
1176                 }
1177         } while (!terminal);
1178
1179         /* jumbo payload option must be present, or plen > 0 */
1180         if (ntohs(h->ip6_plen) == 0)
1181                 plen = jumbolen;
1182         else
1183                 plen = ntohs(h->ip6_plen);
1184         if (plen == 0)
1185                 goto drop;
1186         if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
1187                 goto shortpkt;
1188
1189         /* Enforce a minimum ttl, may cause endless packet loops */
1190         if (r->min_ttl && h->ip6_hlim < r->min_ttl)
1191                 h->ip6_hlim = r->min_ttl;
1192
1193         return (PF_PASS);
1194
1195  fragment:
1196         if (ntohs(h->ip6_plen) == 0 || jumbolen)
1197                 goto drop;
1198         plen = ntohs(h->ip6_plen);
1199
1200         if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
1201                 goto shortpkt;
1202         fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
1203         if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
1204                 goto badfrag;
1205
1206         /* do something about it */
1207         /* remember to set pd->flags |= PFDESC_IP_REAS */
1208         return (PF_PASS);
1209
1210  shortpkt:
1211         REASON_SET(reason, PFRES_SHORT);
1212         if (r != NULL && r->log)
1213                 PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1214         return (PF_DROP);
1215
1216  drop:
1217         REASON_SET(reason, PFRES_NORM);
1218         if (r != NULL && r->log)
1219                 PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1220         return (PF_DROP);
1221
1222  badfrag:
1223         REASON_SET(reason, PFRES_FRAG);
1224         if (r != NULL && r->log)
1225                 PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
1226         return (PF_DROP);
1227 }
1228 #endif /* INET6 */
1229
1230 int
1231 pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
1232     int off, void *h, struct pf_pdesc *pd)
1233 {
1234         struct pf_rule  *r, *rm = NULL;
1235         struct tcphdr   *th = pd->hdr.tcp;
1236         int              rewrite = 0;
1237         u_short          reason;
1238         u_int8_t         flags;
1239         sa_family_t      af = pd->af;
1240
1241         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1242         while (r != NULL) {
1243                 r->evaluations++;
1244                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
1245                         r = r->skip[PF_SKIP_IFP].ptr;
1246                 else if (r->direction && r->direction != dir)
1247                         r = r->skip[PF_SKIP_DIR].ptr;
1248                 else if (r->af && r->af != af)
1249                         r = r->skip[PF_SKIP_AF].ptr;
1250                 else if (r->proto && r->proto != pd->proto)
1251                         r = r->skip[PF_SKIP_PROTO].ptr;
1252                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
1253                     r->src.neg, kif))
1254                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1255                 else if (r->src.port_op && !pf_match_port(r->src.port_op,
1256                             r->src.port[0], r->src.port[1], th->th_sport))
1257                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
1258                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
1259                     r->dst.neg, NULL))
1260                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
1261                 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
1262                             r->dst.port[0], r->dst.port[1], th->th_dport))
1263                         r = r->skip[PF_SKIP_DST_PORT].ptr;
1264                 else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
1265                             pf_osfp_fingerprint(pd, m, off, th),
1266                             r->os_fingerprint))
1267                         r = TAILQ_NEXT(r, entries);
1268                 else {
1269                         rm = r;
1270                         break;
1271                 }
1272         }
1273
1274         if (rm == NULL || rm->action == PF_NOSCRUB)
1275                 return (PF_PASS);
1276         else {
1277                 r->packets[dir == PF_OUT]++;
1278                 r->bytes[dir == PF_OUT] += pd->tot_len;
1279         }
1280
1281         if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
1282                 pd->flags |= PFDESC_TCP_NORM;
1283
1284         flags = th->th_flags;
1285         if (flags & TH_SYN) {
1286                 /* Illegal packet */
1287                 if (flags & TH_RST)
1288                         goto tcp_drop;
1289
1290                 if (flags & TH_FIN)
1291                         flags &= ~TH_FIN;
1292         } else {
1293                 /* Illegal packet */
1294                 if (!(flags & (TH_ACK|TH_RST)))
1295                         goto tcp_drop;
1296         }
1297
1298         if (!(flags & TH_ACK)) {
1299                 /* These flags are only valid if ACK is set */
1300                 if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
1301                         goto tcp_drop;
1302         }
1303
1304         /* Check for illegal header length */
1305         if (th->th_off < (sizeof(struct tcphdr) >> 2))
1306                 goto tcp_drop;
1307
1308         /* If flags changed, or reserved data set, then adjust */
1309         if (flags != th->th_flags || th->th_x2 != 0) {
1310                 u_int16_t       ov, nv;
1311
1312                 ov = *(u_int16_t *)(&th->th_ack + 1);
1313                 th->th_flags = flags;
1314                 th->th_x2 = 0;
1315                 nv = *(u_int16_t *)(&th->th_ack + 1);
1316
1317                 th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
1318                 rewrite = 1;
1319         }
1320
1321         /* Remove urgent pointer, if TH_URG is not set */
1322         if (!(flags & TH_URG) && th->th_urp) {
1323                 th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
1324                 th->th_urp = 0;
1325                 rewrite = 1;
1326         }
1327
1328         /* Process options */
1329         if (r->max_mss && pf_normalize_tcpopt(r, m, th, off))
1330                 rewrite = 1;
1331
1332         /* copy back packet headers if we sanitized */
1333         if (rewrite)
1334                 m_copyback(m, off, sizeof(*th), (caddr_t)th);
1335
1336         return (PF_PASS);
1337
1338  tcp_drop:
1339         REASON_SET(&reason, PFRES_NORM);
1340         if (rm != NULL && r->log)
1341                 PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd);
1342         return (PF_DROP);
1343 }
1344
1345 int
1346 pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
1347     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
1348 {
1349         u_int32_t tsval, tsecr;
1350         u_int8_t hdr[60];
1351         u_int8_t *opt;
1352
1353         KASSERT((src->scrub == NULL), 
1354             ("pf_normalize_tcp_init: src->scrub != NULL"));
1355
1356         src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
1357         if (src->scrub == NULL)
1358                 return (1);
1359         bzero(src->scrub, sizeof(*src->scrub));
1360
1361         switch (pd->af) {
1362 #ifdef INET
1363         case AF_INET: {
1364                 struct ip *h = mtod(m, struct ip *);
1365                 src->scrub->pfss_ttl = h->ip_ttl;
1366                 break;
1367         }
1368 #endif /* INET */
1369 #ifdef INET6
1370         case AF_INET6: {
1371                 struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1372                 src->scrub->pfss_ttl = h->ip6_hlim;
1373                 break;
1374         }
1375 #endif /* INET6 */
1376         }
1377
1378
1379         /*
1380          * All normalizations below are only begun if we see the start of
1381          * the connections.  They must all set an enabled bit in pfss_flags
1382          */
1383         if ((th->th_flags & TH_SYN) == 0)
1384                 return (0);
1385
1386
1387         if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
1388             pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1389                 /* Diddle with TCP options */
1390                 int hlen;
1391                 opt = hdr + sizeof(struct tcphdr);
1392                 hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1393                 while (hlen >= TCPOLEN_TIMESTAMP) {
1394                         switch (*opt) {
1395                         case TCPOPT_EOL:        /* FALLTHROUGH */
1396                         case TCPOPT_NOP:
1397                                 opt++;
1398                                 hlen--;
1399                                 break;
1400                         case TCPOPT_TIMESTAMP:
1401                                 if (opt[1] >= TCPOLEN_TIMESTAMP) {
1402                                         src->scrub->pfss_flags |=
1403                                             PFSS_TIMESTAMP;
1404                                         src->scrub->pfss_ts_mod = karc4random();
1405
1406                                         /* note PFSS_PAWS not set yet */
1407                                         memcpy(&tsval, &opt[2],
1408                                             sizeof(u_int32_t));
1409                                         memcpy(&tsecr, &opt[6],
1410                                             sizeof(u_int32_t));
1411                                         src->scrub->pfss_tsval0 = ntohl(tsval);
1412                                         src->scrub->pfss_tsval = ntohl(tsval);
1413                                         src->scrub->pfss_tsecr = ntohl(tsecr);
1414                                         getmicrouptime(&src->scrub->pfss_last);
1415                                 }
1416                                 /* FALLTHROUGH */
1417                         default:
1418                                 hlen -= MAX(opt[1], 2);
1419                                 opt += MAX(opt[1], 2);
1420                                 break;
1421                         }
1422                 }
1423         }
1424
1425         return (0);
1426 }
1427
1428 void
1429 pf_normalize_tcp_cleanup(struct pf_state *state)
1430 {
1431         if (state->src.scrub)
1432                 pool_put(&pf_state_scrub_pl, state->src.scrub);
1433         if (state->dst.scrub)
1434                 pool_put(&pf_state_scrub_pl, state->dst.scrub);
1435
1436         /* Someday... flush the TCP segment reassembly descriptors. */
1437 }
1438
1439 int
1440 pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
1441     u_short *reason, struct tcphdr *th, struct pf_state *state,
1442     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
1443 {
1444         struct timeval uptime;
1445         u_int32_t tsval, tsecr;
1446         u_int tsval_from_last;
1447         u_int8_t hdr[60];
1448         u_int8_t *opt;
1449         int copyback = 0;
1450         int got_ts = 0;
1451
1452         KASSERT((src->scrub || dst->scrub), 
1453             ("pf_normalize_tcp_statefull: src->scrub && dst->scrub!"));
1454
1455         /*
1456          * Enforce the minimum TTL seen for this connection.  Negate a common
1457          * technique to evade an intrusion detection system and confuse
1458          * firewall state code.
1459          */
1460         switch (pd->af) {
1461 #ifdef INET
1462         case AF_INET: {
1463                 if (src->scrub) {
1464                         struct ip *h = mtod(m, struct ip *);
1465                         if (h->ip_ttl > src->scrub->pfss_ttl)
1466                                 src->scrub->pfss_ttl = h->ip_ttl;
1467                         h->ip_ttl = src->scrub->pfss_ttl;
1468                 }
1469                 break;
1470         }
1471 #endif /* INET */
1472 #ifdef INET6
1473         case AF_INET6: {
1474                 if (src->scrub) {
1475                         struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1476                         if (h->ip6_hlim > src->scrub->pfss_ttl)
1477                                 src->scrub->pfss_ttl = h->ip6_hlim;
1478                         h->ip6_hlim = src->scrub->pfss_ttl;
1479                 }
1480                 break;
1481         }
1482 #endif /* INET6 */
1483         }
1484
1485         if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
1486             ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
1487             (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
1488             pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1489                 /* Diddle with TCP options */
1490                 int hlen;
1491                 opt = hdr + sizeof(struct tcphdr);
1492                 hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1493                 while (hlen >= TCPOLEN_TIMESTAMP) {
1494                         switch (*opt) {
1495                         case TCPOPT_EOL:        /* FALLTHROUGH */
1496                         case TCPOPT_NOP:
1497                                 opt++;
1498                                 hlen--;
1499                                 break;
1500                         case TCPOPT_TIMESTAMP:
1501                                 /* Modulate the timestamps.  Can be used for
1502                                  * NAT detection, OS uptime determination or
1503                                  * reboot detection.
1504                                  */
1505
1506                                 if (got_ts) {
1507                                         /* Huh?  Multiple timestamps!? */
1508                                         if (pf_status.debug >= PF_DEBUG_MISC) {
1509                                                 DPFPRINTF(("multiple TS??"));
1510                                                 pf_print_state(state);
1511                                                 kprintf("\n");
1512                                         }
1513                                         REASON_SET(reason, PFRES_TS);
1514                                         return (PF_DROP);
1515                                 }
1516                                 if (opt[1] >= TCPOLEN_TIMESTAMP) {
1517                                         memcpy(&tsval, &opt[2],
1518                                             sizeof(u_int32_t));
1519                                         if (tsval && src->scrub &&
1520                                             (src->scrub->pfss_flags &
1521                                             PFSS_TIMESTAMP)) {
1522                                                 tsval = ntohl(tsval);
1523                                                 pf_change_a(&opt[2],
1524                                                     &th->th_sum,
1525                                                     htonl(tsval +
1526                                                     src->scrub->pfss_ts_mod),
1527                                                     0);
1528                                                 copyback = 1;
1529                                         }
1530
1531                                         /* Modulate TS reply iff valid (!0) */
1532                                         memcpy(&tsecr, &opt[6],
1533                                             sizeof(u_int32_t));
1534                                         if (tsecr && dst->scrub &&
1535                                             (dst->scrub->pfss_flags &
1536                                             PFSS_TIMESTAMP)) {
1537                                                 tsecr = ntohl(tsecr)
1538                                                     - dst->scrub->pfss_ts_mod;
1539                                                 pf_change_a(&opt[6],
1540                                                     &th->th_sum, htonl(tsecr),
1541                                                     0);
1542                                                 copyback = 1;
1543                                         }
1544                                         got_ts = 1;
1545                                 }
1546                                 /* FALLTHROUGH */
1547                         default:
1548                                 hlen -= MAX(opt[1], 2);
1549                                 opt += MAX(opt[1], 2);
1550                                 break;
1551                         }
1552                 }
1553                 if (copyback) {
1554                         /* Copyback the options, caller copys back header */
1555                         *writeback = 1;
1556                         m_copyback(m, off + sizeof(struct tcphdr),
1557                             (th->th_off << 2) - sizeof(struct tcphdr), hdr +
1558                             sizeof(struct tcphdr));
1559                 }
1560         }
1561
1562
1563         /*
1564          * Must invalidate PAWS checks on connections idle for too long.
1565          * The fastest allowed timestamp clock is 1ms.  That turns out to
1566          * be about 24 days before it wraps.  XXX Right now our lowerbound
1567          * TS echo check only works for the first 12 days of a connection
1568          * when the TS has exhausted half its 32bit space
1569          */
1570 #define TS_MAX_IDLE     (24*24*60*60)
1571 #define TS_MAX_CONN     (12*24*60*60)   /* XXX remove when better tsecr check */
1572
1573         getmicrouptime(&uptime);
1574         if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
1575             (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
1576             time_second - state->creation > TS_MAX_CONN))  {
1577                 if (pf_status.debug >= PF_DEBUG_MISC) {
1578                         DPFPRINTF(("src idled out of PAWS\n"));
1579                         pf_print_state(state);
1580                         kprintf("\n");
1581                 }
1582                 src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
1583                     | PFSS_PAWS_IDLED;
1584         }
1585         if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
1586             uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
1587                 if (pf_status.debug >= PF_DEBUG_MISC) {
1588                         DPFPRINTF(("dst idled out of PAWS\n"));
1589                         pf_print_state(state);
1590                         kprintf("\n");
1591                 }
1592                 dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
1593                     | PFSS_PAWS_IDLED;
1594         }
1595
1596         if (got_ts && src->scrub && dst->scrub &&
1597             (src->scrub->pfss_flags & PFSS_PAWS) &&
1598             (dst->scrub->pfss_flags & PFSS_PAWS)) {
1599                 /* Validate that the timestamps are "in-window".
1600                  * RFC1323 describes TCP Timestamp options that allow
1601                  * measurement of RTT (round trip time) and PAWS
1602                  * (protection against wrapped sequence numbers).  PAWS
1603                  * gives us a set of rules for rejecting packets on
1604                  * long fat pipes (packets that were somehow delayed 
1605                  * in transit longer than the time it took to send the
1606                  * full TCP sequence space of 4Gb).  We can use these
1607                  * rules and infer a few others that will let us treat
1608                  * the 32bit timestamp and the 32bit echoed timestamp
1609                  * as sequence numbers to prevent a blind attacker from
1610                  * inserting packets into a connection.
1611                  *
1612                  * RFC1323 tells us:
1613                  *  - The timestamp on this packet must be greater than
1614                  *    or equal to the last value echoed by the other
1615                  *    endpoint.  The RFC says those will be discarded
1616                  *    since it is a dup that has already been acked.
1617                  *    This gives us a lowerbound on the timestamp.
1618                  *        timestamp >= other last echoed timestamp
1619                  *  - The timestamp will be less than or equal to
1620                  *    the last timestamp plus the time between the
1621                  *    last packet and now.  The RFC defines the max
1622                  *    clock rate as 1ms.  We will allow clocks to be
1623                  *    up to 10% fast and will allow a total difference
1624                  *    or 30 seconds due to a route change.  And this
1625                  *    gives us an upperbound on the timestamp.
1626                  *        timestamp <= last timestamp + max ticks
1627                  *    We have to be careful here.  Windows will send an
1628                  *    initial timestamp of zero and then initialize it
1629                  *    to a random value after the 3whs; presumably to
1630                  *    avoid a DoS by having to call an expensive RNG
1631                  *    during a SYN flood.  Proof MS has at least one
1632                  *    good security geek.
1633                  *
1634                  *  - The TCP timestamp option must also echo the other
1635                  *    endpoints timestamp.  The timestamp echoed is the
1636                  *    one carried on the earliest unacknowledged segment
1637                  *    on the left edge of the sequence window.  The RFC
1638                  *    states that the host will reject any echoed
1639                  *    timestamps that were larger than any ever sent.
1640                  *    This gives us an upperbound on the TS echo.
1641                  *        tescr <= largest_tsval
1642                  *  - The lowerbound on the TS echo is a little more
1643                  *    tricky to determine.  The other endpoint's echoed
1644                  *    values will not decrease.  But there may be
1645                  *    network conditions that re-order packets and
1646                  *    cause our view of them to decrease.  For now the
1647                  *    only lowerbound we can safely determine is that
1648                  *    the TS echo will never be less than the orginal
1649                  *    TS.  XXX There is probably a better lowerbound.
1650                  *    Remove TS_MAX_CONN with better lowerbound check.
1651                  *        tescr >= other original TS
1652                  *
1653                  * It is also important to note that the fastest
1654                  * timestamp clock of 1ms will wrap its 32bit space in
1655                  * 24 days.  So we just disable TS checking after 24
1656                  * days of idle time.  We actually must use a 12d
1657                  * connection limit until we can come up with a better
1658                  * lowerbound to the TS echo check.
1659                  */
1660                 struct timeval delta_ts;
1661                 int ts_fudge;
1662
1663
1664                 /*
1665                  * PFTM_TS_DIFF is how many seconds of leeway to allow
1666                  * a host's timestamp.  This can happen if the previous
1667                  * packet got delayed in transit for much longer than
1668                  * this packet.
1669                  */
1670                 if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
1671                         ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
1672
1673
1674                 /* Calculate max ticks since the last timestamp */
1675 #define TS_MAXFREQ      1100            /* RFC max TS freq of 1Khz + 10% skew */
1676 #define TS_MICROSECS    1000000         /* microseconds per second */
1677 #ifndef timersub
1678 #define timersub(tvp, uvp, vvp)                                         \
1679         do {                                                            \
1680                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
1681                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
1682                 if ((vvp)->tv_usec < 0) {                               \
1683                         (vvp)->tv_sec--;                                \
1684                         (vvp)->tv_usec += 1000000;                      \
1685                 }                                                       \
1686         } while (0)
1687 #endif
1688
1689                 timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
1690                 tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
1691                 tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
1692
1693
1694                 if ((src->state >= TCPS_ESTABLISHED &&
1695                     dst->state >= TCPS_ESTABLISHED) &&
1696                     (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
1697                     SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
1698                     (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
1699                     SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
1700                         /* Bad RFC1323 implementation or an insertion attack.
1701                          *
1702                          * - Solaris 2.6 and 2.7 are known to send another ACK
1703                          *   after the FIN,FIN|ACK,ACK closing that carries
1704                          *   an old timestamp.
1705                          */
1706
1707                         DPFPRINTF(("Timestamp failed %c%c%c%c\n",
1708                             SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
1709                             SEQ_GT(tsval, src->scrub->pfss_tsval +
1710                             tsval_from_last) ? '1' : ' ',
1711                             SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
1712                             SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
1713                         DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
1714                             "idle: %lus %lums\n",
1715                             tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
1716                             delta_ts.tv_usec / 1000));
1717                         DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
1718                             src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
1719                         DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
1720                             "\n", dst->scrub->pfss_tsval,
1721                             dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
1722                         if (pf_status.debug >= PF_DEBUG_MISC) {
1723                                 pf_print_state(state);
1724                                 pf_print_flags(th->th_flags);
1725                                 kprintf("\n");
1726                         }
1727                         REASON_SET(reason, PFRES_TS);
1728                         return (PF_DROP);
1729                 }
1730
1731                 /* XXX I'd really like to require tsecr but it's optional */
1732
1733         } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
1734             ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
1735             || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
1736             src->scrub && dst->scrub &&
1737             (src->scrub->pfss_flags & PFSS_PAWS) &&
1738             (dst->scrub->pfss_flags & PFSS_PAWS)) {
1739                 /* Didn't send a timestamp.  Timestamps aren't really useful
1740                  * when:
1741                  *  - connection opening or closing (often not even sent).
1742                  *    but we must not let an attacker to put a FIN on a
1743                  *    data packet to sneak it through our ESTABLISHED check.
1744                  *  - on a TCP reset.  RFC suggests not even looking at TS.
1745                  *  - on an empty ACK.  The TS will not be echoed so it will
1746                  *    probably not help keep the RTT calculation in sync and
1747                  *    there isn't as much danger when the sequence numbers
1748                  *    got wrapped.  So some stacks don't include TS on empty
1749                  *    ACKs :-(
1750                  *
1751                  * To minimize the disruption to mostly RFC1323 conformant
1752                  * stacks, we will only require timestamps on data packets.
1753                  *
1754                  * And what do ya know, we cannot require timestamps on data
1755                  * packets.  There appear to be devices that do legitimate
1756                  * TCP connection hijacking.  There are HTTP devices that allow
1757                  * a 3whs (with timestamps) and then buffer the HTTP request.
1758                  * If the intermediate device has the HTTP response cache, it
1759                  * will spoof the response but not bother timestamping its
1760                  * packets.  So we can look for the presence of a timestamp in
1761                  * the first data packet and if there, require it in all future
1762                  * packets.
1763                  */
1764
1765                 if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
1766                         /*
1767                          * Hey!  Someone tried to sneak a packet in.  Or the
1768                          * stack changed its RFC1323 behavior?!?!
1769                          */
1770                         if (pf_status.debug >= PF_DEBUG_MISC) {
1771                                 DPFPRINTF(("Did not receive expected RFC1323 "
1772                                     "timestamp\n"));
1773                                 pf_print_state(state);
1774                                 pf_print_flags(th->th_flags);
1775                                 kprintf("\n");
1776                         }
1777                         REASON_SET(reason, PFRES_TS);
1778                         return (PF_DROP);
1779                 }
1780         }
1781
1782
1783         /*
1784          * We will note if a host sends his data packets with or without
1785          * timestamps.  And require all data packets to contain a timestamp
1786          * if the first does.  PAWS implicitly requires that all data packets be
1787          * timestamped.  But I think there are middle-man devices that hijack
1788          * TCP streams immediately after the 3whs and don't timestamp their
1789          * packets (seen in a WWW accelerator or cache).
1790          */
1791         if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
1792             (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
1793                 if (got_ts)
1794                         src->scrub->pfss_flags |= PFSS_DATA_TS;
1795                 else {
1796                         src->scrub->pfss_flags |= PFSS_DATA_NOTS;
1797                         if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
1798                             (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
1799                                 /* Don't warn if other host rejected RFC1323 */
1800                                 DPFPRINTF(("Broken RFC1323 stack did not "
1801                                     "timestamp data packet. Disabled PAWS "
1802                                     "security.\n"));
1803                                 pf_print_state(state);
1804                                 pf_print_flags(th->th_flags);
1805                                 kprintf("\n");
1806                         }
1807                 }
1808         }
1809
1810
1811         /*
1812          * Update PAWS values
1813          */
1814         if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
1815             (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
1816                 getmicrouptime(&src->scrub->pfss_last);
1817                 if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
1818                     (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1819                         src->scrub->pfss_tsval = tsval;
1820
1821                 if (tsecr) {
1822                         if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
1823                             (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1824                                 src->scrub->pfss_tsecr = tsecr;
1825
1826                         if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
1827                             (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
1828                             src->scrub->pfss_tsval0 == 0)) {
1829                                 /* tsval0 MUST be the lowest timestamp */
1830                                 src->scrub->pfss_tsval0 = tsval;
1831                         }
1832
1833                         /* Only fully initialized after a TS gets echoed */
1834                         if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
1835                                 src->scrub->pfss_flags |= PFSS_PAWS;
1836                 }
1837         }
1838
1839         /* I have a dream....  TCP segment reassembly.... */
1840         return (0);
1841 }
1842
1843 int
1844 pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
1845     int off)
1846 {
1847         u_int16_t       *mss;
1848         int              thoff;
1849         int              opt, cnt, optlen = 0;
1850         int              rewrite = 0;
1851         u_char          *optp;
1852
1853         thoff = th->th_off << 2;
1854         cnt = thoff - sizeof(struct tcphdr);
1855         optp = mtod(m, caddr_t) + off + sizeof(struct tcphdr);
1856
1857         for (; cnt > 0; cnt -= optlen, optp += optlen) {
1858                 opt = optp[0];
1859                 if (opt == TCPOPT_EOL)
1860                         break;
1861                 if (opt == TCPOPT_NOP)
1862                         optlen = 1;
1863                 else {
1864                         if (cnt < 2)
1865                                 break;
1866                         optlen = optp[1];
1867                         if (optlen < 2 || optlen > cnt)
1868                                 break;
1869                 }
1870                 switch (opt) {
1871                 case TCPOPT_MAXSEG:
1872                         mss = (u_int16_t *)(optp + 2);
1873                         if ((ntohs(*mss)) > r->max_mss) {
1874                                 th->th_sum = pf_cksum_fixup(th->th_sum,
1875                                     *mss, htons(r->max_mss), 0);
1876                                 *mss = htons(r->max_mss);
1877                                 rewrite = 1;
1878                         }
1879                         break;
1880                 default:
1881                         break;
1882                 }
1883         }
1884
1885         return (rewrite);
1886 }