| Commit | Line | Data |
|---|---|---|
| 86e10434 | 1 | /* |
| 66d6c637 JH |
2 | * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. |
| 3 | * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. | |
| f23061d4 | 4 | * |
| 66d6c637 JH |
5 | * This code is derived from software contributed to The DragonFly Project |
| 6 | * by Jeffrey M. Hsu. | |
| f23061d4 | 7 | * |
| 66d6c637 JH |
8 | * Redistribution and use in source and binary forms, with or without |
| 9 | * modification, are permitted provided that the following conditions | |
| 10 | * are met: | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in the | |
| 15 | * documentation and/or other materials provided with the distribution. | |
| 16 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 17 | * contributors may be used to endorse or promote products derived | |
| 18 | * from this software without specific, prior written permission. | |
| f23061d4 | 19 | * |
| 66d6c637 JH |
20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 23 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 24 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 25 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 26 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 27 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 28 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 29 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 30 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 31 | * SUCH DAMAGE. | |
| 32 | */ | |
| 33 | ||
| 34 | /* | |
| 86e10434 JH |
35 | * All advertising materials mentioning features or use of this software |
| 36 | * must display the following acknowledgement: | |
| 37 | * This product includes software developed by Jeffrey M. Hsu. | |
| 38 | * | |
| 984263bc MD |
39 | * Copyright (c) 2001 Networks Associates Technologies, Inc. |
| 40 | * All rights reserved. | |
| 41 | * | |
| 42 | * This software was developed for the FreeBSD Project by Jonathan Lemon | |
| 43 | * and NAI Labs, the Security Research Division of Network Associates, Inc. | |
| 44 | * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the | |
| 45 | * DARPA CHATS research program. | |
| 46 | * | |
| 47 | * Redistribution and use in source and binary forms, with or without | |
| 48 | * modification, are permitted provided that the following conditions | |
| 49 | * are met: | |
| 50 | * 1. Redistributions of source code must retain the above copyright | |
| 51 | * notice, this list of conditions and the following disclaimer. | |
| 52 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 53 | * notice, this list of conditions and the following disclaimer in the | |
| 54 | * documentation and/or other materials provided with the distribution. | |
| 55 | * 3. The name of the author may not be used to endorse or promote | |
| 56 | * products derived from this software without specific prior written | |
| 57 | * permission. | |
| 58 | * | |
| 59 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
| 60 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
| 63 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 64 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 65 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 66 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 67 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 68 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 69 | * SUCH DAMAGE. | |
| 70 | * | |
| 71 | * $FreeBSD: src/sys/netinet/tcp_syncache.c,v 1.5.2.14 2003/02/24 04:02:27 silby Exp $ | |
| b95665d8 | 72 | * $DragonFly: src/sys/netinet/tcp_syncache.c,v 1.35 2008/11/22 11:03:35 sephe Exp $ |
| 984263bc MD |
73 | */ |
| 74 | ||
| b1992928 | 75 | #include "opt_inet.h" |
| 984263bc MD |
76 | #include "opt_inet6.h" |
| 77 | #include "opt_ipsec.h" | |
| 78 | ||
| 79 | #include <sys/param.h> | |
| 80 | #include <sys/systm.h> | |
| 81 | #include <sys/kernel.h> | |
| 82 | #include <sys/sysctl.h> | |
| 83 | #include <sys/malloc.h> | |
| 84 | #include <sys/mbuf.h> | |
| 85 | #include <sys/md5.h> | |
| 86 | #include <sys/proc.h> /* for proc0 declaration */ | |
| 87 | #include <sys/random.h> | |
| 88 | #include <sys/socket.h> | |
| 89 | #include <sys/socketvar.h> | |
| 3f9db7f8 | 90 | #include <sys/in_cksum.h> |
| 984263bc | 91 | |
| 00943fd6 | 92 | #include <sys/msgport2.h> |
| 4599cf19 | 93 | #include <net/netmsg2.h> |
| 00943fd6 | 94 | |
| 984263bc MD |
95 | #include <net/if.h> |
| 96 | #include <net/route.h> | |
| 97 | ||
| 98 | #include <netinet/in.h> | |
| 99 | #include <netinet/in_systm.h> | |
| 100 | #include <netinet/ip.h> | |
| 101 | #include <netinet/in_var.h> | |
| 102 | #include <netinet/in_pcb.h> | |
| 103 | #include <netinet/ip_var.h> | |
| 984263bc | 104 | #include <netinet/ip6.h> |
| 61896e3c | 105 | #ifdef INET6 |
| 984263bc MD |
106 | #include <netinet/icmp6.h> |
| 107 | #include <netinet6/nd6.h> | |
| 61896e3c | 108 | #endif |
| 984263bc MD |
109 | #include <netinet6/ip6_var.h> |
| 110 | #include <netinet6/in6_pcb.h> | |
| 984263bc MD |
111 | #include <netinet/tcp.h> |
| 112 | #include <netinet/tcp_fsm.h> | |
| 113 | #include <netinet/tcp_seq.h> | |
| 114 | #include <netinet/tcp_timer.h> | |
| a48c5dd5 | 115 | #include <netinet/tcp_timer2.h> |
| 984263bc | 116 | #include <netinet/tcp_var.h> |
| 984263bc | 117 | #include <netinet6/tcp6_var.h> |
| 984263bc MD |
118 | |
| 119 | #ifdef IPSEC | |
| 120 | #include <netinet6/ipsec.h> | |
| 121 | #ifdef INET6 | |
| 122 | #include <netinet6/ipsec6.h> | |
| 123 | #endif | |
| d2438d69 | 124 | #include <netproto/key/key.h> |
| 984263bc MD |
125 | #endif /*IPSEC*/ |
| 126 | ||
| 127 | #ifdef FAST_IPSEC | |
| bf844ffa | 128 | #include <netproto/ipsec/ipsec.h> |
| 984263bc | 129 | #ifdef INET6 |
| bf844ffa | 130 | #include <netproto/ipsec/ipsec6.h> |
| 984263bc | 131 | #endif |
| bf844ffa | 132 | #include <netproto/ipsec/key.h> |
| 984263bc MD |
133 | #define IPSEC |
| 134 | #endif /*FAST_IPSEC*/ | |
| 135 | ||
| 984263bc MD |
136 | static int tcp_syncookies = 1; |
| 137 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, | |
| f23061d4 | 138 | &tcp_syncookies, 0, |
| 984263bc MD |
139 | "Use TCP SYN cookies if the syncache overflows"); |
| 140 | ||
| 141 | static void syncache_drop(struct syncache *, struct syncache_head *); | |
| 142 | static void syncache_free(struct syncache *); | |
| 143 | static void syncache_insert(struct syncache *, struct syncache_head *); | |
| 144 | struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); | |
| 145 | static int syncache_respond(struct syncache *, struct mbuf *); | |
| 7e31206a SZ |
146 | static struct socket *syncache_socket(struct syncache *, struct socket *, |
| 147 | struct mbuf *); | |
| 984263bc MD |
148 | static void syncache_timer(void *); |
| 149 | static u_int32_t syncookie_generate(struct syncache *); | |
| 150 | static struct syncache *syncookie_lookup(struct in_conninfo *, | |
| 151 | struct tcphdr *, struct socket *); | |
| 152 | ||
/*
 * The SYN,ACK is transmitted fewer times than TCP_MAXRXTSHIFT specifies.
 * Three retransmits add up to a timeout of (1 + 2 + 4 + 8 == 15) seconds;
 * odds are the user has given up attempting to connect by then.
 */
#define SYNCACHE_MAXREXMTS		3

/* Arbitrary defaults for the hash table geometry. */
#define TCP_SYNCACHE_HASHSIZE		512
#define TCP_SYNCACHE_BUCKETLIMIT	30
| 163 | ||
| 00943fd6 | 164 | struct netmsg_sc_timer { |
| 002c1265 | 165 | struct netmsg_base base; |
| 00943fd6 JH |
166 | struct msgrec *nm_mrec; /* back pointer to containing msgrec */ |
| 167 | }; | |
| 168 | ||
| 169 | struct msgrec { | |
| 170 | struct netmsg_sc_timer msg; | |
| 171 | lwkt_port_t port; /* constant after init */ | |
| 172 | int slot; /* constant after init */ | |
| 173 | }; | |
| 174 | ||
| 4599cf19 | 175 | static void syncache_timer_handler(netmsg_t); |
| 00943fd6 | 176 | |
| 984263bc | 177 | struct tcp_syncache { |
| 984263bc MD |
178 | u_int hashsize; |
| 179 | u_int hashmask; | |
| 180 | u_int bucket_limit; | |
| 984263bc MD |
181 | u_int cache_limit; |
| 182 | u_int rexmt_limit; | |
| 183 | u_int hash_secret; | |
| 984263bc MD |
184 | }; |
| 185 | static struct tcp_syncache tcp_syncache; | |
| 186 | ||
| c1d0893d MD |
187 | TAILQ_HEAD(syncache_list, syncache); |
| 188 | ||
| 00943fd6 JH |
189 | struct tcp_syncache_percpu { |
| 190 | struct syncache_head *hashbase; | |
| 191 | u_int cache_count; | |
| c1d0893d | 192 | struct syncache_list timerq[SYNCACHE_MAXREXMTS + 1]; |
| 00943fd6 JH |
193 | struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1]; |
| 194 | struct msgrec mrec[SYNCACHE_MAXREXMTS + 1]; | |
| 195 | }; | |
| 196 | static struct tcp_syncache_percpu tcp_syncache_percpu[MAXCPU]; | |
| 197 | ||
| 198 | static struct lwkt_port syncache_null_rport; | |
| 199 | ||
| 984263bc MD |
200 | SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); |
| 201 | ||
| 202 | SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD, | |
| 203 | &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); | |
| 204 | ||
| 205 | SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD, | |
| 206 | &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); | |
| 207 | ||
| 00943fd6 JH |
208 | /* XXX JH */ |
| 209 | #if 0 | |
| 984263bc MD |
210 | SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, |
| 211 | &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); | |
| 00943fd6 | 212 | #endif |
| 984263bc MD |
213 | |
| 214 | SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD, | |
| 215 | &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); | |
| 216 | ||
| 217 | SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, | |
| 218 | &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); | |
| 219 | ||
| 220 | static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); | |
| 221 | ||
/*
 * Bucket index for an IPv4 connection: the hash secret, the foreign
 * address (and its upper 16 bits) and both ports are XORed together and
 * reduced with (mask).  Every argument is parenthesized so that compound
 * expressions may be passed safely.
 */
#define SYNCACHE_HASH(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc_faddr.s_addr ^					\
	  ((inc)->inc_faddr.s_addr >> 16) ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/*
 * Bucket index for an IPv6 connection: same scheme as SYNCACHE_HASH() but
 * using the first and last 32-bit words of the foreign address.
 */
#define SYNCACHE_HASH6(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc6_faddr.s6_addr32[0] ^				\
	  (inc)->inc6_faddr.s6_addr32[3] ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/* True when two IPv4 endpoint records name the same ports and addresses. */
#define ENDPTS_EQ(a, b) (						\
	(a)->ie_fport == (b)->ie_fport &&				\
	(a)->ie_lport == (b)->ie_lport &&				\
	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
)

/* True when two endpoint records are bytewise identical. */
#define ENDPTS6_EQ(a, b)	(memcmp((a), (b), sizeof(*(a))) == 0)
| 242 | ||
| 00943fd6 JH |
243 | static __inline void |
| 244 | syncache_timeout(struct tcp_syncache_percpu *syncache_percpu, | |
| 245 | struct syncache *sc, int slot) | |
| 246 | { | |
| 247 | sc->sc_rxtslot = slot; | |
| 248 | sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[slot]; | |
| 249 | TAILQ_INSERT_TAIL(&syncache_percpu->timerq[slot], sc, sc_timerq); | |
| 250 | if (!callout_active(&syncache_percpu->tt_timerq[slot])) { | |
| 251 | callout_reset(&syncache_percpu->tt_timerq[slot], | |
| 252 | TCPTV_RTOBASE * tcp_backoff[slot], | |
| 253 | syncache_timer, | |
| 254 | &syncache_percpu->mrec[slot]); | |
| 255 | } | |
| 256 | } | |
| 984263bc MD |
257 | |
| 258 | static void | |
| 259 | syncache_free(struct syncache *sc) | |
| 260 | { | |
| 261 | struct rtentry *rt; | |
| 61896e3c JH |
262 | #ifdef INET6 |
| 263 | const boolean_t isipv6 = sc->sc_inc.inc_isipv6; | |
| 264 | #else | |
| 265 | const boolean_t isipv6 = FALSE; | |
| 266 | #endif | |
| 984263bc MD |
267 | |
| 268 | if (sc->sc_ipopts) | |
| f23061d4 | 269 | m_free(sc->sc_ipopts); |
| a5263048 JH |
270 | |
| 271 | rt = isipv6 ? sc->sc_route6.ro_rt : sc->sc_route.ro_rt; | |
| 984263bc MD |
272 | if (rt != NULL) { |
| 273 | /* | |
| a5263048 | 274 | * If this is the only reference to a protocol-cloned |
| 984263bc MD |
275 | * route, remove it immediately. |
| 276 | */ | |
| a5263048 | 277 | if ((rt->rt_flags & RTF_WASCLONED) && rt->rt_refcnt == 1) |
| f23061d4 JH |
278 | rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, |
| 279 | rt_mask(rt), rt->rt_flags, NULL); | |
| 984263bc MD |
280 | RTFREE(rt); |
| 281 | } | |
| 9f42c129 | 282 | kfree(sc, M_SYNCACHE); |
| 984263bc MD |
283 | } |
| 284 | ||
| 285 | void | |
| 286 | syncache_init(void) | |
| 287 | { | |
| 00943fd6 | 288 | int i, cpu; |
| 984263bc | 289 | |
| 984263bc MD |
290 | tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; |
| 291 | tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; | |
| 292 | tcp_syncache.cache_limit = | |
| 293 | tcp_syncache.hashsize * tcp_syncache.bucket_limit; | |
| 294 | tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; | |
| 0ced1954 | 295 | tcp_syncache.hash_secret = karc4random(); |
| 984263bc | 296 | |
| f23061d4 | 297 | TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", |
| 984263bc | 298 | &tcp_syncache.hashsize); |
| f23061d4 | 299 | TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", |
| 984263bc | 300 | &tcp_syncache.cache_limit); |
| f23061d4 | 301 | TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", |
| 984263bc MD |
302 | &tcp_syncache.bucket_limit); |
| 303 | if (!powerof2(tcp_syncache.hashsize)) { | |
| a6ec04bc | 304 | kprintf("WARNING: syncache hash size is not a power of 2.\n"); |
| 984263bc | 305 | tcp_syncache.hashsize = 512; /* safe default */ |
| f23061d4 | 306 | } |
| 984263bc MD |
307 | tcp_syncache.hashmask = tcp_syncache.hashsize - 1; |
| 308 | ||
| fb0f29c4 | 309 | lwkt_initport_replyonly_null(&syncache_null_rport); |
| 984263bc | 310 | |
| 00943fd6 JH |
311 | for (cpu = 0; cpu < ncpus2; cpu++) { |
| 312 | struct tcp_syncache_percpu *syncache_percpu; | |
| 313 | ||
| 314 | syncache_percpu = &tcp_syncache_percpu[cpu]; | |
| 315 | /* Allocate the hash table. */ | |
| 316 | MALLOC(syncache_percpu->hashbase, struct syncache_head *, | |
| 317 | tcp_syncache.hashsize * sizeof(struct syncache_head), | |
| 318 | M_SYNCACHE, M_WAITOK); | |
| 319 | ||
| 320 | /* Initialize the hash buckets. */ | |
| 321 | for (i = 0; i < tcp_syncache.hashsize; i++) { | |
| 322 | struct syncache_head *bucket; | |
| 323 | ||
| 324 | bucket = &syncache_percpu->hashbase[i]; | |
| 325 | TAILQ_INIT(&bucket->sch_bucket); | |
| 326 | bucket->sch_length = 0; | |
| 327 | } | |
| 984263bc | 328 | |
| 00943fd6 JH |
329 | for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) { |
| 330 | /* Initialize the timer queues. */ | |
| 331 | TAILQ_INIT(&syncache_percpu->timerq[i]); | |
| 332 | callout_init(&syncache_percpu->tt_timerq[i]); | |
| 333 | ||
| 334 | syncache_percpu->mrec[i].slot = i; | |
| c3c96e44 | 335 | syncache_percpu->mrec[i].port = cpu_portfn(cpu); |
| 00943fd6 | 336 | syncache_percpu->mrec[i].msg.nm_mrec = |
| c3c96e44 | 337 | &syncache_percpu->mrec[i]; |
| 002c1265 | 338 | netmsg_init(&syncache_percpu->mrec[i].msg.base, |
| 48e7b118 MD |
339 | NULL, &syncache_null_rport, |
| 340 | 0, syncache_timer_handler); | |
| 00943fd6 | 341 | } |
| 984263bc | 342 | } |
| 984263bc MD |
343 | } |
| 344 | ||
| 345 | static void | |
| f3f70f0d | 346 | syncache_insert(struct syncache *sc, struct syncache_head *sch) |
| 984263bc | 347 | { |
| 00943fd6 | 348 | struct tcp_syncache_percpu *syncache_percpu; |
| 984263bc | 349 | struct syncache *sc2; |
| d982be66 | 350 | int i; |
| 984263bc | 351 | |
| 00943fd6 JH |
352 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; |
| 353 | ||
| 984263bc MD |
354 | /* |
| 355 | * Make sure that we don't overflow the per-bucket | |
| 356 | * limit or the total cache size limit. | |
| 357 | */ | |
| 984263bc MD |
358 | if (sch->sch_length >= tcp_syncache.bucket_limit) { |
| 359 | /* | |
| 360 | * The bucket is full, toss the oldest element. | |
| 361 | */ | |
| 362 | sc2 = TAILQ_FIRST(&sch->sch_bucket); | |
| 363 | sc2->sc_tp->ts_recent = ticks; | |
| 364 | syncache_drop(sc2, sch); | |
| 365 | tcpstat.tcps_sc_bucketoverflow++; | |
| 00943fd6 | 366 | } else if (syncache_percpu->cache_count >= tcp_syncache.cache_limit) { |
| 984263bc MD |
367 | /* |
| 368 | * The cache is full. Toss the oldest entry in the | |
| 369 | * entire cache. This is the front entry in the | |
| 370 | * first non-empty timer queue with the largest | |
| 371 | * timeout value. | |
| 372 | */ | |
| 373 | for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { | |
| 00943fd6 | 374 | sc2 = TAILQ_FIRST(&syncache_percpu->timerq[i]); |
| c1d0893d MD |
375 | while (sc2 && (sc2->sc_flags & SCF_MARKER)) |
| 376 | sc2 = TAILQ_NEXT(sc2, sc_timerq); | |
| 984263bc MD |
377 | if (sc2 != NULL) |
| 378 | break; | |
| 379 | } | |
| 380 | sc2->sc_tp->ts_recent = ticks; | |
| 381 | syncache_drop(sc2, NULL); | |
| 382 | tcpstat.tcps_sc_cacheoverflow++; | |
| 383 | } | |
| 384 | ||
| 385 | /* Initialize the entry's timer. */ | |
| 00943fd6 | 386 | syncache_timeout(syncache_percpu, sc, 0); |
| 984263bc MD |
387 | |
| 388 | /* Put it into the bucket. */ | |
| 389 | TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash); | |
| 390 | sch->sch_length++; | |
| 00943fd6 | 391 | syncache_percpu->cache_count++; |
| 984263bc | 392 | tcpstat.tcps_sc_added++; |
| 984263bc MD |
393 | } |
| 394 | ||
| e5fe3477 MD |
395 | void |
| 396 | syncache_destroy(struct tcpcb *tp) | |
| 397 | { | |
| 398 | struct tcp_syncache_percpu *syncache_percpu; | |
| 399 | struct syncache_head *bucket; | |
| 400 | struct syncache *sc; | |
| 401 | int i; | |
| 402 | ||
| 403 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; | |
| 404 | sc = NULL; | |
| c1d0893d | 405 | |
| e5fe3477 MD |
406 | for (i = 0; i < tcp_syncache.hashsize; i++) { |
| 407 | bucket = &syncache_percpu->hashbase[i]; | |
| 408 | TAILQ_FOREACH(sc, &bucket->sch_bucket, sc_hash) { | |
| 7123bbff | 409 | if (sc->sc_tp == tp) |
| e5fe3477 | 410 | sc->sc_tp = NULL; |
| e5fe3477 MD |
411 | } |
| 412 | } | |
| e5fe3477 MD |
413 | } |
| 414 | ||
| 984263bc | 415 | static void |
| f3f70f0d | 416 | syncache_drop(struct syncache *sc, struct syncache_head *sch) |
| 984263bc | 417 | { |
| 00943fd6 | 418 | struct tcp_syncache_percpu *syncache_percpu; |
| 61896e3c JH |
419 | #ifdef INET6 |
| 420 | const boolean_t isipv6 = sc->sc_inc.inc_isipv6; | |
| 421 | #else | |
| 422 | const boolean_t isipv6 = FALSE; | |
| 423 | #endif | |
| 984263bc | 424 | |
| 00943fd6 JH |
425 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; |
| 426 | ||
| 984263bc | 427 | if (sch == NULL) { |
| 61896e3c | 428 | if (isipv6) { |
| 00943fd6 | 429 | sch = &syncache_percpu->hashbase[ |
| 984263bc | 430 | SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)]; |
| 61896e3c | 431 | } else { |
| 00943fd6 | 432 | sch = &syncache_percpu->hashbase[ |
| 984263bc MD |
433 | SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)]; |
| 434 | } | |
| 435 | } | |
| 436 | ||
| 984263bc MD |
437 | TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); |
| 438 | sch->sch_length--; | |
| 00943fd6 | 439 | syncache_percpu->cache_count--; |
| 984263bc | 440 | |
| 00943fd6 | 441 | /* |
| e5fe3477 MD |
442 | * Cleanup |
| 443 | */ | |
| 7123bbff | 444 | if (sc->sc_tp) |
| e5fe3477 | 445 | sc->sc_tp = NULL; |
| e5fe3477 MD |
446 | |
| 447 | /* | |
| 00943fd6 JH |
448 | * Remove the entry from the syncache timer/timeout queue. Note |
| 449 | * that we do not try to stop any running timer since we do not know | |
| 450 | * whether the timer's message is in-transit or not. Since timeouts | |
| 451 | * are fairly long, taking an unneeded callout does not detrimentally | |
| 452 | * effect performance. | |
| 453 | */ | |
| 454 | TAILQ_REMOVE(&syncache_percpu->timerq[sc->sc_rxtslot], sc, sc_timerq); | |
| 984263bc MD |
455 | |
| 456 | syncache_free(sc); | |
| 457 | } | |
| 458 | ||
| 459 | /* | |
| 00943fd6 JH |
460 | * Place a timeout message on the TCP thread's message queue. |
| 461 | * This routine runs in soft interrupt context. | |
| 462 | * | |
| 463 | * An invariant is for this routine to be called, the callout must | |
| 464 | * have been active. Note that the callout is not deactivated until | |
| 465 | * after the message has been processed in syncache_timer_handler() below. | |
| 466 | */ | |
| 467 | static void | |
| 468 | syncache_timer(void *p) | |
| 469 | { | |
| 470 | struct netmsg_sc_timer *msg = p; | |
| 471 | ||
| 002c1265 | 472 | lwkt_sendmsg(msg->nm_mrec->port, &msg->base.lmsg); |
| 00943fd6 JH |
473 | } |
| 474 | ||
| 475 | /* | |
| 476 | * Service a timer message queued by timer expiration. | |
| 477 | * This routine runs in the TCP protocol thread. | |
| 478 | * | |
| 984263bc MD |
479 | * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. |
| 480 | * If we have retransmitted an entry the maximum number of times, expire it. | |
| 00943fd6 JH |
481 | * |
| 482 | * When we finish processing timed-out entries, we restart the timer if there | |
| 483 | * are any entries still on the queue and deactivate it otherwise. Only after | |
| 484 | * a timer has been deactivated here can it be restarted by syncache_timeout(). | |
| 984263bc | 485 | */ |
| 4599cf19 | 486 | static void |
| 002c1265 | 487 | syncache_timer_handler(netmsg_t msg) |
| 984263bc | 488 | { |
| 00943fd6 | 489 | struct tcp_syncache_percpu *syncache_percpu; |
| c1d0893d MD |
490 | struct syncache *sc; |
| 491 | struct syncache marker; | |
| 492 | struct syncache_list *list; | |
| 984263bc | 493 | struct inpcb *inp; |
| 00943fd6 | 494 | int slot; |
| 984263bc | 495 | |
| 002c1265 | 496 | slot = ((struct netmsg_sc_timer *)msg)->nm_mrec->slot; |
| 00943fd6 | 497 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; |
| 984263bc | 498 | |
| c1d0893d MD |
499 | list = &syncache_percpu->timerq[slot]; |
| 500 | ||
| 501 | /* | |
| 502 | * Use a marker to keep our place in the scan. syncache_drop() | |
| 503 | * can block and cause any next pointer we cache to become stale. | |
| 504 | */ | |
| 505 | marker.sc_flags = SCF_MARKER; | |
| 506 | TAILQ_INSERT_HEAD(list, &marker, sc_timerq); | |
| 507 | ||
| 508 | while ((sc = TAILQ_NEXT(&marker, sc_timerq)) != NULL) { | |
| 509 | /* | |
| 510 | * Move the marker. | |
| 511 | */ | |
| 512 | TAILQ_REMOVE(list, &marker, sc_timerq); | |
| 513 | TAILQ_INSERT_AFTER(list, sc, &marker, sc_timerq); | |
| 514 | ||
| 515 | if (sc->sc_flags & SCF_MARKER) | |
| 516 | continue; | |
| 517 | ||
| 518 | if (ticks < sc->sc_rxttime) | |
| 00943fd6 | 519 | break; /* finished because timerq sorted by time */ |
| e5fe3477 | 520 | if (sc->sc_tp == NULL) { |
| e5fe3477 MD |
521 | syncache_drop(sc, NULL); |
| 522 | tcpstat.tcps_sc_stale++; | |
| 523 | continue; | |
| 524 | } | |
| 984263bc MD |
525 | inp = sc->sc_tp->t_inpcb; |
| 526 | if (slot == SYNCACHE_MAXREXMTS || | |
| 527 | slot >= tcp_syncache.rexmt_limit || | |
| 16961763 | 528 | inp == NULL || |
| 984263bc | 529 | inp->inp_gencnt != sc->sc_inp_gencnt) { |
| 984263bc MD |
530 | syncache_drop(sc, NULL); |
| 531 | tcpstat.tcps_sc_stale++; | |
| 532 | continue; | |
| 533 | } | |
| 534 | /* | |
| 535 | * syncache_respond() may call back into the syncache to | |
| 536 | * to modify another entry, so do not obtain the next | |
| 537 | * entry on the timer chain until it has completed. | |
| 538 | */ | |
| f23061d4 | 539 | syncache_respond(sc, NULL); |
| 984263bc | 540 | tcpstat.tcps_sc_retransmitted++; |
| c1d0893d | 541 | TAILQ_REMOVE(list, sc, sc_timerq); |
| 00943fd6 | 542 | syncache_timeout(syncache_percpu, sc, slot + 1); |
| 984263bc | 543 | } |
| c1d0893d MD |
544 | TAILQ_REMOVE(list, &marker, sc_timerq); |
| 545 | ||
| 546 | if (sc != NULL) { | |
| 00943fd6 | 547 | callout_reset(&syncache_percpu->tt_timerq[slot], |
| c1d0893d MD |
548 | sc->sc_rxttime - ticks, syncache_timer, |
| 549 | &syncache_percpu->mrec[slot]); | |
| 550 | } else { | |
| 00943fd6 | 551 | callout_deactivate(&syncache_percpu->tt_timerq[slot]); |
| c1d0893d | 552 | } |
| 002c1265 | 553 | lwkt_replymsg(&msg->base.lmsg, 0); |
| 984263bc MD |
554 | } |
| 555 | ||
| 556 | /* | |
| 557 | * Find an entry in the syncache. | |
| 558 | */ | |
| 559 | struct syncache * | |
| f3f70f0d | 560 | syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) |
| 984263bc | 561 | { |
| 00943fd6 | 562 | struct tcp_syncache_percpu *syncache_percpu; |
| 984263bc MD |
563 | struct syncache *sc; |
| 564 | struct syncache_head *sch; | |
| 984263bc | 565 | |
| 00943fd6 | 566 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; |
| 984263bc MD |
567 | #ifdef INET6 |
| 568 | if (inc->inc_isipv6) { | |
| 00943fd6 | 569 | sch = &syncache_percpu->hashbase[ |
| 984263bc MD |
570 | SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; |
| 571 | *schp = sch; | |
| d982be66 JH |
572 | TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) |
| 573 | if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) | |
| 984263bc | 574 | return (sc); |
| 984263bc MD |
575 | } else |
| 576 | #endif | |
| 577 | { | |
| 00943fd6 | 578 | sch = &syncache_percpu->hashbase[ |
| 984263bc MD |
579 | SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; |
| 580 | *schp = sch; | |
| 984263bc MD |
581 | TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { |
| 582 | #ifdef INET6 | |
| 583 | if (sc->sc_inc.inc_isipv6) | |
| 584 | continue; | |
| 585 | #endif | |
| d982be66 | 586 | if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) |
| 984263bc | 587 | return (sc); |
| 984263bc | 588 | } |
| 984263bc MD |
589 | } |
| 590 | return (NULL); | |
| 591 | } | |
| 592 | ||
| 593 | /* | |
| 594 | * This function is called when we get a RST for a | |
| 595 | * non-existent connection, so that we can see if the | |
| 596 | * connection is in the syn cache. If it is, zap it. | |
| 597 | */ | |
| 598 | void | |
| f3f70f0d | 599 | syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) |
| 984263bc MD |
600 | { |
| 601 | struct syncache *sc; | |
| 602 | struct syncache_head *sch; | |
| 603 | ||
| 604 | sc = syncache_lookup(inc, &sch); | |
| c1d0893d | 605 | if (sc == NULL) { |
| 984263bc | 606 | return; |
| c1d0893d | 607 | } |
| 984263bc MD |
608 | /* |
| 609 | * If the RST bit is set, check the sequence number to see | |
| 610 | * if this is a valid reset segment. | |
| 611 | * RFC 793 page 37: | |
| 612 | * In all states except SYN-SENT, all reset (RST) segments | |
| 613 | * are validated by checking their SEQ-fields. A reset is | |
| 614 | * valid if its sequence number is in the window. | |
| 615 | * | |
| 616 | * The sequence number in the reset segment is normally an | |
| 617 | * echo of our outgoing acknowlegement numbers, but some hosts | |
| 618 | * send a reset with the sequence number at the rightmost edge | |
| 619 | * of our receive window, and we have to handle this case. | |
| 620 | */ | |
| 621 | if (SEQ_GEQ(th->th_seq, sc->sc_irs) && | |
| 622 | SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { | |
| 623 | syncache_drop(sc, sch); | |
| 624 | tcpstat.tcps_sc_reset++; | |
| 625 | } | |
| 626 | } | |
| 627 | ||
| 628 | void | |
| f3f70f0d | 629 | syncache_badack(struct in_conninfo *inc) |
| 984263bc MD |
630 | { |
| 631 | struct syncache *sc; | |
| 632 | struct syncache_head *sch; | |
| 633 | ||
| 634 | sc = syncache_lookup(inc, &sch); | |
| 635 | if (sc != NULL) { | |
| 636 | syncache_drop(sc, sch); | |
| 637 | tcpstat.tcps_sc_badack++; | |
| 638 | } | |
| 639 | } | |
| 640 | ||
| 641 | void | |
| f3f70f0d | 642 | syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) |
| 984263bc MD |
643 | { |
| 644 | struct syncache *sc; | |
| 645 | struct syncache_head *sch; | |
| 646 | ||
| 647 | /* we are called at splnet() here */ | |
| 648 | sc = syncache_lookup(inc, &sch); | |
| 649 | if (sc == NULL) | |
| 650 | return; | |
| 651 | ||
| 652 | /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ | |
| 653 | if (ntohl(th->th_seq) != sc->sc_iss) | |
| 654 | return; | |
| 655 | ||
| 656 | /* | |
| 657 | * If we've rertransmitted 3 times and this is our second error, | |
| 658 | * we remove the entry. Otherwise, we allow it to continue on. | |
| 659 | * This prevents us from incorrectly nuking an entry during a | |
| 660 | * spurious network outage. | |
| 661 | * | |
| 662 | * See tcp_notify(). | |
| 663 | */ | |
| 664 | if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) { | |
| 665 | sc->sc_flags |= SCF_UNREACH; | |
| 666 | return; | |
| 667 | } | |
| 668 | syncache_drop(sc, sch); | |
| 669 | tcpstat.tcps_sc_unreach++; | |
| 670 | } | |
| 671 | ||
| 672 | /* | |
| 673 | * Build a new TCP socket structure from a syncache entry. | |
| 48e7b118 MD |
674 | * |
| 675 | * This is called from the context of the SYN+ACK | |
| 984263bc MD |
676 | */ |
| 677 | static struct socket * | |
| 7e31206a | 678 | syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) |
| 984263bc | 679 | { |
| ed894f8c | 680 | struct inpcb *inp = NULL, *linp; |
| 984263bc | 681 | struct socket *so; |
| 2ce132be | 682 | struct tcpcb *tp, *ltp; |
| 48e7b118 | 683 | lwkt_port_t port; |
| 61896e3c JH |
684 | #ifdef INET6 |
| 685 | const boolean_t isipv6 = sc->sc_inc.inc_isipv6; | |
| 686 | #else | |
| 687 | const boolean_t isipv6 = FALSE; | |
| 688 | #endif | |
| 984263bc MD |
689 | |
| 690 | /* | |
| 691 | * Ok, create the full blown connection, and set things up | |
| 692 | * as they would have been set up if we had created the | |
| 693 | * connection when the SYN arrived. If we can't create | |
| 694 | * the connection, abort it. | |
| 0ce0603e MD |
695 | * |
| 696 | * Set the protocol processing port for the socket to the current | |
| 697 | * port (that the connection came in on). | |
| 984263bc MD |
698 | */ |
| 699 | so = sonewconn(lso, SS_ISCONNECTED); | |
| 700 | if (so == NULL) { | |
| 701 | /* | |
| 702 | * Drop the connection; we will send a RST if the peer | |
| 703 | * retransmits the ACK, | |
| 704 | */ | |
| 705 | tcpstat.tcps_listendrop++; | |
| 706 | goto abort; | |
| 707 | } | |
| 708 | ||
| 48e7b118 | 709 | /* |
| 984263bc MD |
710 | * Insert new socket into hash list. |
| 711 | */ | |
| 48e7b118 | 712 | inp = so->so_pcb; |
| 984263bc | 713 | inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; |
| 61896e3c | 714 | if (isipv6) { |
| 984263bc MD |
715 | inp->in6p_laddr = sc->sc_inc.inc6_laddr; |
| 716 | } else { | |
| 61896e3c | 717 | #ifdef INET6 |
| 984263bc MD |
718 | inp->inp_vflag &= ~INP_IPV6; |
| 719 | inp->inp_vflag |= INP_IPV4; | |
| 65f3e756 | 720 | inp->inp_flags &= ~IN6P_IPV6_V6ONLY; |
| 984263bc MD |
721 | #endif |
| 722 | inp->inp_laddr = sc->sc_inc.inc_laddr; | |
| 984263bc | 723 | } |
| 984263bc | 724 | inp->inp_lport = sc->sc_inc.inc_lport; |
| 13d8907a | 725 | if (in_pcbinsporthash(inp) != 0) { |
| 984263bc MD |
726 | /* |
| 727 | * Undo the assignments above if we failed to | |
| 728 | * put the PCB on the hash lists. | |
| 729 | */ | |
| 61896e3c | 730 | if (isipv6) |
| 84204577 | 731 | inp->in6p_laddr = kin6addr_any; |
| f23061d4 | 732 | else |
| 984263bc MD |
733 | inp->inp_laddr.s_addr = INADDR_ANY; |
| 734 | inp->inp_lport = 0; | |
| 735 | goto abort; | |
| 736 | } | |
| 0ce0603e | 737 | linp = lso->so_pcb; |
| 984263bc MD |
738 | #ifdef IPSEC |
| 739 | /* copy old policy into new socket's */ | |
| ed894f8c | 740 | if (ipsec_copy_policy(linp->inp_sp, inp->inp_sp)) |
| a6ec04bc | 741 | kprintf("syncache_expand: could not copy policy\n"); |
| 984263bc | 742 | #endif |
| 61896e3c | 743 | if (isipv6) { |
| 984263bc | 744 | struct in6_addr laddr6; |
| d982be66 | 745 | struct sockaddr_in6 sin6; |
| 984263bc MD |
746 | /* |
| 747 | * Inherit socket options from the listening socket. | |
| 748 | * Note that in6p_inputopts are not (and should not be) | |
| 749 | * copied, since it stores previously received options and is | |
| 750 | * used to detect if each new option is different than the | |
| 751 | * previous one and hence should be passed to a user. | |
| f23061d4 | 752 | * If we copied in6p_inputopts, a user would not be able to |
| 984263bc MD |
753 | * receive options just after calling the accept system call. |
| 754 | */ | |
| ed894f8c JH |
755 | inp->inp_flags |= linp->inp_flags & INP_CONTROLOPTS; |
| 756 | if (linp->in6p_outputopts) | |
| 984263bc | 757 | inp->in6p_outputopts = |
| ed894f8c | 758 | ip6_copypktopts(linp->in6p_outputopts, M_INTWAIT); |
| 984263bc MD |
759 | inp->in6p_route = sc->sc_route6; |
| 760 | sc->sc_route6.ro_rt = NULL; | |
| 761 | ||
| d982be66 JH |
762 | sin6.sin6_family = AF_INET6; |
| 763 | sin6.sin6_len = sizeof sin6; | |
| 764 | sin6.sin6_addr = sc->sc_inc.inc6_faddr; | |
| 765 | sin6.sin6_port = sc->sc_inc.inc_fport; | |
| 766 | sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; | |
| 984263bc MD |
767 | laddr6 = inp->in6p_laddr; |
| 768 | if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) | |
| 769 | inp->in6p_laddr = sc->sc_inc.inc6_laddr; | |
| d982be66 | 770 | if (in6_pcbconnect(inp, (struct sockaddr *)&sin6, &thread0)) { |
| 984263bc | 771 | inp->in6p_laddr = laddr6; |
| 984263bc MD |
772 | goto abort; |
| 773 | } | |
| 61896e3c | 774 | } else { |
| 984263bc | 775 | struct in_addr laddr; |
| d982be66 | 776 | struct sockaddr_in sin; |
| 984263bc | 777 | |
| 7e31206a | 778 | inp->inp_options = ip_srcroute(m); |
| 984263bc MD |
779 | if (inp->inp_options == NULL) { |
| 780 | inp->inp_options = sc->sc_ipopts; | |
| 781 | sc->sc_ipopts = NULL; | |
| 782 | } | |
| 783 | inp->inp_route = sc->sc_route; | |
| 784 | sc->sc_route.ro_rt = NULL; | |
| 785 | ||
| d982be66 JH |
786 | sin.sin_family = AF_INET; |
| 787 | sin.sin_len = sizeof sin; | |
| 788 | sin.sin_addr = sc->sc_inc.inc_faddr; | |
| 789 | sin.sin_port = sc->sc_inc.inc_fport; | |
| 790 | bzero(sin.sin_zero, sizeof sin.sin_zero); | |
| 984263bc MD |
791 | laddr = inp->inp_laddr; |
| 792 | if (inp->inp_laddr.s_addr == INADDR_ANY) | |
| 793 | inp->inp_laddr = sc->sc_inc.inc_laddr; | |
| d982be66 | 794 | if (in_pcbconnect(inp, (struct sockaddr *)&sin, &thread0)) { |
| 984263bc | 795 | inp->inp_laddr = laddr; |
| 984263bc MD |
796 | goto abort; |
| 797 | } | |
| 984263bc MD |
798 | } |
| 799 | ||
| 48e7b118 MD |
800 | /* |
| 801 | * The current port should be in the context of the SYN+ACK and | |
| 802 | * so should match the tcp address port. | |
| 803 | * | |
| 804 | * XXX we may be running on the netisr thread instead of a tcp | |
| 805 | * thread, in which case port will not match | |
| 806 | * curthread->td_msgport. | |
| 807 | */ | |
| 808 | if (isipv6) { | |
| 809 | port = tcp6_addrport(); | |
| 810 | } else { | |
| 811 | port = tcp_addrport(inp->inp_faddr.s_addr, inp->inp_fport, | |
| 812 | inp->inp_laddr.s_addr, inp->inp_lport); | |
| 813 | } | |
| 0ce0603e MD |
814 | if (port != &curthread->td_msgport) { |
| 815 | print_backtrace(-1); | |
| 816 | kprintf("TCP PORT MISMATCH %p vs %p\n", | |
| 817 | port, &curthread->td_msgport); | |
| 818 | } | |
| 48e7b118 MD |
819 | /*KKASSERT(port == &curthread->td_msgport);*/ |
| 820 | ||
| 984263bc MD |
821 | tp = intotcpcb(inp); |
| 822 | tp->t_state = TCPS_SYN_RECEIVED; | |
| 823 | tp->iss = sc->sc_iss; | |
| 824 | tp->irs = sc->sc_irs; | |
| 825 | tcp_rcvseqinit(tp); | |
| 826 | tcp_sendseqinit(tp); | |
| 827 | tp->snd_wl1 = sc->sc_irs; | |
| 828 | tp->rcv_up = sc->sc_irs + 1; | |
| 829 | tp->rcv_wnd = sc->sc_wnd; | |
| 830 | tp->rcv_adv += tp->rcv_wnd; | |
| 831 | ||
| 61896e3c | 832 | tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH | TF_NODELAY); |
| 984263bc MD |
833 | if (sc->sc_flags & SCF_NOOPT) |
| 834 | tp->t_flags |= TF_NOOPT; | |
| 835 | if (sc->sc_flags & SCF_WINSCALE) { | |
| 61896e3c | 836 | tp->t_flags |= TF_REQ_SCALE | TF_RCVD_SCALE; |
| 984263bc MD |
837 | tp->requested_s_scale = sc->sc_requested_s_scale; |
| 838 | tp->request_r_scale = sc->sc_request_r_scale; | |
| 839 | } | |
| 840 | if (sc->sc_flags & SCF_TIMESTAMP) { | |
| 61896e3c | 841 | tp->t_flags |= TF_REQ_TSTMP | TF_RCVD_TSTMP; |
| 984263bc MD |
842 | tp->ts_recent = sc->sc_tsrecent; |
| 843 | tp->ts_recent_age = ticks; | |
| 844 | } | |
| 91489f6b JH |
845 | if (sc->sc_flags & SCF_SACK_PERMITTED) |
| 846 | tp->t_flags |= TF_SACK_PERMITTED; | |
| 984263bc | 847 | |
| b1992928 MD |
848 | #ifdef TCP_SIGNATURE |
| 849 | if (sc->sc_flags & SCF_SIGNATURE) | |
| 850 | tp->t_flags |= TF_SIGNATURE; | |
| 851 | #endif /* TCP_SIGNATURE */ | |
| 852 | ||
| 853 | ||
| 984263bc MD |
854 | tcp_mss(tp, sc->sc_peer_mss); |
| 855 | ||
| 856 | /* | |
| 857 | * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. | |
| 858 | */ | |
| 859 | if (sc->sc_rxtslot != 0) | |
| f23061d4 | 860 | tp->snd_cwnd = tp->t_maxseg; |
| 2ce132be SZ |
861 | |
| 862 | /* | |
| 863 | * Inherit some properties from the listen socket | |
| 864 | */ | |
| 865 | ltp = intotcpcb(linp); | |
| 866 | tp->t_keepinit = ltp->t_keepinit; | |
| 7ea3a353 | 867 | tp->t_keepidle = ltp->t_keepidle; |
| 5d61ded3 SZ |
868 | tp->t_keepintvl = ltp->t_keepintvl; |
| 869 | tp->t_keepcnt = ltp->t_keepcnt; | |
| 870 | tp->t_maxidle = ltp->t_maxidle; | |
| 2ce132be | 871 | |
| 48e7b118 | 872 | tcp_create_timermsg(tp, port); |
| 2ce132be | 873 | tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); |
| 984263bc MD |
874 | |
| 875 | tcpstat.tcps_accepts++; | |
| 876 | return (so); | |
| 877 | ||
| 878 | abort: | |
| 879 | if (so != NULL) | |
| fd86a41c | 880 | soabort_oncpu(so); |
| 984263bc MD |
881 | return (NULL); |
| 882 | } | |
| 883 | ||
| 884 | /* | |
| 885 | * This function gets called when we receive an ACK for a | |
| 886 | * socket in the LISTEN state. We look up the connection | |
| 887 | * in the syncache, and if its there, we pull it out of | |
| 888 | * the cache and turn it into a full-blown connection in | |
| 889 | * the SYN-RECEIVED state. | |
| 890 | */ | |
| 891 | int | |
| f3f70f0d SW |
892 | syncache_expand(struct in_conninfo *inc, struct tcphdr *th, struct socket **sop, |
| 893 | struct mbuf *m) | |
| 984263bc MD |
894 | { |
| 895 | struct syncache *sc; | |
| 896 | struct syncache_head *sch; | |
| 897 | struct socket *so; | |
| 898 | ||
| 899 | sc = syncache_lookup(inc, &sch); | |
| 900 | if (sc == NULL) { | |
| 901 | /* | |
| f23061d4 | 902 | * There is no syncache entry, so see if this ACK is |
| 984263bc MD |
903 | * a returning syncookie. To do this, first: |
| 904 | * A. See if this socket has had a syncache entry dropped in | |
| 905 | * the past. We don't want to accept a bogus syncookie | |
| f23061d4 | 906 | * if we've never received a SYN. |
| 984263bc MD |
907 | * B. check that the syncookie is valid. If it is, then |
| 908 | * cobble up a fake syncache entry, and return. | |
| 909 | */ | |
| 910 | if (!tcp_syncookies) | |
| 911 | return (0); | |
| 912 | sc = syncookie_lookup(inc, th, *sop); | |
| 913 | if (sc == NULL) | |
| 914 | return (0); | |
| 915 | sch = NULL; | |
| 916 | tcpstat.tcps_sc_recvcookie++; | |
| 917 | } | |
| 918 | ||
| 919 | /* | |
| 920 | * If seg contains an ACK, but not for our SYN/ACK, send a RST. | |
| 921 | */ | |
| 922 | if (th->th_ack != sc->sc_iss + 1) | |
| 923 | return (0); | |
| 924 | ||
| 7e31206a | 925 | so = syncache_socket(sc, *sop, m); |
| 984263bc MD |
926 | if (so == NULL) { |
| 927 | #if 0 | |
| 928 | resetandabort: | |
| 929 | /* XXXjlemon check this - is this correct? */ | |
| f23061d4 | 930 | tcp_respond(NULL, m, m, th, |
| 61896e3c | 931 | th->th_seq + tlen, (tcp_seq)0, TH_RST | TH_ACK); |
| 984263bc MD |
932 | #endif |
| 933 | m_freem(m); /* XXX only needed for above */ | |
| 934 | tcpstat.tcps_sc_aborted++; | |
| 935 | } else { | |
| 984263bc MD |
936 | tcpstat.tcps_sc_completed++; |
| 937 | } | |
| 938 | if (sch == NULL) | |
| 939 | syncache_free(sc); | |
| 940 | else | |
| 941 | syncache_drop(sc, sch); | |
| 942 | *sop = so; | |
| 943 | return (1); | |
| 944 | } | |
| 945 | ||
| 946 | /* | |
| 947 | * Given a LISTEN socket and an inbound SYN request, add | |
| 948 | * this to the syn cache, and send back a segment: | |
| 949 | * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> | |
| 950 | * to the source. | |
| 951 | * | |
| 952 | * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. | |
| 953 | * Doing so would require that we hold onto the data and deliver it | |
| 954 | * to the application. However, if we are the target of a SYN-flood | |
| 955 | * DoS attack, an attacker could send data which would eventually | |
| 956 | * consume all available buffer space if it were ACKed. By not ACKing | |
| 957 | * the data, we avoid this DoS scenario. | |
| 958 | */ | |
| 959 | int | |
| f3f70f0d SW |
960 | syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, |
| 961 | struct socket **sop, struct mbuf *m) | |
| 984263bc | 962 | { |
| 00943fd6 | 963 | struct tcp_syncache_percpu *syncache_percpu; |
| 984263bc MD |
964 | struct tcpcb *tp; |
| 965 | struct socket *so; | |
| 966 | struct syncache *sc = NULL; | |
| 967 | struct syncache_head *sch; | |
| 968 | struct mbuf *ipopts = NULL; | |
| 913d40d1 | 969 | int win; |
| 984263bc | 970 | |
| 00943fd6 | 971 | syncache_percpu = &tcp_syncache_percpu[mycpu->gd_cpuid]; |
| 984263bc MD |
972 | so = *sop; |
| 973 | tp = sototcpcb(so); | |
| 974 | ||
| 975 | /* | |
| 976 | * Remember the IP options, if any. | |
| 977 | */ | |
| 978 | #ifdef INET6 | |
| 979 | if (!inc->inc_isipv6) | |
| 980 | #endif | |
| 7e31206a | 981 | ipopts = ip_srcroute(m); |
| 984263bc MD |
982 | |
| 983 | /* | |
| 984 | * See if we already have an entry for this connection. | |
| 985 | * If we do, resend the SYN,ACK, and reset the retransmit timer. | |
| 986 | * | |
| 987 | * XXX | |
| 91489f6b JH |
988 | * The syncache should be re-initialized with the contents |
| 989 | * of the new SYN which may have different options. | |
| 984263bc MD |
990 | */ |
| 991 | sc = syncache_lookup(inc, &sch); | |
| 992 | if (sc != NULL) { | |
| 993 | tcpstat.tcps_sc_dupsyn++; | |
| 994 | if (ipopts) { | |
| 995 | /* | |
| 996 | * If we were remembering a previous source route, | |
| 997 | * forget it and use the new one we've been given. | |
| 998 | */ | |
| 999 | if (sc->sc_ipopts) | |
| f23061d4 | 1000 | m_free(sc->sc_ipopts); |
| 984263bc MD |
1001 | sc->sc_ipopts = ipopts; |
| 1002 | } | |
| 1003 | /* | |
| 1004 | * Update timestamp if present. | |
| 1005 | */ | |
| 1006 | if (sc->sc_flags & SCF_TIMESTAMP) | |
| 1007 | sc->sc_tsrecent = to->to_tsval; | |
| 91489f6b JH |
1008 | |
| 1009 | /* Just update the TOF_SACK_PERMITTED for now. */ | |
| 1010 | if (tcp_do_sack && (to->to_flags & TOF_SACK_PERMITTED)) | |
| 1011 | sc->sc_flags |= SCF_SACK_PERMITTED; | |
| 1012 | else | |
| 1013 | sc->sc_flags &= ~SCF_SACK_PERMITTED; | |
| 1014 | ||
| 984263bc MD |
1015 | /* |
| 1016 | * PCB may have changed, pick up new values. | |
| 1017 | */ | |
| 1018 | sc->sc_tp = tp; | |
| 1019 | sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; | |
| 1020 | if (syncache_respond(sc, m) == 0) { | |
| 00943fd6 | 1021 | TAILQ_REMOVE(&syncache_percpu->timerq[sc->sc_rxtslot], |
| e5fe3477 | 1022 | sc, sc_timerq); |
| 00943fd6 | 1023 | syncache_timeout(syncache_percpu, sc, sc->sc_rxtslot); |
| f23061d4 | 1024 | tcpstat.tcps_sndacks++; |
| 984263bc MD |
1025 | tcpstat.tcps_sndtotal++; |
| 1026 | } | |
| 1027 | *sop = NULL; | |
| 1028 | return (1); | |
| 1029 | } | |
| 1030 | ||
| d191a96b | 1031 | /* |
| 984263bc MD |
1032 | * Fill in the syncache values. |
| 1033 | */ | |
| 9f42c129 | 1034 | sc = kmalloc(sizeof(struct syncache), M_SYNCACHE, M_WAITOK|M_ZERO); |
| 984263bc MD |
1035 | sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; |
| 1036 | sc->sc_ipopts = ipopts; | |
| 1037 | sc->sc_inc.inc_fport = inc->inc_fport; | |
| 1038 | sc->sc_inc.inc_lport = inc->inc_lport; | |
| e5fe3477 | 1039 | sc->sc_tp = tp; |
| 984263bc MD |
1040 | #ifdef INET6 |
| 1041 | sc->sc_inc.inc_isipv6 = inc->inc_isipv6; | |
| 1042 | if (inc->inc_isipv6) { | |
| 1043 | sc->sc_inc.inc6_faddr = inc->inc6_faddr; | |
| 1044 | sc->sc_inc.inc6_laddr = inc->inc6_laddr; | |
| 1045 | sc->sc_route6.ro_rt = NULL; | |
| 1046 | } else | |
| 1047 | #endif | |
| 1048 | { | |
| 1049 | sc->sc_inc.inc_faddr = inc->inc_faddr; | |
| 1050 | sc->sc_inc.inc_laddr = inc->inc_laddr; | |
| 1051 | sc->sc_route.ro_rt = NULL; | |
| 1052 | } | |
| 1053 | sc->sc_irs = th->th_seq; | |
| 1054 | sc->sc_flags = 0; | |
| 1055 | sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0; | |
| 1056 | if (tcp_syncookies) | |
| 1057 | sc->sc_iss = syncookie_generate(sc); | |
| 1058 | else | |
| 0ced1954 | 1059 | sc->sc_iss = karc4random(); |
| 984263bc | 1060 | |
| 6d49aa6f MD |
1061 | /* Initial receive window: clip ssb_space to [0 .. TCP_MAXWIN] */ |
| 1062 | win = ssb_space(&so->so_rcv); | |
| 984263bc MD |
1063 | win = imax(win, 0); |
| 1064 | win = imin(win, TCP_MAXWIN); | |
| 1065 | sc->sc_wnd = win; | |
| 1066 | ||
| 1067 | if (tcp_do_rfc1323) { | |
| 1068 | /* | |
| 1069 | * A timestamp received in a SYN makes | |
| 1070 | * it ok to send timestamp requests and replies. | |
| 1071 | */ | |
| 1072 | if (to->to_flags & TOF_TS) { | |
| 1073 | sc->sc_tsrecent = to->to_tsval; | |
| 1074 | sc->sc_flags |= SCF_TIMESTAMP; | |
| 1075 | } | |
| 1076 | if (to->to_flags & TOF_SCALE) { | |
| 46e92930 | 1077 | int wscale = TCP_MIN_WINSHIFT; |
| 984263bc MD |
1078 | |
| 1079 | /* Compute proper scaling value from buffer space */ | |
| 1080 | while (wscale < TCP_MAX_WINSHIFT && | |
| 46e92930 | 1081 | (TCP_MAXWIN << wscale) < so->so_rcv.ssb_hiwat) { |
| 984263bc | 1082 | wscale++; |
| 46e92930 | 1083 | } |
| 984263bc MD |
1084 | sc->sc_request_r_scale = wscale; |
| 1085 | sc->sc_requested_s_scale = to->to_requested_s_scale; | |
| 1086 | sc->sc_flags |= SCF_WINSCALE; | |
| 1087 | } | |
| 1088 | } | |
| 91489f6b JH |
1089 | if (tcp_do_sack && (to->to_flags & TOF_SACK_PERMITTED)) |
| 1090 | sc->sc_flags |= SCF_SACK_PERMITTED; | |
| 984263bc MD |
1091 | if (tp->t_flags & TF_NOOPT) |
| 1092 | sc->sc_flags = SCF_NOOPT; | |
| b1992928 MD |
1093 | #ifdef TCP_SIGNATURE |
| 1094 | /* | |
| 1095 | * If listening socket requested TCP digests, and received SYN | |
| 1096 | * contains the option, flag this in the syncache so that | |
| 1097 | * syncache_respond() will do the right thing with the SYN+ACK. | |
| 1098 | * XXX Currently we always record the option by default and will | |
| 1099 | * attempt to use it in syncache_respond(). | |
| 1100 | */ | |
| 1101 | if (to->to_flags & TOF_SIGNATURE) | |
| 1102 | sc->sc_flags = SCF_SIGNATURE; | |
| 1103 | #endif /* TCP_SIGNATURE */ | |
| 984263bc | 1104 | |
| 984263bc MD |
1105 | if (syncache_respond(sc, m) == 0) { |
| 1106 | syncache_insert(sc, sch); | |
| 1107 | tcpstat.tcps_sndacks++; | |
| 1108 | tcpstat.tcps_sndtotal++; | |
| 1109 | } else { | |
| 1110 | syncache_free(sc); | |
| 1111 | tcpstat.tcps_sc_dropped++; | |
| 1112 | } | |
| 1113 | *sop = NULL; | |
| 1114 | return (1); | |
| 1115 | } | |
| 1116 | ||
| 1117 | static int | |
| f3f70f0d | 1118 | syncache_respond(struct syncache *sc, struct mbuf *m) |
| 984263bc MD |
1119 | { |
| 1120 | u_int8_t *optp; | |
| 1121 | int optlen, error; | |
| 1122 | u_int16_t tlen, hlen, mssopt; | |
| 1123 | struct ip *ip = NULL; | |
| 1124 | struct rtentry *rt; | |
| 1125 | struct tcphdr *th; | |
| 984263bc | 1126 | struct ip6_hdr *ip6 = NULL; |
| 61896e3c JH |
1127 | #ifdef INET6 |
| 1128 | const boolean_t isipv6 = sc->sc_inc.inc_isipv6; | |
| 1129 | #else | |
| 1130 | const boolean_t isipv6 = FALSE; | |
| 984263bc MD |
1131 | #endif |
| 1132 | ||
| 61896e3c | 1133 | if (isipv6) { |
| 984263bc MD |
1134 | rt = tcp_rtlookup6(&sc->sc_inc); |
| 1135 | if (rt != NULL) | |
| 1136 | mssopt = rt->rt_ifp->if_mtu - | |
| 1137 | (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); | |
| f23061d4 | 1138 | else |
| 984263bc MD |
1139 | mssopt = tcp_v6mssdflt; |
| 1140 | hlen = sizeof(struct ip6_hdr); | |
| 61896e3c | 1141 | } else { |
| 984263bc MD |
1142 | rt = tcp_rtlookup(&sc->sc_inc); |
| 1143 | if (rt != NULL) | |
| 1144 | mssopt = rt->rt_ifp->if_mtu - | |
| 1145 | (sizeof(struct ip) + sizeof(struct tcphdr)); | |
| f23061d4 | 1146 | else |
| 984263bc MD |
1147 | mssopt = tcp_mssdflt; |
| 1148 | hlen = sizeof(struct ip); | |
| 1149 | } | |
| 1150 | ||
| 1151 | /* Compute the size of the TCP options. */ | |
| 1152 | if (sc->sc_flags & SCF_NOOPT) { | |
| 1153 | optlen = 0; | |
| 1154 | } else { | |
| 1155 | optlen = TCPOLEN_MAXSEG + | |
| 1156 | ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) + | |
| 1157 | ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) + | |
| 91489f6b | 1158 | ((sc->sc_flags & SCF_SACK_PERMITTED) ? |
| f23061d4 | 1159 | TCPOLEN_SACK_PERMITTED_ALIGNED : 0); |
| b1992928 MD |
1160 | #ifdef TCP_SIGNATURE |
| 1161 | optlen += ((sc->sc_flags & SCF_SIGNATURE) ? | |
| 1162 | (TCPOLEN_SIGNATURE + 2) : 0); | |
| 1163 | #endif /* TCP_SIGNATURE */ | |
| 984263bc MD |
1164 | } |
| 1165 | tlen = hlen + sizeof(struct tcphdr) + optlen; | |
| 1166 | ||
| 1167 | /* | |
| 1168 | * XXX | |
| 1169 | * assume that the entire packet will fit in a header mbuf | |
| 1170 | */ | |
| 1171 | KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small")); | |
| 1172 | ||
| 1173 | /* | |
| 1174 | * XXX shouldn't this reuse the mbuf if possible ? | |
| 1175 | * Create the IP+TCP header from scratch. | |
| 1176 | */ | |
| 1177 | if (m) | |
| 1178 | m_freem(m); | |
| 1179 | ||
| 74f1caca | 1180 | m = m_gethdr(MB_DONTWAIT, MT_HEADER); |
| 984263bc MD |
1181 | if (m == NULL) |
| 1182 | return (ENOBUFS); | |
| 1183 | m->m_data += max_linkhdr; | |
| 1184 | m->m_len = tlen; | |
| 1185 | m->m_pkthdr.len = tlen; | |
| 1186 | m->m_pkthdr.rcvif = NULL; | |
| 1187 | ||
| 61896e3c | 1188 | if (isipv6) { |
| 984263bc MD |
1189 | ip6 = mtod(m, struct ip6_hdr *); |
| 1190 | ip6->ip6_vfc = IPV6_VERSION; | |
| 1191 | ip6->ip6_nxt = IPPROTO_TCP; | |
| 1192 | ip6->ip6_src = sc->sc_inc.inc6_laddr; | |
| 1193 | ip6->ip6_dst = sc->sc_inc.inc6_faddr; | |
| 1194 | ip6->ip6_plen = htons(tlen - hlen); | |
| 1195 | /* ip6_hlim is set after checksum */ | |
| 1196 | /* ip6_flow = ??? */ | |
| 1197 | ||
| 1198 | th = (struct tcphdr *)(ip6 + 1); | |
| 61896e3c | 1199 | } else { |
| 984263bc MD |
1200 | ip = mtod(m, struct ip *); |
| 1201 | ip->ip_v = IPVERSION; | |
| 1202 | ip->ip_hl = sizeof(struct ip) >> 2; | |
| 1203 | ip->ip_len = tlen; | |
| 1204 | ip->ip_id = 0; | |
| 1205 | ip->ip_off = 0; | |
| 1206 | ip->ip_sum = 0; | |
| 1207 | ip->ip_p = IPPROTO_TCP; | |
| 1208 | ip->ip_src = sc->sc_inc.inc_laddr; | |
| 1209 | ip->ip_dst = sc->sc_inc.inc_faddr; | |
| 1210 | ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl; /* XXX */ | |
| 1211 | ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos; /* XXX */ | |
| 1212 | ||
| 1213 | /* | |
| 61896e3c JH |
1214 | * See if we should do MTU discovery. Route lookups are |
| 1215 | * expensive, so we will only unset the DF bit if: | |
| 984263bc MD |
1216 | * |
| 1217 | * 1) path_mtu_discovery is disabled | |
| 1218 | * 2) the SCF_UNREACH flag has been set | |
| 1219 | */ | |
| 1220 | if (path_mtu_discovery | |
| 1221 | && ((sc->sc_flags & SCF_UNREACH) == 0)) { | |
| 1222 | ip->ip_off |= IP_DF; | |
| 1223 | } | |
| 1224 | ||
| 1225 | th = (struct tcphdr *)(ip + 1); | |
| 1226 | } | |
| 1227 | th->th_sport = sc->sc_inc.inc_lport; | |
| 1228 | th->th_dport = sc->sc_inc.inc_fport; | |
| 1229 | ||
| 1230 | th->th_seq = htonl(sc->sc_iss); | |
| 1231 | th->th_ack = htonl(sc->sc_irs + 1); | |
| 1232 | th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; | |
| 1233 | th->th_x2 = 0; | |
| 61896e3c | 1234 | th->th_flags = TH_SYN | TH_ACK; |
| 984263bc MD |
1235 | th->th_win = htons(sc->sc_wnd); |
| 1236 | th->th_urp = 0; | |
| 1237 | ||
| 1238 | /* Tack on the TCP options. */ | |
| 1239 | if (optlen == 0) | |
| 1240 | goto no_options; | |
| 1241 | optp = (u_int8_t *)(th + 1); | |
| 1242 | *optp++ = TCPOPT_MAXSEG; | |
| 1243 | *optp++ = TCPOLEN_MAXSEG; | |
| 1244 | *optp++ = (mssopt >> 8) & 0xff; | |
| 1245 | *optp++ = mssopt & 0xff; | |
| 1246 | ||
| 1247 | if (sc->sc_flags & SCF_WINSCALE) { | |
| 1248 | *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | | |
| 1249 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | | |
| 1250 | sc->sc_request_r_scale); | |
| 1251 | optp += 4; | |
| 1252 | } | |
| 1253 | ||
| 1254 | if (sc->sc_flags & SCF_TIMESTAMP) { | |
| 1255 | u_int32_t *lp = (u_int32_t *)(optp); | |
| 1256 | ||
| 1257 | /* Form timestamp option as shown in appendix A of RFC 1323. */ | |
| 1258 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); | |
| 1259 | *lp++ = htonl(ticks); | |
| 1260 | *lp = htonl(sc->sc_tsrecent); | |
| 1261 | optp += TCPOLEN_TSTAMP_APPA; | |
| 1262 | } | |
| 1263 | ||
| b1992928 MD |
1264 | #ifdef TCP_SIGNATURE |
| 1265 | /* | |
| 1266 | * Handle TCP-MD5 passive opener response. | |
| 1267 | */ | |
| 1268 | if (sc->sc_flags & SCF_SIGNATURE) { | |
| 1269 | u_int8_t *bp = optp; | |
| 1270 | int i; | |
| 1271 | ||
| 1272 | *bp++ = TCPOPT_SIGNATURE; | |
| 1273 | *bp++ = TCPOLEN_SIGNATURE; | |
| 1274 | for (i = 0; i < TCP_SIGLEN; i++) | |
| 1275 | *bp++ = 0; | |
| 1276 | tcpsignature_compute(m, 0, optlen, | |
| 1277 | optp + 2, IPSEC_DIR_OUTBOUND); | |
| 1278 | *bp++ = TCPOPT_NOP; | |
| 1279 | *bp++ = TCPOPT_EOL; | |
| 1280 | optp += TCPOLEN_SIGNATURE + 2; | |
| 1281 | } | |
| 1282 | #endif /* TCP_SIGNATURE */ | |
| 1283 | ||
| 91489f6b JH |
1284 | if (sc->sc_flags & SCF_SACK_PERMITTED) { |
| 1285 | *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); | |
| 1286 | optp += TCPOLEN_SACK_PERMITTED_ALIGNED; | |
| 1287 | } | |
| 1288 | ||
| 61896e3c JH |
1289 | no_options: |
| 1290 | if (isipv6) { | |
| 984263bc MD |
1291 | struct route_in6 *ro6 = &sc->sc_route6; |
| 1292 | ||
| 1293 | th->th_sum = 0; | |
| 1294 | th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); | |
| 1295 | ip6->ip6_hlim = in6_selecthlim(NULL, | |
| 1296 | ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL); | |
| 1297 | error = ip6_output(m, NULL, ro6, 0, NULL, NULL, | |
| 1298 | sc->sc_tp->t_inpcb); | |
| 61896e3c | 1299 | } else { |
| f23061d4 JH |
1300 | th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, |
| 1301 | htons(tlen - hlen + IPPROTO_TCP)); | |
| 984263bc MD |
1302 | m->m_pkthdr.csum_flags = CSUM_TCP; |
| 1303 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | |
| 1dbb3516 SZ |
1304 | error = ip_output(m, sc->sc_ipopts, &sc->sc_route, |
| 1305 | IP_DEBUGROUTE, NULL, sc->sc_tp->t_inpcb); | |
| 984263bc MD |
1306 | } |
| 1307 | return (error); | |
| 1308 | } | |
| 1309 | ||
| 1310 | /* | |
| 1311 | * cookie layers: | |
| 1312 | * | |
| 1313 | * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .| | |
| 1314 | * | peer iss | | |
| 1315 | * | MD5(laddr,faddr,secret,lport,fport) |. . . . . . .| | |
| 1316 | * | 0 |(A)| | | |
| 1317 | * (A): peer mss index | |
| 1318 | */ | |
| 1319 | ||
| 1320 | /* | |
| 1321 | * The values below are chosen to minimize the size of the tcp_secret | |
| 1322 | * table, as well as providing roughly a 16 second lifetime for the cookie. | |
| 1323 | */ | |
| 1324 | ||
| 1325 | #define SYNCOOKIE_WNDBITS 5 /* exposed bits for window indexing */ | |
| 1326 | #define SYNCOOKIE_TIMESHIFT 1 /* scale ticks to window time units */ | |
| 1327 | ||
| 1328 | #define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1) | |
| 1329 | #define SYNCOOKIE_NSECRETS (1 << SYNCOOKIE_WNDBITS) | |
| 1330 | #define SYNCOOKIE_TIMEOUT \ | |
| 1331 | (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT)) | |
| f23061d4 | 1332 | #define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK) |
| 984263bc MD |
1333 | |
| 1334 | static struct { | |
| 1335 | u_int32_t ts_secbits[4]; | |
| 1336 | u_int ts_expire; | |
| 1337 | } tcp_secret[SYNCOOKIE_NSECRETS]; | |
| 1338 | ||
| 1339 | static int tcp_msstab[] = { 0, 536, 1460, 8960 }; | |
| 1340 | ||
| 1341 | static MD5_CTX syn_ctx; | |
| 1342 | ||
| 1343 | #define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v)) | |
| 1344 | ||
| 1345 | struct md5_add { | |
| 1346 | u_int32_t laddr, faddr; | |
| 1347 | u_int32_t secbits[4]; | |
| 1348 | u_int16_t lport, fport; | |
| 1349 | }; | |
| 1350 | ||
| 1351 | #ifdef CTASSERT | |
| 1352 | CTASSERT(sizeof(struct md5_add) == 28); | |
| 1353 | #endif | |
| 1354 | ||
| 1355 | /* | |
| 1356 | * Consider the problem of a recreated (and retransmitted) cookie. If the | |
| f23061d4 JH |
1357 | * original SYN was accepted, the connection is established. The second |
| 1358 | * SYN is inflight, and if it arrives with an ISN that falls within the | |
| 1359 | * receive window, the connection is killed. | |
| 984263bc MD |
1360 | * |
| 1361 | * However, since cookies have other problems, this may not be worth | |
| 1362 | * worrying about. | |
| 1363 | */ | |
| 1364 | ||
| 1365 | static u_int32_t | |
| 1366 | syncookie_generate(struct syncache *sc) | |
| 1367 | { | |
| 1368 | u_int32_t md5_buffer[4]; | |
| 1369 | u_int32_t data; | |
| 1370 | int idx, i; | |
| 1371 | struct md5_add add; | |
| 61896e3c JH |
1372 | #ifdef INET6 |
| 1373 | const boolean_t isipv6 = sc->sc_inc.inc_isipv6; | |
| 1374 | #else | |
| 1375 | const boolean_t isipv6 = FALSE; | |
| 1376 | #endif | |
| 984263bc MD |
1377 | |
| 1378 | idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK; | |
| 1379 | if (tcp_secret[idx].ts_expire < ticks) { | |
| 1380 | for (i = 0; i < 4; i++) | |
| 0ced1954 | 1381 | tcp_secret[idx].ts_secbits[i] = karc4random(); |
| 984263bc MD |
1382 | tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT; |
| 1383 | } | |
| 1384 | for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--) | |
| 1385 | if (tcp_msstab[data] <= sc->sc_peer_mss) | |
| 1386 | break; | |
| 1387 | data = (data << SYNCOOKIE_WNDBITS) | idx; | |
| 1388 | data ^= sc->sc_irs; /* peer's iss */ | |
| 1389 | MD5Init(&syn_ctx); | |
| 61896e3c | 1390 | if (isipv6) { |
| 984263bc MD |
1391 | MD5Add(sc->sc_inc.inc6_laddr); |
| 1392 | MD5Add(sc->sc_inc.inc6_faddr); | |
| 1393 | add.laddr = 0; | |
| 1394 | add.faddr = 0; | |
| 61896e3c | 1395 | } else { |
| 984263bc MD |
1396 | add.laddr = sc->sc_inc.inc_laddr.s_addr; |
| 1397 | add.faddr = sc->sc_inc.inc_faddr.s_addr; | |
| 1398 | } | |
| 1399 | add.lport = sc->sc_inc.inc_lport; | |
| 1400 | add.fport = sc->sc_inc.inc_fport; | |
| 1401 | add.secbits[0] = tcp_secret[idx].ts_secbits[0]; | |
| 1402 | add.secbits[1] = tcp_secret[idx].ts_secbits[1]; | |
| 1403 | add.secbits[2] = tcp_secret[idx].ts_secbits[2]; | |
| 1404 | add.secbits[3] = tcp_secret[idx].ts_secbits[3]; | |
| 1405 | MD5Add(add); | |
| 1406 | MD5Final((u_char *)&md5_buffer, &syn_ctx); | |
| 1407 | data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK); | |
| 1408 | return (data); | |
| 1409 | } | |
| 1410 | ||
| 1411 | static struct syncache * | |
| f3f70f0d | 1412 | syncookie_lookup(struct in_conninfo *inc, struct tcphdr *th, struct socket *so) |
| 984263bc MD |
1413 | { |
| 1414 | u_int32_t md5_buffer[4]; | |
| 1415 | struct syncache *sc; | |
| 1416 | u_int32_t data; | |
| 1417 | int wnd, idx; | |
| 1418 | struct md5_add add; | |
| 1419 | ||
| 1420 | data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */ | |
| 1421 | idx = data & SYNCOOKIE_WNDMASK; | |
| 1422 | if (tcp_secret[idx].ts_expire < ticks || | |
| 1423 | sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) | |
| 1424 | return (NULL); | |
| 1425 | MD5Init(&syn_ctx); | |
| 1426 | #ifdef INET6 | |
| 1427 | if (inc->inc_isipv6) { | |
| 1428 | MD5Add(inc->inc6_laddr); | |
| 1429 | MD5Add(inc->inc6_faddr); | |
| 1430 | add.laddr = 0; | |
| 1431 | add.faddr = 0; | |
| 1432 | } else | |
| 1433 | #endif | |
| 1434 | { | |
| 1435 | add.laddr = inc->inc_laddr.s_addr; | |
| 1436 | add.faddr = inc->inc_faddr.s_addr; | |
| 1437 | } | |
| 1438 | add.lport = inc->inc_lport; | |
| 1439 | add.fport = inc->inc_fport; | |
| 1440 | add.secbits[0] = tcp_secret[idx].ts_secbits[0]; | |
| 1441 | add.secbits[1] = tcp_secret[idx].ts_secbits[1]; | |
| 1442 | add.secbits[2] = tcp_secret[idx].ts_secbits[2]; | |
| 1443 | add.secbits[3] = tcp_secret[idx].ts_secbits[3]; | |
| 1444 | MD5Add(add); | |
| 1445 | MD5Final((u_char *)&md5_buffer, &syn_ctx); | |
| 1446 | data ^= md5_buffer[0]; | |
| f23061d4 | 1447 | if (data & ~SYNCOOKIE_DATAMASK) |
| 984263bc MD |
1448 | return (NULL); |
| 1449 | data = data >> SYNCOOKIE_WNDBITS; | |
| 1450 | ||
| d191a96b | 1451 | /* |
| 984263bc MD |
1452 | * Fill in the syncache values. |
| 1453 | * XXX duplicate code from syncache_add | |
| 1454 | */ | |
| 9f42c129 | 1455 | sc = kmalloc(sizeof(struct syncache), M_SYNCACHE, M_WAITOK|M_ZERO); |
| 984263bc MD |
1456 | sc->sc_ipopts = NULL; |
| 1457 | sc->sc_inc.inc_fport = inc->inc_fport; | |
| 1458 | sc->sc_inc.inc_lport = inc->inc_lport; | |
| 1459 | #ifdef INET6 | |
| 1460 | sc->sc_inc.inc_isipv6 = inc->inc_isipv6; | |
| 1461 | if (inc->inc_isipv6) { | |
| 1462 | sc->sc_inc.inc6_faddr = inc->inc6_faddr; | |
| 1463 | sc->sc_inc.inc6_laddr = inc->inc6_laddr; | |
| 1464 | sc->sc_route6.ro_rt = NULL; | |
| 1465 | } else | |
| 1466 | #endif | |
| 1467 | { | |
| 1468 | sc->sc_inc.inc_faddr = inc->inc_faddr; | |
| 1469 | sc->sc_inc.inc_laddr = inc->inc_laddr; | |
| 1470 | sc->sc_route.ro_rt = NULL; | |
| 1471 | } | |
| 1472 | sc->sc_irs = th->th_seq - 1; | |
| 1473 | sc->sc_iss = th->th_ack - 1; | |
| 6d49aa6f | 1474 | wnd = ssb_space(&so->so_rcv); |
| 984263bc MD |
1475 | wnd = imax(wnd, 0); |
| 1476 | wnd = imin(wnd, TCP_MAXWIN); | |
| 1477 | sc->sc_wnd = wnd; | |
| 1478 | sc->sc_flags = 0; | |
| 1479 | sc->sc_rxtslot = 0; | |
| 1480 | sc->sc_peer_mss = tcp_msstab[data]; | |
| 1481 | return (sc); | |
| 1482 | } |