| Commit | Line | Data |
|---|---|---|
| 984263bc | 1 | /* |
| df80f2ea | 2 | * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. |
| 984263bc MD |
3 | * Copyright (c) 1982, 1986, 1988, 1990, 1993 |
| 4 | * The Regents of the University of California. All rights reserved. | |
| 5 | * | |
| 6 | * Redistribution and use in source and binary forms, with or without | |
| 7 | * modification, are permitted provided that the following conditions | |
| 8 | * are met: | |
| 9 | * 1. Redistributions of source code must retain the above copyright | |
| 10 | * notice, this list of conditions and the following disclaimer. | |
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer in the | |
| 13 | * documentation and/or other materials provided with the distribution. | |
| 14 | * 3. All advertising materials mentioning features or use of this software | |
| 15 | * must display the following acknowledgement: | |
| 16 | * This product includes software developed by the University of | |
| 17 | * California, Berkeley and its contributors. | |
| 18 | * 4. Neither the name of the University nor the names of its contributors | |
| 19 | * may be used to endorse or promote products derived from this software | |
| 20 | * without specific prior written permission. | |
| 21 | * | |
| 22 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 23 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 24 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 25 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 26 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 27 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 28 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 29 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 31 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 32 | * SUCH DAMAGE. | |
| 33 | * | |
| 34 | * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 | |
| 35 | * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ | |
| 9116be8e | 36 | * $DragonFly: src/sys/kern/uipc_socket2.c,v 1.33 2008/09/02 16:17:52 dillon Exp $ |
| 984263bc MD |
37 | */ |
| 38 | ||
| 39 | #include "opt_param.h" | |
| 40 | #include <sys/param.h> | |
| 41 | #include <sys/systm.h> | |
| 42 | #include <sys/domain.h> | |
| 43 | #include <sys/file.h> /* for maxfiles */ | |
| 44 | #include <sys/kernel.h> | |
| 45 | #include <sys/proc.h> | |
| 46 | #include <sys/malloc.h> | |
| 47 | #include <sys/mbuf.h> | |
| 48 | #include <sys/protosw.h> | |
| 49 | #include <sys/resourcevar.h> | |
| 50 | #include <sys/stat.h> | |
| 51 | #include <sys/socket.h> | |
| 52 | #include <sys/socketvar.h> | |
| 002c1265 | 53 | #include <sys/socketops.h> |
| 984263bc MD |
54 | #include <sys/signalvar.h> |
| 55 | #include <sys/sysctl.h> | |
| 56 | #include <sys/aio.h> /* for aio_swake proto */ | |
| 57 | #include <sys/event.h> | |
| 58 | ||
| c1d0003c JH |
59 | #include <sys/thread2.h> |
| 60 | #include <sys/msgport2.h> | |
| 6cef7136 | 61 | #include <sys/socketvar2.h> |
| c1d0003c | 62 | |
| 984263bc MD |
63 | int maxsockets; /* socket limit; raised to at least max(maxfiles, nmbclusters) by init_maxsockets() */ |
| 64 | ||
| 65 | /* | |
| 66 | * Primitive routines for operating on sockets and socket buffers | |
| 67 | */ | |
| 68 | ||
| 69 | u_long sb_max = SB_MAX; /* socket buffer size cap, exported as kern.ipc.maxsockbuf */ | |
| 70 | u_long sb_max_adj = | |
| 71 | SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max: sb_max scaled by MCLBYTES / (MSIZE + MCLBYTES); recomputed by sysctl_handle_sb_max() */ | |
| 72 | ||
| 73 | static u_long sb_efficiency = 8; /* multiplier applied to cc to derive ssb_mbmax in ssb_reserve(); exported as kern.ipc.sockbuf_waste_factor */ | |
| 74 | ||
| 6d49aa6f MD |
75 | /************************************************************************ |
| 76 | * signalsockbuf procedures * | |
| 77 | ************************************************************************/ | |
| 78 | ||
| 79 | /* | |
| 80 | * Wait for data to arrive at/drain from a socket buffer. | |
| 14343ad3 MD |
81 | * |
| 82 | * NOTE: Caller must generally hold the ssb_lock (client side lock) since | |
| 83 | * WAIT/WAKEUP only works for one client at a time. | |
| 84 | * | |
| 85 | * NOTE: Caller always retries whatever operation it was waiting on. | |
| | * | |
| | * Returns 0 without sleeping if SSB_WAKEUP was already pending (the flag | |
| | * is consumed here). Otherwise SSB_WAIT is set and the result of | |
| | * tsleep() on &ssb->ssb_cc, using ssb->ssb_timeo as the timeout | |
| | * argument, is returned. | |
| 6d49aa6f MD |
86 | */ |
| 87 | int | |
| 88 | ssb_wait(struct signalsockbuf *ssb) | |
| 89 | { | |
| 14343ad3 MD |
90 | uint32_t flags; |
| 91 | int pflags; | |
| 92 | int error; | |
| 93 | ||
| 94 | pflags = (ssb->ssb_flags & SSB_NOINTR) ? 0 : PCATCH; /* sleep with PCATCH unless the buffer is flagged SSB_NOINTR */ | |
| 95 | ||
| 96 | for (;;) { | |
| 97 | flags = ssb->ssb_flags; /* snapshot used as the expected value for the cmpset calls below */ | |
| 98 | cpu_ccfence(); | |
| 99 | ||
| 100 | /* | |
| 101 | * WAKEUP and WAIT interlock each other. We can catch the | |
| 102 | * race by checking to see if WAKEUP has already been set, | |
| 103 | * and only setting WAIT if WAKEUP is clear. | |
| 104 | */ | |
| 105 | if (flags & SSB_WAKEUP) { | |
| 106 | if (atomic_cmpset_int(&ssb->ssb_flags, flags, | |
| 107 | flags & ~SSB_WAKEUP)) { | |
| 108 | error = 0; | |
| 109 | break; | |
| 110 | } | |
| 111 | continue; /* ssb_flags changed under us; reload and retry */ | |
| 112 | } | |
| 6d49aa6f | 113 | |
| 14343ad3 MD |
114 | /* |
| 115 | * Only set WAIT if WAKEUP is clear. | |
| | * | |
| | * The interlock is armed before WAIT is published. If the | |
| | * cmpset fails we simply loop and re-evaluate the flags. | |
| 116 | */ | |
| 117 | tsleep_interlock(&ssb->ssb_cc, pflags); | |
| 118 | if (atomic_cmpset_int(&ssb->ssb_flags, flags, | |
| 119 | flags | SSB_WAIT)) { | |
| 120 | error = tsleep(&ssb->ssb_cc, pflags | PINTERLOCKED, | |
| 121 | "sbwait", ssb->ssb_timeo); | |
| 122 | break; | |
| 123 | } | |
| 124 | } | |
| 125 | return (error); | |
| 6d49aa6f MD |
126 | } |
| 127 | ||
| 128 | /* | |
| 129 | * Lock a sockbuf already known to be locked; | |
| 130 | * return any error returned from sleep (EINTR). | |
| | * | |
| | * While SSB_LOCK is held by someone else, SSB_WANT is set and the | |
| | * caller sleeps on &ssb->ssb_flags. A sleep that returns 0 simply | |
| | * loops and re-examines the flags. Once SSB_LOCK is observed clear | |
| | * it is set with an atomic cmpset and ssb->ssb_token is acquired. | |
| | * | |
| | * Returns 0 with SSB_LOCK set and ssb_token held, or the non-zero | |
| | * tsleep() error with neither acquired by this call. | |
| 131 | */ | |
| 132 | int | |
| 133 | _ssb_lock(struct signalsockbuf *ssb) | |
| 134 | { | |
| 14343ad3 MD |
135 | uint32_t flags; |
| 136 | int pflags; | |
| 6d49aa6f MD |
137 | int error; |
| 138 | ||
| 14343ad3 MD |
139 | pflags = (ssb->ssb_flags & SSB_NOINTR) ? 0 : PCATCH; /* sleep with PCATCH unless the buffer is flagged SSB_NOINTR */ |
| 140 | ||
| 141 | for (;;) { | |
| 142 | flags = ssb->ssb_flags; /* snapshot used as the expected value for the cmpset calls below */ | |
| 143 | cpu_ccfence(); | |
| 144 | if (flags & SSB_LOCK) { | |
| 145 | tsleep_interlock(&ssb->ssb_flags, pflags); | |
| 146 | if (atomic_cmpset_int(&ssb->ssb_flags, flags, | |
| 147 | flags | SSB_WANT)) { | |
| 148 | error = tsleep(&ssb->ssb_flags, | |
| 149 | pflags | PINTERLOCKED, | |
| 150 | "sblock", 0); | |
| 151 | if (error) | |
| 152 | break; /* sleep failed; give up without the lock */ | |
| 153 | } | |
| 154 | } else { | |
| 155 | if (atomic_cmpset_int(&ssb->ssb_flags, flags, | |
| 156 | flags | SSB_LOCK)) { | |
| 6cef7136 | 157 | lwkt_gettoken(&ssb->ssb_token); |
| 14343ad3 MD |
158 | error = 0; |
| 159 | break; | |
| 160 | } | |
| 161 | } | |
| 6d49aa6f | 162 | } |
| 14343ad3 | 163 | return (error); |
| 6d49aa6f MD |
164 | } |
| 165 | ||
| 984263bc | 166 | /* |
| 6d49aa6f MD |
167 | * This does the same for sockbufs. Note that the xsockbuf structure, |
| 168 | * since it is always embedded in a socket, does not include a self | |
| 169 | * pointer nor a length. We make this entry point public in case | |
| 170 | * some other mechanism needs it. | |
| 171 | */ | |
| 172 | void | |
| 173 | ssbtoxsockbuf(struct signalsockbuf *ssb, struct xsockbuf *xsb) | |
| 174 | { | |
| 175 | xsb->sb_cc = ssb->ssb_cc; | |
| 176 | xsb->sb_hiwat = ssb->ssb_hiwat; | |
| 177 | xsb->sb_mbcnt = ssb->ssb_mbcnt; | |
| 178 | xsb->sb_mbmax = ssb->ssb_mbmax; | |
| 179 | xsb->sb_lowat = ssb->ssb_lowat; | |
| 180 | xsb->sb_flags = ssb->ssb_flags; | |
| 181 | xsb->sb_timeo = ssb->ssb_timeo; | |
| 182 | } | |
| 183 | ||
| 184 | ||
| 185 | /************************************************************************ | |
| 186 | * Procedures which manipulate socket state flags, wakeups, etc. * | |
| 187 | ************************************************************************ | |
| 188 | * | |
| 189 | * Normal sequence from the active (originating) side is that | |
| 190 | * soisconnecting() is called during processing of connect() call, resulting | |
| 191 | * in an eventual call to soisconnected() if/when the connection is | |
| 192 | * established. When the connection is torn down soisdisconnecting() is | |
| 193 | * called during processing of disconnect() call, and soisdisconnected() is | |
| 194 | * called when the connection to the peer is totally severed. | |
| 984263bc | 195 | * |
| 6d49aa6f MD |
196 | * The semantics of these routines are such that connectionless protocols |
| 197 | * can call soisconnected() and soisdisconnected() only, bypassing the | |
| 198 | * in-progress calls when setting up a ``connection'' takes no time. | |
| 984263bc | 199 | * |
| 6d49aa6f MD |
200 | * From the passive side, a socket is created with two queues of sockets: |
| 201 | * so_incomp for connections in progress and so_comp for connections | |
| 202 | * already made and awaiting user acceptance. As a protocol is preparing | |
| 203 | * incoming connections, it creates a socket structure queued on so_incomp | |
| 204 | * by calling sonewconn(). When the connection is established, | |
| 205 | * soisconnected() is called, and transfers the socket structure to so_comp, | |
| 206 | * making it available to accept(). | |
| 984263bc | 207 | * |
| 6d49aa6f MD |
208 | * If a socket is closed with sockets on either so_incomp or so_comp, these |
| 209 | * sockets are dropped. | |
| 210 | * | |
| 211 | * If higher level protocols are implemented in the kernel, the wakeups | |
| 212 | * done here will sometimes cause software-interrupt process scheduling. | |
| 984263bc MD |
213 | */ |
| 214 | ||
| 215 | void | |
| c972a82f | 216 | soisconnecting(struct socket *so) |
| 984263bc | 217 | { |
| 6cef7136 MD |
218 | soclrstate(so, SS_ISCONNECTED | SS_ISDISCONNECTING); |
| 219 | sosetstate(so, SS_ISCONNECTING); | |
| 984263bc MD |
220 | } |
| 221 | /*
| | * Mark a socket as connected and wake up whoever is waiting for that.
| | *
| | * SS_ISCONNECTING, SS_ISDISCONNECTING and SS_ISCONFIRMING are cleared
| | * and SS_ISCONNECTED is set.  If the socket still has a head and is
| | * flagged SS_INCOMP it is either handed to the head's accept filter
| | * callback (SO_ACCEPTFILTER set) or moved from the head's so_incomp
| | * queue to its so_comp queue, after which the head is woken up.
| | * Otherwise sleepers on the socket itself are woken up.
| | *
| | * The head's pool token is held while the queues are manipulated and
| | * is released on every return path.
| | */||
| 222 | void | |
| c972a82f | 223 | soisconnected(struct socket *so) |
| 984263bc | 224 | { |
| 5217bcbc MD |
225 | struct socket *head; |
| 226 | ||
| 227 | while ((head = so->so_head) != NULL) { /* take head's pool token, retrying if so_head changed meanwhile */ | |
| 228 | lwkt_getpooltoken(head); | |
| 229 | if (so->so_head == head) | |
| 230 | break; | |
| 231 | lwkt_relpooltoken(head); | |
| 232 | } | |
| 984263bc | 233 | |
| 6cef7136 MD |
234 | soclrstate(so, SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING); |
| 235 | sosetstate(so, SS_ISCONNECTED); | |
| 984263bc MD |
236 | if (head && (so->so_state & SS_INCOMP)) { |
| 237 | if ((so->so_options & SO_ACCEPTFILTER) != 0) { /* install and run the head's accept filter instead of queueing on so_comp */ | |
| 238 | so->so_upcall = head->so_accf->so_accept_filter->accf_callback; | |
| 239 | so->so_upcallarg = head->so_accf->so_accept_filter_arg; | |
| 14343ad3 | 240 | atomic_set_int(&so->so_rcv.ssb_flags, SSB_UPCALL); |
| 984263bc MD |
241 | so->so_options &= ~SO_ACCEPTFILTER; |
| 242 | so->so_upcall(so, so->so_upcallarg, 0); | |
| 5217bcbc | 243 | lwkt_relpooltoken(head); |
| 984263bc MD |
244 | return; |
| 245 | } | |
| 6cef7136 MD |
246 | |
| 247 | /* | |
| 248 | * Listen sockets are not per-cpu. | |
| | * | |
| | * Move so from the incomplete queue to the completed queue | |
| | * and swap SS_INCOMP for SS_COMP accordingly. | |
| 249 | */ | |
| 984263bc MD |
250 | TAILQ_REMOVE(&head->so_incomp, so, so_list); |
| 251 | head->so_incqlen--; | |
| 984263bc MD |
252 | TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); |
| 253 | head->so_qlen++; | |
| 6cef7136 | 254 | sosetstate(so, SS_COMP); |
| e28d8186 | 255 | soclrstate(so, SS_INCOMP); |
| 6cef7136 | 256 | |
| 5dfe1a1a MD |
257 | /* |
| 258 | * XXX head may be on a different protocol thread. | |
| 259 | * sorwakeup()->sowakeup() is hacked atm. | |
| 260 | */ | |
| 984263bc MD |
261 | sorwakeup(head); |
| 262 | wakeup_one(&head->so_timeo); | |
| 263 | } else { | |
| 264 | wakeup(&so->so_timeo); /* no pending-accept handling needed: wake sleepers on so itself */ | |
| 265 | sorwakeup(so); | |
| 266 | sowwakeup(so); | |
| 267 | } | |
| 5217bcbc MD |
268 | if (head) |
| 269 | lwkt_relpooltoken(head); | |
| 984263bc MD |
270 | } |
| 271 | ||
| 272 | void | |
| c972a82f | 273 | soisdisconnecting(struct socket *so) |
| 984263bc | 274 | { |
| 6cef7136 MD |
275 | soclrstate(so, SS_ISCONNECTING); |
| 276 | sosetstate(so, SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE); | |
| 984263bc MD |
277 | wakeup((caddr_t)&so->so_timeo); |
| 278 | sowwakeup(so); | |
| 279 | sorwakeup(so); | |
| 280 | } | |
| 281 | ||
| 282 | void | |
| c972a82f | 283 | soisdisconnected(struct socket *so) |
| 984263bc | 284 | { |
| 6cef7136 MD |
285 | soclrstate(so, SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING); |
| 286 | sosetstate(so, SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED); | |
| 984263bc | 287 | wakeup((caddr_t)&so->so_timeo); |
| 6d49aa6f | 288 | sbdrop(&so->so_snd.sb, so->so_snd.ssb_cc); |
| 984263bc MD |
289 | sowwakeup(so); |
| 290 | sorwakeup(so); | |
| 291 | } | |
| 292 | ||
| 0f2e13ef SG |
293 | void |
| 294 | soisreconnecting(struct socket *so) | |
| 295 | { | |
| 6cef7136 MD |
296 | soclrstate(so, SS_ISDISCONNECTING | SS_ISDISCONNECTED | |
| 297 | SS_CANTRCVMORE | SS_CANTSENDMORE); | |
| 298 | sosetstate(so, SS_ISCONNECTING); | |
| 0f2e13ef SG |
299 | } |
| 300 | ||
| 301 | void | |
| 302 | soisreconnected(struct socket *so) | |
| 303 | { | |
| 6cef7136 | 304 | soclrstate(so, SS_ISDISCONNECTED | SS_CANTRCVMORE | SS_CANTSENDMORE); |
| 0f2e13ef SG |
305 | soisconnected(so); |
| 306 | } | |
| 307 | ||
| 984263bc | 308 | /* |
| 48e7b118 MD |
309 | * Set or change the message port a socket receives commands on. |
| 310 | * | |
| 311 | * XXX | |
| 312 | */ | |
| 313 | void | |
| 314 | sosetport(struct socket *so, lwkt_port_t port) | |
| 315 | { | |
| 316 | so->so_port = port; | |
| 317 | } | |
| 318 | ||
| 319 | /* | |
| 984263bc MD |
320 | * When an attempt at a new connection is noted on a socket |
| 321 | * which accepts connections, sonewconn is called. If the | |
| 322 | * connection is possible (subject to space constraints, etc.) | |
| 323 | * then we allocate a new structure, properly linked into the | |
| 324 | * data structure of the original socket, and return this. | |
| 325 | * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. | |
| | * A non-zero connstatus queues the new socket on so_comp and is | |
| | * applied with sosetstate(); zero queues it on so_incomp. | |
| 6cef7136 MD |
326 | * |
| 327 | * The new socket is returned with one ref and so_pcb assigned. | |
| 328 | * The reference is implied by so_pcb. | |
| | * | |
| | * Returns NULL when head's so_qlen exceeds 3/2 of so_qlimit, when | |
| | * soalloc() fails, or when soreserve() or so_pru_attach_direct() | |
| | * fails. | |
| 984263bc MD |
329 | */ |
| 330 | struct socket * | |
| dadab5e9 | 331 | sonewconn(struct socket *head, int connstatus) |
| 984263bc | 332 | { |
| 1fd87d54 | 333 | struct socket *so; |
| 4402d8a2 | 334 | struct socket *sp; |
| e4700d00 | 335 | struct pru_attach_info ai; |
| 984263bc MD |
336 | |
| 337 | if (head->so_qlen > 3 * head->so_qlimit / 2) /* completed queue already beyond 1.5x the limit: refuse */ | |
| 60233e58 | 338 | return (NULL); |
| 87de5057 | 339 | so = soalloc(1); |
| 984263bc | 340 | if (so == NULL) |
| 87de5057 | 341 | return (NULL); |
| 0ce0603e MD |
342 | |
| 343 | /* | |
| 344 | * Set the port prior to attaching the inpcb to the current | |
| 345 | * cpu's protocol thread (which should be the current thread | |
| 346 | * but might not be in all cases). This serializes any pcb ops | |
| 347 | * which occur to our cpu allowing us to complete the attachment | |
| 348 | * without racing anything. | |
| 349 | */ | |
| 350 | sosetport(so, cpu_portfn(mycpu->gd_cpuid)); | |
| 984263bc MD |
351 | if ((head->so_options & SO_ACCEPTFILTER) != 0) |
| 352 | connstatus = 0; /* with an accept filter the socket always starts on so_incomp */ | |
| 353 | so->so_head = head; | |
| 354 | so->so_type = head->so_type; | |
| 355 | so->so_options = head->so_options &~ SO_ACCEPTCONN; | |
| 356 | so->so_linger = head->so_linger; | |
| 6cef7136 MD |
357 | |
| 358 | /* | |
| 359 | * NOTE: Clearing NOFDREF implies referencing the so with | |
| 360 | * soreference(). | |
| 361 | */ | |
| e28d8186 | 362 | so->so_state = head->so_state | SS_NOFDREF | SS_ASSERTINPROG; |
| 984263bc | 363 | so->so_proto = head->so_proto; |
| dadab5e9 | 364 | so->so_cred = crhold(head->so_cred); |
| e4700d00 JH |
365 | ai.sb_rlimit = NULL; |
| 366 | ai.p_ucred = NULL; | |
| 367 | ai.fd_rdir = NULL; /* jail code cruft XXX JH */ | |
| 6cef7136 MD |
368 | |
| 369 | /* | |
| 002c1265 | 370 | * Reserve space and call pru_attach. We can direct-call the |
| 6cef7136 MD |
371 | * function since we're already in the protocol thread. |
| | * | |
| | * On failure so_head is detached again and the socket is | |
| | * released with sofree(). | |
| 372 | */ | |
| 373 | if (soreserve(so, head->so_snd.ssb_hiwat, | |
| 374 | head->so_rcv.ssb_hiwat, NULL) || | |
| 002c1265 | 375 | so_pru_attach_direct(so, 0, &ai)) { |
| 6cef7136 | 376 | so->so_head = NULL; |
| e28d8186 | 377 | soclrstate(so, SS_ASSERTINPROG); |
| 6cef7136 | 378 | sofree(so); /* remove implied pcb ref */ |
| 60233e58 | 379 | return (NULL); |
| 984263bc | 380 | } |
| 6cef7136 MD |
381 | KKASSERT(so->so_refs == 2); /* attach + our base ref */ |
| 382 | sofree(so); | |
| 48e7b118 | 383 | KKASSERT(so->so_port != NULL); |
| 5b0b9fa5 PA |
384 | so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat; /* inherit low-water marks and timeouts from the listen socket */ |
| 385 | so->so_snd.ssb_lowat = head->so_snd.ssb_lowat; | |
| 386 | so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo; | |
| 387 | so->so_snd.ssb_timeo = head->so_snd.ssb_timeo; | |
| dbcbe5d3 MD |
388 | so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & |
| 389 | (SSB_AUTOSIZE | SSB_AUTOLOWAT); | |
| 390 | so->so_snd.ssb_flags |= head->so_snd.ssb_flags & | |
| 391 | (SSB_AUTOSIZE | SSB_AUTOLOWAT); | |
| 5217bcbc | 392 | lwkt_getpooltoken(head); /* head's pool token covers the accept queue updates below */ |
| 984263bc MD |
393 | if (connstatus) { |
| 394 | TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); | |
| 6cef7136 | 395 | sosetstate(so, SS_COMP); |
| 984263bc MD |
396 | head->so_qlen++; |
| 397 | } else { | |
| 398 | if (head->so_incqlen > head->so_qlimit) { /* incomplete queue over limit: abort the entry at its front */ | |
| 984263bc | 399 | sp = TAILQ_FIRST(&head->so_incomp); |
| 4402d8a2 MD |
400 | TAILQ_REMOVE(&head->so_incomp, sp, so_list); |
| 401 | head->so_incqlen--; | |
| 6cef7136 | 402 | soclrstate(sp, SS_INCOMP); |
| 4402d8a2 | 403 | sp->so_head = NULL; |
| 9116be8e | 404 | soaborta(sp); |
| 984263bc MD |
405 | } |
| 406 | TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); | |
| 6cef7136 | 407 | sosetstate(so, SS_INCOMP); |
| 984263bc MD |
408 | head->so_incqlen++; |
| 409 | } | |
| 5217bcbc | 410 | lwkt_relpooltoken(head); |
| 984263bc | 411 | if (connstatus) { |
| 5dfe1a1a MD |
412 | /* |
| 413 | * XXX head may be on a different protocol thread. | |
| 414 | * sorwakeup()->sowakeup() is hacked atm. | |
| 415 | */ | |
| 984263bc MD |
416 | sorwakeup(head); |
| 417 | wakeup((caddr_t)&head->so_timeo); | |
| 6cef7136 | 418 | sosetstate(so, connstatus); |
| 984263bc | 419 | } |
| e28d8186 | 420 | soclrstate(so, SS_ASSERTINPROG); |
| 984263bc MD |
421 | return (so); |
| 422 | } | |
| 423 | ||
| 424 | /* | |
| 425 | * Socantsendmore indicates that no more data will be sent on the | |
| 426 | * socket; it would normally be applied to a socket when the user | |
| 427 | * informs the system that no more data is to be sent, by the protocol | |
| 428 | * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data | |
| 429 | * will be received, and will normally be applied to the socket by a | |
| 430 | * protocol when it detects that the peer will send no more data. | |
| 431 | * Data queued for reading in the socket may yet be read. | |
| 432 | */ | |
| 984263bc | 433 | void |
| c972a82f | 434 | socantsendmore(struct socket *so) |
| 984263bc | 435 | { |
| 6cef7136 | 436 | sosetstate(so, SS_CANTSENDMORE); |
| 984263bc MD |
437 | sowwakeup(so); |
| 438 | } | |
| 439 | ||
| 440 | void | |
| c972a82f | 441 | socantrcvmore(struct socket *so) |
| 984263bc | 442 | { |
| 6cef7136 | 443 | sosetstate(so, SS_CANTRCVMORE); |
| 984263bc MD |
444 | sorwakeup(so); |
| 445 | } | |
| 446 | ||
| 447 | /* | |
| b44419cb MD |
448 | * Wakeup processes waiting on a socket buffer. Do asynchronous notification |
| 449 | * via SIGIO if the socket has the SS_ASYNC flag set. | |
| dbcbe5d3 MD |
450 | * |
| 451 | * For users waiting on send/recv try to avoid unnecessary context switch | |
| 452 | * thrashing. Particularly for senders of large buffers (needs to be | |
| 453 | * extended to sel and aio? XXX) | |
| | * | |
| | * ssb must be either &so->so_snd or &so->so_rcv for the WAIT/WAKEUP | |
| | * handshake to fire: the sleeper is only signalled once the low-water | |
| | * mark is satisfied (free space for the send side, queued bytes for | |
| | * the receive side) or the corresponding SS_CANT*MORE state is set. | |
| | * The SIGIO, upcall, aio, kevent and message-event notifications that | |
| | * follow are issued regardless of that condition. | |
| 5dfe1a1a MD |
454 | * |
| 455 | * WARNING! Can be called on a foreign socket from the wrong protocol | |
| 456 | * thread. aka is called on the 'head' listen socket when | |
| 457 | * a new connection comes in. | |
| 984263bc MD |
458 | */ |
| 459 | void | |
| 6d49aa6f | 460 | sowakeup(struct socket *so, struct signalsockbuf *ssb) |
| 984263bc | 461 | { |
| 5b22f1a7 | 462 | struct kqinfo *kqinfo = &ssb->ssb_kq; |
| 14343ad3 MD |
463 | uint32_t flags; |
| 464 | ||
| 465 | /* | |
| 466 | * Check conditions, set the WAKEUP flag, and clear and signal if | |
| 467 | * the WAIT flag is found to be set. This interlocks against the | |
| 468 | * client side. | |
| 469 | */ | |
| 470 | for (;;) { | |
| 471 | flags = ssb->ssb_flags; /* snapshot used as the expected value for the cmpset below */ | |
| 472 | cpu_ccfence(); | |
| c1d0003c | 473 | |
| dbcbe5d3 MD |
474 | if ((ssb == &so->so_snd && ssb_space(ssb) >= ssb->ssb_lowat) || |
| 475 | (ssb == &so->so_rcv && ssb->ssb_cc >= ssb->ssb_lowat) || | |
| 476 | (ssb == &so->so_snd && (so->so_state & SS_CANTSENDMORE)) || | |
| 477 | (ssb == &so->so_rcv && (so->so_state & SS_CANTRCVMORE)) | |
| 478 | ) { | |
| 14343ad3 MD |
479 | if (atomic_cmpset_int(&ssb->ssb_flags, flags, |
| 480 | (flags | SSB_WAKEUP) & ~SSB_WAIT)) { | |
| 481 | if (flags & SSB_WAIT) /* only issue wakeup() if a sleeper had registered */ | |
| 482 | wakeup(&ssb->ssb_cc); | |
| 483 | break; | |
| 484 | } | |
| 485 | } else { | |
| 486 | break; | |
| dbcbe5d3 | 487 | } |
| 984263bc | 488 | } |
| 14343ad3 MD |
489 | |
| 490 | /* | |
| 491 | * Misc other events | |
| 492 | */ | |
| 984263bc MD |
493 | if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) |
| 494 | pgsigio(so->so_sigio, SIGIO, 0); | |
| 6d49aa6f | 495 | if (ssb->ssb_flags & SSB_UPCALL) |
| 74f1caca | 496 | (*so->so_upcall)(so, so->so_upcallarg, MB_DONTWAIT); |
| 6d49aa6f MD |
497 | if (ssb->ssb_flags & SSB_AIO) |
| 498 | aio_swake(so, ssb); | |
| 5b22f1a7 | 499 | KNOTE(&kqinfo->ki_note, 0); |
| 5dfe1a1a MD |
500 | |
| 501 | /* | |
| 502 | * This is a bit of a hack. Multiple threads can wind up scanning | |
| 503 | * ki_mlist concurrently due to the fact that this function can be | |
| 504 | * called on a foreign socket, so we can't afford to block here. | |
| 5217bcbc MD |
505 | * |
| 506 | * We need the pool token for (so) (likely the listen socket if | |
| 507 | * SSB_MEVENT is set) because the predicate function may have | |
| 508 | * to access the accept queue. | |
| | * | |
| | * Each queued message whose predicate is satisfied is removed | |
| | * from ki_mlist and replied to with its own ms_error. SSB_MEVENT | |
| | * is cleared once the list has been drained. | |
| 5dfe1a1a | 509 | */ |
| 6d49aa6f | 510 | if (ssb->ssb_flags & SSB_MEVENT) { |
| c1d0003c JH |
511 | struct netmsg_so_notify *msg, *nmsg; |
| 512 | ||
| e6318d16 | 513 | lwkt_gettoken(&kq_token); |
| 5217bcbc | 514 | lwkt_getpooltoken(so); |
| 5b22f1a7 | 515 | TAILQ_FOREACH_MUTABLE(msg, &kqinfo->ki_mlist, nm_list, nmsg) { |
| 002c1265 | 516 | if (msg->nm_predicate(msg)) { |
| 5b22f1a7 | 517 | TAILQ_REMOVE(&kqinfo->ki_mlist, msg, nm_list); |
| 002c1265 MD |
518 | lwkt_replymsg(&msg->base.lmsg, |
| 519 | msg->base.lmsg.ms_error); | |
| c1d0003c JH |
520 | } |
| 521 | } | |
| 5b22f1a7 | 522 | if (TAILQ_EMPTY(&ssb->ssb_kq.ki_mlist)) /* same list as kqinfo->ki_mlist (kqinfo == &ssb->ssb_kq) */ |
| 14343ad3 | 523 | atomic_clear_int(&ssb->ssb_flags, SSB_MEVENT); |
| 5217bcbc | 524 | lwkt_relpooltoken(so); |
| e6318d16 | 525 | lwkt_reltoken(&kq_token); |
| c1d0003c | 526 | } |
| 984263bc MD |
527 | } |
| 528 | ||
| 529 | /* | |
| 6d49aa6f | 530 | * Socket buffer (struct signalsockbuf) utility routines. |
| 984263bc MD |
531 | * |
| 532 | * Each socket contains two socket buffers: one for sending data and | |
| 533 | * one for receiving data. Each buffer contains a queue of mbufs, | |
| 534 | * information about the number of mbufs and amount of data in the | |
| 5b22f1a7 SG |
535 | * queue, and other fields allowing kevent()/select()/poll() statements |
| 536 | * and notification on data availability to be implemented. | |
| 984263bc MD |
537 | * |
| 538 | * Data stored in a socket buffer is maintained as a list of records. | |
| 539 | * Each record is a list of mbufs chained together with the m_next | |
| 540 | * field. Records are chained together with the m_nextpkt field. The upper | |
| 541 | * level routine soreceive() expects the following conventions to be | |
| 542 | * observed when placing information in the receive buffer: | |
| 543 | * | |
| 544 | * 1. If the protocol requires each message be preceded by the sender's | |
| 545 | * name, then a record containing that name must be present before | |
| 546 | * any associated data (mbuf's must be of type MT_SONAME). | |
| 547 | * 2. If the protocol supports the exchange of ``access rights'' (really | |
| 548 | * just additional data associated with the message), and there are | |
| 549 | * ``rights'' to be received, then a record containing this data | |
| 550 | * should be present (mbuf's must be of type MT_RIGHTS). | |
| 551 | * 3. If a name or rights record exists, then it must be followed by | |
| 552 | * a data record, perhaps of zero length. | |
| 553 | * | |
| 554 | * Before using a new socket structure it is first necessary to reserve | |
| 555 | * buffer space to the socket, by calling ssb_reserve(). This should commit | |
| 556 | * some of the available buffer space in the system buffer pool for the | |
| 557 | * socket (currently, it does nothing but enforce limits). The space | |
| 6d49aa6f | 558 | * should be released by calling ssb_release() when the socket is destroyed. |
| 984263bc | 559 | */ |
| 984263bc | 560 | int |
| e4700d00 | 561 | soreserve(struct socket *so, u_long sndcc, u_long rcvcc, struct rlimit *rl) |
| 984263bc | 562 | { |
| dbcbe5d3 | 563 | if (so->so_snd.ssb_lowat == 0) |
| 14343ad3 | 564 | atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOLOWAT); |
| 6d49aa6f | 565 | if (ssb_reserve(&so->so_snd, sndcc, so, rl) == 0) |
| 984263bc | 566 | goto bad; |
| 6d49aa6f | 567 | if (ssb_reserve(&so->so_rcv, rcvcc, so, rl) == 0) |
| 984263bc | 568 | goto bad2; |
| 6d49aa6f MD |
569 | if (so->so_rcv.ssb_lowat == 0) |
| 570 | so->so_rcv.ssb_lowat = 1; | |
| 571 | if (so->so_snd.ssb_lowat == 0) | |
| 572 | so->so_snd.ssb_lowat = MCLBYTES; | |
| 573 | if (so->so_snd.ssb_lowat > so->so_snd.ssb_hiwat) | |
| 574 | so->so_snd.ssb_lowat = so->so_snd.ssb_hiwat; | |
| 984263bc MD |
575 | return (0); |
| 576 | bad2: | |
| 6d49aa6f | 577 | ssb_release(&so->so_snd, so); |
| 984263bc MD |
578 | bad: |
| 579 | return (ENOBUFS); | |
| 580 | } | |
| 581 | ||
| 582 | static int | |
| 583 | sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) | |
| 584 | { | |
| 585 | int error = 0; | |
| 586 | u_long old_sb_max = sb_max; | |
| 587 | ||
| 588 | error = SYSCTL_OUT(req, arg1, sizeof(int)); | |
| 589 | if (error || !req->newptr) | |
| 590 | return (error); | |
| 591 | error = SYSCTL_IN(req, arg1, sizeof(int)); | |
| 592 | if (error) | |
| 593 | return (error); | |
| 594 | if (sb_max < MSIZE + MCLBYTES) { | |
| 595 | sb_max = old_sb_max; | |
| 596 | return (EINVAL); | |
| 597 | } | |
| 598 | sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); | |
| 599 | return (0); | |
| 600 | } | |
| 601 | ||
| 602 | /* | |
| 6d49aa6f | 603 | * Allot mbufs to a signalsockbuf. |
| ed12ce95 | 604 | * |
| 984263bc MD |
605 | * Attempt to scale mbmax so that mbcnt doesn't become limiting |
| 606 | * if buffering efficiency is near the normal case. | |
| ed12ce95 MD |
607 | * |
| 608 | * sb_max only applies to user-sockets (where rl != NULL). It does | |
| 609 | * not apply to kernel sockets or kernel-controlled sockets. Note | |
| 610 | * that NFS overrides the sockbuf limits created when nfsd creates | |
| 611 | * a socket. | |
| 984263bc MD |
612 | */ |
| 613 | int | |
| 6d49aa6f MD |
614 | ssb_reserve(struct signalsockbuf *ssb, u_long cc, struct socket *so, |
| 615 | struct rlimit *rl) | |
| 984263bc | 616 | { |
| 984263bc | 617 | /* |
| e4700d00 JH |
618 | * rl will only be NULL when we're in an interrupt (eg, in tcp_input) |
| 619 | * or when called from netgraph (ie, ngd_attach) | |
| 984263bc | 620 | */ |
| ed12ce95 | 621 | if (rl && cc > sb_max_adj) |
| 3a6117bb | 622 | cc = sb_max_adj; |
| 6d49aa6f | 623 | if (!chgsbsize(so->so_cred->cr_uidinfo, &ssb->ssb_hiwat, cc, |
| e4700d00 | 624 | rl ? rl->rlim_cur : RLIM_INFINITY)) { |
| 984263bc MD |
625 | return (0); |
| 626 | } | |
| ed12ce95 MD |
627 | if (rl) |
| 628 | ssb->ssb_mbmax = min(cc * sb_efficiency, sb_max); | |
| 629 | else | |
| 630 | ssb->ssb_mbmax = cc * sb_efficiency; | |
| dbcbe5d3 MD |
631 | |
| 632 | /* | |
| 633 | * AUTOLOWAT is set on send buffers and prevents large writes | |
| 634 | * from generating a huge number of context switches. | |
| 635 | */ | |
| 636 | if (ssb->ssb_flags & SSB_AUTOLOWAT) { | |
| 637 | ssb->ssb_lowat = ssb->ssb_hiwat / 2; | |
| 638 | if (ssb->ssb_lowat < MCLBYTES) | |
| 639 | ssb->ssb_lowat = MCLBYTES; | |
| 640 | } | |
| 6d49aa6f MD |
641 | if (ssb->ssb_lowat > ssb->ssb_hiwat) |
| 642 | ssb->ssb_lowat = ssb->ssb_hiwat; | |
| 984263bc MD |
643 | return (1); |
| 644 | } | |
| 645 | ||
| 646 | /* | |
| 647 | * Free mbufs held by a socket, and reserved mbuf space. | |
| 648 | */ | |
| 649 | void | |
| 6d49aa6f | 650 | ssb_release(struct signalsockbuf *ssb, struct socket *so) |
| 984263bc | 651 | { |
| 6d49aa6f MD |
652 | sbflush(&ssb->sb); |
| 653 | (void)chgsbsize(so->so_cred->cr_uidinfo, &ssb->ssb_hiwat, 0, | |
| 984263bc | 654 | RLIM_INFINITY); |
| 6d49aa6f | 655 | ssb->ssb_mbmax = 0; |
| 984263bc MD |
656 | } |
| 657 | ||
| 658 | /* | |
| 659 | * Some routines that return EOPNOTSUPP for entry points that are not | |
| 660 | * supported by a protocol. Fill in as needed. | |
| 661 | */ | |
| 002c1265 MD |
662 | void |
| 663 | pr_generic_notsupp(netmsg_t msg) | |
| a95455e5 | 664 | { |
| 002c1265 | 665 | lwkt_replymsg(&msg->lmsg, EOPNOTSUPP); |
| a95455e5 MD |
666 | } |
| 667 | ||
| 668 | int | |
| 669 | pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, | |
| 670 | struct mbuf *top, struct mbuf *control, int flags, | |
| 671 | struct thread *td) | |
| 672 | { | |
| 673 | if (top) | |
| 674 | m_freem(top); | |
| 675 | if (control) | |
| 676 | m_freem(control); | |
| 677 | return (EOPNOTSUPP); | |
| 678 | } | |
| 679 | ||
| 680 | int | |
| 681 | pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, | |
| 682 | struct uio *uio, struct sockbuf *sio, | |
| 683 | struct mbuf **controlp, int *flagsp) | |
| 684 | { | |
| 685 | return (EOPNOTSUPP); | |
| 686 | } | |
| 687 | ||
| 984263bc MD |
688 | /* |
| 689 | * This isn't really a ``null'' operation, but it's the default one | |
| 690 | * and doesn't do anything destructive. | |
| 691 | */ | |
| 002c1265 MD |
692 | void |
| 693 | pru_sense_null(netmsg_t msg) | |
| 984263bc | 694 | { |
| 002c1265 MD |
695 | msg->sense.nm_stat->st_blksize = msg->base.nm_so->so_snd.ssb_hiwat; |
| 696 | lwkt_replymsg(&msg->lmsg, 0); | |
| 984263bc MD |
697 | } |
| 698 | ||
| 699 | /* | |
| cfa2ba21 MD |
700 | * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. Callers |
| 701 | * of this routine assume that it always succeeds, so we have to use a | |
| 702 | * blockable allocation even though we might be called from a critical thread. | |
| 984263bc MD |
703 | */ |
| 704 | struct sockaddr * | |
| 590b8cd4 | 705 | dup_sockaddr(const struct sockaddr *sa) |
| 984263bc MD |
706 | { |
| 707 | struct sockaddr *sa2; | |
| 708 | ||
| efda3bd0 | 709 | sa2 = kmalloc(sa->sa_len, M_SONAME, M_INTWAIT); |
| cfa2ba21 MD |
710 | bcopy(sa, sa2, sa->sa_len); |
| 711 | return (sa2); | |
| 984263bc MD |
712 | } |
| 713 | ||
| 714 | /* | |
| 715 | * Create an external-format (``xsocket'') structure using the information | |
| 716 | * in the kernel-format socket structure pointed to by so. This is done | |
| 717 | * to reduce the spew of irrelevant information over this interface, | |
| 718 | * to isolate user code from changes in the kernel structure, and | |
| 719 | * potentially to provide information-hiding if we decide that | |
| 720 | * some of this information should be hidden from users. | |
| 721 | */ | |
| 722 | void | |
| 723 | sotoxsocket(struct socket *so, struct xsocket *xso) | |
| 724 | { | |
| 725 | xso->xso_len = sizeof *xso; | |
| 726 | xso->xso_so = so; | |
| 727 | xso->so_type = so->so_type; | |
| 728 | xso->so_options = so->so_options; | |
| 729 | xso->so_linger = so->so_linger; | |
| 730 | xso->so_state = so->so_state; | |
| 731 | xso->so_pcb = so->so_pcb; | |
| 732 | xso->xso_protocol = so->so_proto->pr_protocol; | |
| 733 | xso->xso_family = so->so_proto->pr_domain->dom_family; | |
| 734 | xso->so_qlen = so->so_qlen; | |
| 735 | xso->so_incqlen = so->so_incqlen; | |
| 736 | xso->so_qlimit = so->so_qlimit; | |
| 737 | xso->so_timeo = so->so_timeo; | |
| 738 | xso->so_error = so->so_error; | |
| 739 | xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; | |
| 740 | xso->so_oobmark = so->so_oobmark; | |
| 6d49aa6f MD |
741 | ssbtoxsockbuf(&so->so_snd, &xso->so_snd); |
| 742 | ssbtoxsockbuf(&so->so_rcv, &xso->so_rcv); | |
| 984263bc MD |
743 | xso->so_uid = so->so_cred->cr_uid; |
| 744 | } | |
| 745 | ||
| 746 | /* | |
| 984263bc MD |
747 | * Here is the definition of some of the basic objects in the kern.ipc |
| 748 | * branch of the MIB. | |
| 749 | */ | |
| 750 | SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); | |
| 751 | ||
| ed12ce95 MD |
752 | /* |
| 753 | * This takes the place of kern.maxsockbuf, which moved to kern.ipc. | |
| 754 | * | |
| 755 | * NOTE! sb_max only applies to user-created socket buffers. | |
| | * | |
| | * The dummy variable only exists to back the KERN_DUMMY placeholder | |
| | * OID; nothing else in this file reads or writes it. | |
| 756 | */ | |
| 984263bc MD |
757 | static int dummy; |
| 758 | SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); | |
| 759 | SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT|CTLFLAG_RW, | |
| 760 | &sb_max, 0, sysctl_handle_sb_max, "I", "Maximum socket buffer size"); /* NOTE(review): exported as CTLTYPE_INT although sb_max is declared u_long; confirm the handler copes with the width difference */ | |
| 761 | SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, | |
| 0ca0cd25 | 762 | &maxsockets, 0, "Maximum number of sockets available"); |
| 984263bc | 763 | SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, |
| 093e85dc SG |
764 | &sb_efficiency, 0, |
| 765 | "Socket buffer limit scaler"); /* NOTE(review): sb_efficiency is declared u_long but exported through SYSCTL_INT; confirm this is intended on LP64 */ | |
| 984263bc MD |
766 | |
| 767 | /* | |
| ba39e2e0 | 768 | * Initialize maxsockets |
| 984263bc | 769 | */ |
| c972a82f SW |
770 | static void |
| 771 | init_maxsockets(void *ignored) | |
| 984263bc MD |
772 | { |
| 773 | TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); | |
| 774 | maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); | |
| 775 | } | |
| ba39e2e0 MD |
776 | SYSINIT(param, SI_BOOT1_TUNABLES, SI_ORDER_ANY, |
| 777 | init_maxsockets, NULL); | |
| 778 |