accept: Implement fast soaccept predication
[dragonfly.git] / sys / kern / uipc_socket.c
CommitLineData
984263bc 1/*
6ea1e9b9 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved.
66d6c637
JH
3 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 *
5 * This code is derived from software contributed to The DragonFly Project
6 * by Jeffrey M. Hsu.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of The DragonFly Project nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific, prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34/*
984263bc
MD
35 * Copyright (c) 1982, 1986, 1988, 1990, 1993
36 * The Regents of the University of California. All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
7405c902 67 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
984263bc
MD
68 */
69
70#include "opt_inet.h"
78812139 71#include "opt_sctp.h"
984263bc
MD
72
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/fcntl.h>
76#include <sys/malloc.h>
77#include <sys/mbuf.h>
78#include <sys/domain.h>
79#include <sys/file.h> /* for struct knote */
80#include <sys/kernel.h>
984263bc 81#include <sys/event.h>
984263bc
MD
82#include <sys/proc.h>
83#include <sys/protosw.h>
84#include <sys/socket.h>
85#include <sys/socketvar.h>
6b6e0885 86#include <sys/socketops.h>
984263bc
MD
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/uio.h>
91#include <sys/jail.h>
92#include <vm/vm_zone.h>
e71a125f 93#include <vm/pmap.h>
acd31a69 94#include <net/netmsg2.h>
984263bc 95
e43a034f 96#include <sys/thread2.h>
d6cb521d 97#include <sys/socketvar2.h>
e43a034f 98
984263bc
MD
99#include <machine/limits.h>
100
/* Tunables defined outside this file; read by the TCP send path. */
extern int	tcp_sosnd_agglim;
extern int	tcp_sosnd_async;

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so,
		    struct sockopt *sopt);
#endif /* INET */

/* knote filter callbacks referenced by the filterops tables. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/* Helpers used by the socket close path. */
static void	sodiscard(struct socket *so);
static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);
117
984263bc 118static struct filterops solisten_filtops =
8d1b9f93 119 { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
984263bc 120static struct filterops soread_filtops =
8d1b9f93 121 { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
984263bc 122static struct filterops sowrite_filtops =
8d1b9f93 123 { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
73c344d3 124static struct filterops soexcept_filtops =
8d1b9f93 125 { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
984263bc 126
69ea5b8d 127MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
984263bc
MD
128MALLOC_DEFINE(M_SONAME, "soname", "socket name");
129MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
130
984263bc
MD
131
132static int somaxconn = SOMAXCONN;
133SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
134 &somaxconn, 0, "Maximum pending socket connection queue size");
135
acd31a69
SZ
136static int use_soclose_fast = 1;
137SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
138 &use_soclose_fast, 0, "Fast socket close");
139
5e4b3994
SZ
140int use_soaccept_pred_fast = 1;
141SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
142 &use_soaccept_pred_fast, 0, "Fast socket accept predication");
143
984263bc
MD
144/*
145 * Socket operation routines.
146 * These routines are called by the routines in
147 * sys_socket.c or from a system process, and
148 * implement the semantics of socket operations by
149 * switching out to the protocol specific routines.
150 */
151
152/*
69ea5b8d 153 * Get a socket structure, and initialize it.
984263bc
MD
154 * Note that it would probably be better to allocate socket
155 * and PCB at the same time, but I'm not convinced that all
156 * the protocols can be easily modified to do this.
157 */
158struct socket *
c972a82f 159soalloc(int waitok)
984263bc
MD
160{
161 struct socket *so;
69ea5b8d 162 unsigned waitmask;
984263bc 163
69ea5b8d
NT
164 waitmask = waitok ? M_WAITOK : M_NOWAIT;
165 so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
984263bc
MD
166 if (so) {
167 /* XXX race condition for reentrant kernel */
984263bc 168 TAILQ_INIT(&so->so_aiojobq);
5b22f1a7
SG
169 TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
170 TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
a3c18566
MD
171 lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
172 lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
6cef7136
MD
173 so->so_state = SS_NOFDREF;
174 so->so_refs = 1;
984263bc
MD
175 }
176 return so;
177}
178
179int
dadab5e9
MD
180socreate(int dom, struct socket **aso, int type,
181 int proto, struct thread *td)
984263bc 182{
dadab5e9
MD
183 struct proc *p = td->td_proc;
184 struct protosw *prp;
185 struct socket *so;
e4700d00 186 struct pru_attach_info ai;
dadab5e9 187 int error;
984263bc
MD
188
189 if (proto)
190 prp = pffindproto(dom, proto, type);
191 else
192 prp = pffindtype(dom, type);
193
194 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
195 return (EPROTONOSUPPORT);
196
41c20dac 197 if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
984263bc
MD
198 prp->pr_domain->dom_family != PF_LOCAL &&
199 prp->pr_domain->dom_family != PF_INET &&
3e4150ef 200 prp->pr_domain->dom_family != PF_INET6 &&
984263bc
MD
201 prp->pr_domain->dom_family != PF_ROUTE) {
202 return (EPROTONOSUPPORT);
203 }
204
205 if (prp->pr_type != type)
206 return (EPROTOTYPE);
207 so = soalloc(p != 0);
6cef7136 208 if (so == NULL)
984263bc
MD
209 return (ENOBUFS);
210
48e7b118 211 /*
6cef7136
MD
212 * Callers of socreate() presumably will connect up a descriptor
213 * and call soclose() if they cannot. This represents our so_refs
214 * (which should be 1) from soalloc().
215 */
216 soclrstate(so, SS_NOFDREF);
217
218 /*
48e7b118
MD
219 * Set a default port for protocol processing. No action will occur
220 * on the socket on this port until an inpcb is attached to it and
221 * is able to match incoming packets, or until the socket becomes
222 * available to userland.
002c1265
MD
223 *
224 * We normally default the socket to the protocol thread on cpu 0.
225 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
226 * thread and all pr_*()/pru_*() calls are executed synchronously.
48e7b118 227 */
002c1265
MD
228 if (prp->pr_flags & PR_SYNC_PORT)
229 so->so_port = &netisr_sync_port;
230 else
231 so->so_port = cpu_portfn(0);
48e7b118 232
984263bc
MD
233 TAILQ_INIT(&so->so_incomp);
234 TAILQ_INIT(&so->so_comp);
235 so->so_type = type;
e9a372eb 236 so->so_cred = crhold(p->p_ucred);
984263bc 237 so->so_proto = prp;
e4700d00
JH
238 ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
239 ai.p_ucred = p->p_ucred;
240 ai.fd_rdir = p->p_fd->fd_rdir;
48e7b118 241
5b0b9fa5
PA
242 /*
243 * Auto-sizing of socket buffers is managed by the protocols and
244 * the appropriate flags must be set in the pru_attach function.
245 */
e4700d00 246 error = so_pru_attach(so, proto, &ai);
984263bc 247 if (error) {
6cef7136
MD
248 sosetstate(so, SS_NOFDREF);
249 sofree(so); /* from soalloc */
250 return error;
984263bc 251 }
48e7b118 252
6cef7136
MD
253 /*
254 * NOTE: Returns referenced socket.
255 */
984263bc
MD
256 *aso = so;
257 return (0);
258}
259
/*
 * Bind a name to a socket.  Thin wrapper that hands the request to
 * so_pru_bind() and returns its result unchanged.
 */
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	return (so_pru_bind(so, nam, td));
}
268
6cef7136 269static void
dadab5e9 270sodealloc(struct socket *so)
984263bc 271{
6d49aa6f 272 if (so->so_rcv.ssb_hiwat)
984263bc 273 (void)chgsbsize(so->so_cred->cr_uidinfo,
6d49aa6f
MD
274 &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
275 if (so->so_snd.ssb_hiwat)
984263bc 276 (void)chgsbsize(so->so_cred->cr_uidinfo,
6d49aa6f 277 &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
984263bc 278#ifdef INET
81d59d3d
HP
279 /* remove accept filter if present */
280 if (so->so_accf != NULL)
281 do_setopt_accept_filter(so, NULL);
984263bc
MD
282#endif /* INET */
283 crfree(so->so_cred);
69ea5b8d 284 kfree(so, M_SOCKET);
984263bc
MD
285}
286
287int
dadab5e9 288solisten(struct socket *so, int backlog, struct thread *td)
984263bc 289{
e43a034f 290 int error;
78812139
EN
291#ifdef SCTP
292 short oldopt, oldqlimit;
293#endif /* SCTP */
984263bc 294
6cef7136 295 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
78812139 296 return (EINVAL);
78812139
EN
297
298#ifdef SCTP
299 oldopt = so->so_options;
300 oldqlimit = so->so_qlimit;
301#endif /* SCTP */
302
6cef7136 303 lwkt_gettoken(&so->so_rcv.ssb_token);
984263bc
MD
304 if (TAILQ_EMPTY(&so->so_comp))
305 so->so_options |= SO_ACCEPTCONN;
6cef7136 306 lwkt_reltoken(&so->so_rcv.ssb_token);
984263bc
MD
307 if (backlog < 0 || backlog > somaxconn)
308 backlog = somaxconn;
309 so->so_qlimit = backlog;
78812139
EN
310 /* SCTP needs to look at tweak both the inbound backlog parameter AND
311 * the so_options (UDP model both connect's and gets inbound
312 * connections .. implicitly).
313 */
314 error = so_pru_listen(so, td);
315 if (error) {
316#ifdef SCTP
317 /* Restore the params */
318 so->so_options = oldopt;
319 so->so_qlimit = oldqlimit;
320#endif /* SCTP */
78812139
EN
321 return (error);
322 }
984263bc
MD
323 return (0);
324}
325
4402d8a2
MD
326/*
327 * Destroy a disconnected socket. This routine is a NOP if entities
328 * still have a reference on the socket:
329 *
330 * so_pcb - The protocol stack still has a reference
331 * SS_NOFDREF - There is no longer a file pointer reference
4402d8a2 332 */
984263bc 333void
dadab5e9 334sofree(struct socket *so)
984263bc 335{
5217bcbc
MD
336 struct socket *head;
337
338 /*
339 * This is a bit hackish at the moment. We need to interlock
340 * any accept queue we are on before we potentially lose the
341 * last reference to avoid races against a re-reference from
342 * someone operating on the queue.
343 */
344 while ((head = so->so_head) != NULL) {
345 lwkt_getpooltoken(head);
346 if (so->so_head == head)
347 break;
348 lwkt_relpooltoken(head);
349 }
984263bc 350
6cef7136
MD
351 /*
352 * Arbitrage the last free.
353 */
354 KKASSERT(so->so_refs > 0);
5217bcbc
MD
355 if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
356 if (head)
357 lwkt_relpooltoken(head);
4402d8a2 358 return;
5217bcbc 359 }
6cef7136
MD
360
361 KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
e28d8186 362 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
6cef7136
MD
363
364 /*
5217bcbc
MD
365 * We're done, remove ourselves from the accept queue we are
366 * on, if we are on one.
6cef7136 367 */
984263bc
MD
368 if (head != NULL) {
369 if (so->so_state & SS_INCOMP) {
370 TAILQ_REMOVE(&head->so_incomp, so, so_list);
371 head->so_incqlen--;
372 } else if (so->so_state & SS_COMP) {
373 /*
374 * We must not decommission a socket that's
375 * on the accept(2) queue. If we do, then
376 * accept(2) may hang after select(2) indicated
377 * that the listening socket was ready.
378 */
5217bcbc 379 lwkt_relpooltoken(head);
984263bc
MD
380 return;
381 } else {
382 panic("sofree: not queued");
383 }
6cef7136 384 soclrstate(so, SS_INCOMP);
984263bc 385 so->so_head = NULL;
5217bcbc 386 lwkt_relpooltoken(head);
984263bc 387 }
6d49aa6f 388 ssb_release(&so->so_snd, so);
984263bc
MD
389 sorflush(so);
390 sodealloc(so);
391}
392
393/*
394 * Close a socket on last file table reference removal.
395 * Initiate disconnect if connected.
396 * Free socket when disconnect complete.
397 */
398int
9ba76b73 399soclose(struct socket *so, int fflag)
984263bc 400{
acd31a69 401 int error;
984263bc 402
58c2553a 403 funsetown(&so->so_sigio);
acd31a69
SZ
404 if (!use_soclose_fast ||
405 (so->so_proto->pr_flags & PR_SYNC_PORT) ||
406 (so->so_options & SO_LINGER)) {
407 error = soclose_sync(so, fflag);
408 } else {
409 soclose_fast(so);
410 error = 0;
411 }
412 return error;
413}
414
415static void
416sodiscard(struct socket *so)
417{
418 lwkt_getpooltoken(so);
419 if (so->so_options & SO_ACCEPTCONN) {
420 struct socket *sp;
421
422 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
423 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
424 soclrstate(sp, SS_INCOMP);
425 sp->so_head = NULL;
426 so->so_incqlen--;
427 soaborta(sp);
428 }
429 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
430 TAILQ_REMOVE(&so->so_comp, sp, so_list);
431 soclrstate(sp, SS_COMP);
432 sp->so_head = NULL;
433 so->so_qlen--;
434 soaborta(sp);
435 }
436 }
437 lwkt_relpooltoken(so);
438
439 if (so->so_state & SS_NOFDREF)
440 panic("soclose: NOFDREF");
441 sosetstate(so, SS_NOFDREF); /* take ref */
442}
443
444static int
445soclose_sync(struct socket *so, int fflag)
446{
447 int error = 0;
448
19be7d32 449 if (so->so_pcb == NULL)
984263bc
MD
450 goto discard;
451 if (so->so_state & SS_ISCONNECTED) {
452 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
453 error = sodisconnect(so);
454 if (error)
455 goto drop;
456 }
457 if (so->so_options & SO_LINGER) {
458 if ((so->so_state & SS_ISDISCONNECTING) &&
9ba76b73 459 (fflag & FNONBLOCK))
984263bc
MD
460 goto drop;
461 while (so->so_state & SS_ISCONNECTED) {
6cef7136
MD
462 error = tsleep(&so->so_timeo, PCATCH,
463 "soclos", so->so_linger * hz);
984263bc
MD
464 if (error)
465 break;
466 }
467 }
468 }
469drop:
470 if (so->so_pcb) {
6b6e0885
JH
471 int error2;
472
473 error2 = so_pru_detach(so);
984263bc
MD
474 if (error == 0)
475 error = error2;
476 }
477discard:
acd31a69
SZ
478 sodiscard(so);
479 so_pru_sync(so); /* unpend async sending */
480 sofree(so); /* dispose of ref */
19be7d32 481
acd31a69
SZ
482 return (error);
483}
484
485static void
486soclose_sofree_async_handler(netmsg_t msg)
487{
488 sofree(msg->base.nm_so);
489}
490
491static void
492soclose_sofree_async(struct socket *so)
493{
494 struct netmsg_base *base = &so->so_clomsg;
495
496 netmsg_init(base, so, &netisr_apanic_rport, 0,
497 soclose_sofree_async_handler);
498 lwkt_sendmsg(so->so_port, &base->lmsg);
499}
500
501static void
502soclose_disconn_async_handler(netmsg_t msg)
503{
504 struct socket *so = msg->base.nm_so;
505
506 if ((so->so_state & SS_ISCONNECTED) &&
507 (so->so_state & SS_ISDISCONNECTING) == 0)
508 so_pru_disconnect_direct(so);
509
510 if (so->so_pcb)
511 so_pru_detach_direct(so);
512
513 sodiscard(so);
514 sofree(so);
515}
516
517static void
518soclose_disconn_async(struct socket *so)
519{
520 struct netmsg_base *base = &so->so_clomsg;
521
522 netmsg_init(base, so, &netisr_apanic_rport, 0,
523 soclose_disconn_async_handler);
524 lwkt_sendmsg(so->so_port, &base->lmsg);
525}
526
527static void
528soclose_detach_async_handler(netmsg_t msg)
529{
530 struct socket *so = msg->base.nm_so;
531
532 if (so->so_pcb)
533 so_pru_detach_direct(so);
534
535 sodiscard(so);
536 sofree(so);
537}
538
539static void
540soclose_detach_async(struct socket *so)
541{
542 struct netmsg_base *base = &so->so_clomsg;
543
544 netmsg_init(base, so, &netisr_apanic_rport, 0,
545 soclose_detach_async_handler);
546 lwkt_sendmsg(so->so_port, &base->lmsg);
547}
548
549static void
550soclose_fast(struct socket *so)
551{
552 if (so->so_pcb == NULL)
553 goto discard;
554
555 if ((so->so_state & SS_ISCONNECTED) &&
556 (so->so_state & SS_ISDISCONNECTING) == 0) {
557 soclose_disconn_async(so);
558 return;
19be7d32 559 }
939c81e4 560
acd31a69
SZ
561 if (so->so_pcb) {
562 soclose_detach_async(so);
563 return;
564 }
565
566discard:
567 sodiscard(so);
568 soclose_sofree_async(so);
984263bc
MD
569}
570
/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 *
 * A reference is taken on the socket before the request is passed to
 * so_pru_abort().
 */
void
soabort(struct socket *so)
{
	soreference(so);
	so_pru_abort(so);
}
984263bc 581
4402d8a2
MD
/*
 * Variant of soabort() that hands the abort to so_pru_aborta() after
 * taking a reference on the socket.
 */
void
soaborta(struct socket *so)
{
	soreference(so);
	so_pru_aborta(so);
}
588
fd86a41c
SZ
/*
 * Variant of soabort() that hands the abort to so_pru_abort_oncpu()
 * after taking a reference on the socket.
 */
void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_oncpu(so);
}
595
c19fdb0e
MD
596/*
597 * so is passed in ref'd, which becomes owned by
598 * the cleared SS_NOFDREF flag.
599 */
984263bc 600int
dadab5e9 601soaccept(struct socket *so, struct sockaddr **nam)
984263bc 602{
984263bc
MD
603 int error;
604
605 if ((so->so_state & SS_NOFDREF) == 0)
606 panic("soaccept: !NOFDREF");
6cef7136 607 soclrstate(so, SS_NOFDREF); /* owned by lack of SS_NOFDREF */
002c1265 608 error = so_pru_accept_direct(so, nam);
984263bc
MD
609 return (error);
610}
611
612int
dadab5e9 613soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
984263bc 614{
984263bc
MD
615 int error;
616
617 if (so->so_options & SO_ACCEPTCONN)
618 return (EOPNOTSUPP);
984263bc
MD
619 /*
620 * If protocol is connection-based, can only connect once.
621 * Otherwise, if connected, try to disconnect first.
622 * This allows user to disconnect by connecting to, e.g.,
623 * a null address.
624 */
625 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
626 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
59429d28 627 (error = sodisconnect(so)))) {
984263bc 628 error = EISCONN;
59429d28
MD
629 } else {
630 /*
631 * Prevent accumulated error from previous connection
632 * from biting us.
633 */
634 so->so_error = 0;
6b6e0885 635 error = so_pru_connect(so, nam, td);
59429d28 636 }
984263bc
MD
637 return (error);
638}
639
/*
 * Connect two sockets to each other.  Thin wrapper that returns the
 * result of so_pru_connect2() unchanged.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	return (so_pru_connect2(so1, so2));
}
648
649int
dadab5e9 650sodisconnect(struct socket *so)
984263bc 651{
984263bc
MD
652 int error;
653
654 if ((so->so_state & SS_ISCONNECTED) == 0) {
655 error = ENOTCONN;
656 goto bad;
657 }
658 if (so->so_state & SS_ISDISCONNECTING) {
659 error = EALREADY;
660 goto bad;
661 }
6b6e0885 662 error = so_pru_disconnect(so);
984263bc 663bad:
984263bc
MD
664 return (error);
665}
666
667#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
668/*
669 * Send on a socket.
670 * If send must go all at once and message is larger than
671 * send buffering, then hard error.
672 * Lock against other senders.
673 * If must go all at once and not enough room now, then
674 * inform user that this would block and do nothing.
675 * Otherwise, if nonblocking, send as much as possible.
676 * The data to be sent is described by "uio" if nonzero,
677 * otherwise by the mbuf chain "top" (which must be null
678 * if uio is not). Data provided in mbuf chain must be small
679 * enough to send all at once.
680 *
681 * Returns nonzero on error, timeout or signal; callers
682 * must check for short counts if EINTR/ERESTART are returned.
683 * Data and control buffers are freed on return.
684 */
685int
dadab5e9
MD
686sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
687 struct mbuf *top, struct mbuf *control, int flags,
688 struct thread *td)
984263bc
MD
689{
690 struct mbuf **mp;
dadab5e9 691 struct mbuf *m;
e54488bb
MD
692 size_t resid;
693 int space, len;
e43a034f 694 int clen = 0, error, dontroute, mlen;
984263bc 695 int atomic = sosendallatonce(so) || top;
6b6e0885 696 int pru_flags;
984263bc 697
5bd48c1d 698 if (uio) {
984263bc 699 resid = uio->uio_resid;
5bd48c1d 700 } else {
e54488bb 701 resid = (size_t)top->m_pkthdr.len;
5bd48c1d
MD
702#ifdef INVARIANTS
703 len = 0;
704 for (m = top; m; m = m->m_next)
705 len += m->m_len;
706 KKASSERT(top->m_pkthdr.len == len);
707#endif
708 }
48e7b118 709
984263bc 710 /*
e54488bb
MD
711 * WARNING! resid is unsigned, space and len are signed. space
712 * can wind up negative if the sockbuf is overcommitted.
984263bc
MD
713 *
714 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
715 * type sockets since that's an error.
716 */
e54488bb 717 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
984263bc
MD
718 error = EINVAL;
719 goto out;
720 }
721
722 dontroute =
723 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
724 (so->so_proto->pr_flags & PR_ATOMIC);
fde7ac71
SS
725 if (td->td_lwp != NULL)
726 td->td_lwp->lwp_ru.ru_msgsnd++;
984263bc
MD
727 if (control)
728 clen = control->m_len;
6cef7136 729#define gotoerr(errcode) { error = errcode; goto release; }
984263bc
MD
730
731restart:
6d49aa6f 732 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
984263bc
MD
733 if (error)
734 goto out;
48e7b118 735
984263bc 736 do {
984263bc 737 if (so->so_state & SS_CANTSENDMORE)
6ea1e9b9 738 gotoerr(EPIPE);
984263bc
MD
739 if (so->so_error) {
740 error = so->so_error;
741 so->so_error = 0;
984263bc
MD
742 goto release;
743 }
744 if ((so->so_state & SS_ISCONNECTED) == 0) {
745 /*
746 * `sendto' and `sendmsg' is allowed on a connection-
747 * based socket if it supports implied connect.
748 * Return ENOTCONN if not connected and no address is
749 * supplied.
750 */
751 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
752 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
753 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
754 !(resid == 0 && clen != 0))
6ea1e9b9 755 gotoerr(ENOTCONN);
984263bc 756 } else if (addr == 0)
6ea1e9b9 757 gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
984263bc
MD
758 ENOTCONN : EDESTADDRREQ);
759 }
3a6117bb
MD
760 if ((atomic && resid > so->so_snd.ssb_hiwat) ||
761 clen > so->so_snd.ssb_hiwat) {
762 gotoerr(EMSGSIZE);
763 }
6d49aa6f 764 space = ssb_space(&so->so_snd);
984263bc
MD
765 if (flags & MSG_OOB)
766 space += 1024;
e54488bb 767 if ((space < 0 || (size_t)space < resid + clen) && uio &&
6d49aa6f 768 (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
9ba76b73 769 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
6ea1e9b9 770 gotoerr(EWOULDBLOCK);
6d49aa6f
MD
771 ssb_unlock(&so->so_snd);
772 error = ssb_wait(&so->so_snd);
984263bc
MD
773 if (error)
774 goto out;
775 goto restart;
776 }
984263bc
MD
777 mp = &top;
778 space -= clen;
779 do {
780 if (uio == NULL) {
781 /*
782 * Data is prepackaged in "top".
783 */
784 resid = 0;
785 if (flags & MSG_EOR)
786 top->m_flags |= M_EOR;
787 } else do {
e54488bb
MD
788 if (resid > INT_MAX)
789 resid = INT_MAX;
790 m = m_getl((int)resid, MB_WAIT, MT_DATA,
50503f0f
JH
791 top == NULL ? M_PKTHDR : 0, &mlen);
792 if (top == NULL) {
984263bc 793 m->m_pkthdr.len = 0;
60233e58 794 m->m_pkthdr.rcvif = NULL;
984263bc 795 }
e54488bb 796 len = imin((int)szmin(mlen, resid), space);
50503f0f 797 if (resid < MINCLSIZE) {
984263bc
MD
798 /*
799 * For datagram protocols, leave room
800 * for protocol headers in first mbuf.
801 */
802 if (atomic && top == 0 && len < mlen)
803 MH_ALIGN(m, len);
804 }
805 space -= len;
e54488bb 806 error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
984263bc
MD
807 resid = uio->uio_resid;
808 m->m_len = len;
809 *mp = m;
810 top->m_pkthdr.len += len;
811 if (error)
812 goto release;
813 mp = &m->m_next;
e54488bb 814 if (resid == 0) {
984263bc
MD
815 if (flags & MSG_EOR)
816 top->m_flags |= M_EOR;
817 break;
818 }
819 } while (space > 0 && atomic);
820 if (dontroute)
821 so->so_options |= SO_DONTROUTE;
6b6e0885
JH
822 if (flags & MSG_OOB) {
823 pru_flags = PRUS_OOB;
824 } else if ((flags & MSG_EOF) &&
825 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
e54488bb 826 (resid == 0)) {
6b6e0885
JH
827 /*
828 * If the user set MSG_EOF, the protocol
829 * understands this flag and nothing left to
830 * send then use PRU_SEND_EOF instead of PRU_SEND.
831 */
832 pru_flags = PRUS_EOF;
833 } else if (resid > 0 && space > 0) {
834 /* If there is more to send, set PRUS_MORETOCOME */
835 pru_flags = PRUS_MORETOCOME;
836 } else {
837 pru_flags = 0;
838 }
984263bc
MD
839 /*
840 * XXX all the SS_CANTSENDMORE checks previously
841 * done could be out of date. We could have recieved
842 * a reset packet in an interrupt or maybe we slept
843 * while doing page faults in uiomove() etc. We could
844 * probably recheck again inside the splnet() protection
845 * here, but there are probably other places that this
846 * also happens. We must rethink this.
847 */
6b6e0885 848 error = so_pru_send(so, pru_flags, top, addr, control, td);
984263bc
MD
849 if (dontroute)
850 so->so_options &= ~SO_DONTROUTE;
851 clen = 0;
852 control = 0;
e28d8186 853 top = NULL;
984263bc
MD
854 mp = &top;
855 if (error)
6b6e0885 856 goto release;
984263bc
MD
857 } while (resid && space > 0);
858 } while (resid);
859
860release:
6d49aa6f 861 ssb_unlock(&so->so_snd);
984263bc
MD
862out:
863 if (top)
864 m_freem(top);
865 if (control)
866 m_freem(control);
867 return (error);
868}
869
870/*
6ea1e9b9
JH
871 * A specialization of sosend() for UDP based on protocol-specific knowledge:
872 * so->so_proto->pr_flags has the PR_ATOMIC field set. This means that
873 * sosendallatonce() returns true,
874 * the "atomic" variable is true,
875 * and sosendudp() blocks until space is available for the entire send.
876 * so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
877 * PR_IMPLOPCL flags set.
878 * UDP has no out-of-band data.
879 * UDP has no control data.
880 * UDP does not support MSG_EOR.
881 */
882int
883sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
884 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
885{
6ea1e9b9 886 boolean_t dontroute; /* temporary SO_DONTROUTE setting */
e54488bb
MD
887 size_t resid;
888 int error;
889 int space;
6ea1e9b9 890
fde7ac71
SS
891 if (td->td_lwp != NULL)
892 td->td_lwp->lwp_ru.ru_msgsnd++;
6ea1e9b9
JH
893 if (control)
894 m_freem(control);
895
896 KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
e54488bb 897 resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;
6ea1e9b9
JH
898
899restart:
6d49aa6f 900 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
6ea1e9b9
JH
901 if (error)
902 goto out;
903
6ea1e9b9
JH
904 if (so->so_state & SS_CANTSENDMORE)
905 gotoerr(EPIPE);
906 if (so->so_error) {
907 error = so->so_error;
908 so->so_error = 0;
6ea1e9b9
JH
909 goto release;
910 }
911 if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
912 gotoerr(EDESTADDRREQ);
6d49aa6f 913 if (resid > so->so_snd.ssb_hiwat)
6ea1e9b9 914 gotoerr(EMSGSIZE);
e54488bb
MD
915 space = ssb_space(&so->so_snd);
916 if (uio && (space < 0 || (size_t)space < resid)) {
9ba76b73 917 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
6ea1e9b9 918 gotoerr(EWOULDBLOCK);
6d49aa6f
MD
919 ssb_unlock(&so->so_snd);
920 error = ssb_wait(&so->so_snd);
6ea1e9b9
JH
921 if (error)
922 goto out;
923 goto restart;
924 }
6ea1e9b9
JH
925
926 if (uio) {
e12241e1 927 top = m_uiomove(uio);
6ea1e9b9
JH
928 if (top == NULL)
929 goto release;
930 }
931
932 dontroute = (flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE);
933 if (dontroute)
934 so->so_options |= SO_DONTROUTE;
935
936 error = so_pru_send(so, 0, top, addr, NULL, td);
937 top = NULL; /* sent or freed in lower layer */
938
939 if (dontroute)
940 so->so_options &= ~SO_DONTROUTE;
941
942release:
6d49aa6f 943 ssb_unlock(&so->so_snd);
6ea1e9b9
JH
944out:
945 if (top)
946 m_freem(top);
947 return (error);
948}
949
5bc42dd1
SZ
950int
951sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
952 struct mbuf *top, struct mbuf *control, int flags,
953 struct thread *td)
954{
955 struct mbuf **mp;
956 struct mbuf *m;
957 size_t resid;
958 int space, len;
959 int error, mlen;
960 int allatonce;
961 int pru_flags;
962
963 if (uio) {
964 KKASSERT(top == NULL);
965 allatonce = 0;
966 resid = uio->uio_resid;
967 } else {
968 allatonce = 1;
969 resid = (size_t)top->m_pkthdr.len;
970#ifdef INVARIANTS
971 len = 0;
972 for (m = top; m; m = m->m_next)
973 len += m->m_len;
974 KKASSERT(top->m_pkthdr.len == len);
975#endif
976 }
977
978 /*
979 * WARNING! resid is unsigned, space and len are signed. space
980 * can wind up negative if the sockbuf is overcommitted.
981 *
982 * Also check to make sure that MSG_EOR isn't used on TCP
983 */
984 if (flags & MSG_EOR) {
985 error = EINVAL;
986 goto out;
987 }
988
989 if (control) {
990 /* TCP doesn't do control messages (rights, creds, etc) */
991 if (control->m_len) {
992 error = EINVAL;
993 goto out;
994 }
995 m_freem(control); /* empty control, just free it */
996 control = NULL;
997 }
998
999 if (td->td_lwp != NULL)
1000 td->td_lwp->lwp_ru.ru_msgsnd++;
1001
1002#define gotoerr(errcode) { error = errcode; goto release; }
1003
1004restart:
1005 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
1006 if (error)
1007 goto out;
1008
1009 do {
1010 if (so->so_state & SS_CANTSENDMORE)
1011 gotoerr(EPIPE);
1012 if (so->so_error) {
1013 error = so->so_error;
1014 so->so_error = 0;
1015 goto release;
1016 }
1017 if ((so->so_state & SS_ISCONNECTED) == 0 &&
1018 (so->so_state & SS_ISCONFIRMING) == 0)
1019 gotoerr(ENOTCONN);
1020 if (allatonce && resid > so->so_snd.ssb_hiwat)
1021 gotoerr(EMSGSIZE);
1022
1023 space = ssb_space(&so->so_snd);
1024 if (flags & MSG_OOB)
1025 space += 1024;
1026 if ((space < 0 || (size_t)space < resid) && !allatonce &&
1027 space < so->so_snd.ssb_lowat) {
1028 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
1029 gotoerr(EWOULDBLOCK);
1030 ssb_unlock(&so->so_snd);
1031 error = ssb_wait(&so->so_snd);
1032 if (error)
1033 goto out;
1034 goto restart;
1035 }
1036 mp = &top;
1037 do {
f2a3782e 1038 int cnt = 0, async = 0;
c16aca65 1039
5bc42dd1
SZ
1040 if (uio == NULL) {
1041 /*
1042 * Data is prepackaged in "top".
1043 */
1044 resid = 0;
1045 } else do {
1046 if (resid > INT_MAX)
1047 resid = INT_MAX;
1048 m = m_getl((int)resid, MB_WAIT, MT_DATA,
1049 top == NULL ? M_PKTHDR : 0, &mlen);
1050 if (top == NULL) {
1051 m->m_pkthdr.len = 0;
1052 m->m_pkthdr.rcvif = NULL;
1053 }
1054 len = imin((int)szmin(mlen, resid), space);
1055 space -= len;
1056 error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
1057 resid = uio->uio_resid;
1058 m->m_len = len;
1059 *mp = m;
1060 top->m_pkthdr.len += len;
1061 if (error)
1062 goto release;
1063 mp = &m->m_next;
1064 if (resid == 0)
1065 break;
c16aca65
SZ
1066 ++cnt;
1067 } while (space > 0 && cnt < tcp_sosnd_agglim);
5bc42dd1 1068
6d618102
SZ
1069 if (tcp_sosnd_async)
1070 async = 1;
1071
5bc42dd1
SZ
1072 if (flags & MSG_OOB) {
1073 pru_flags = PRUS_OOB;
6d618102
SZ
1074 async = 0;
1075 } else if ((flags & MSG_EOF) && resid == 0) {
1076 pru_flags = PRUS_EOF;
5bc42dd1
SZ
1077 } else if (resid > 0 && space > 0) {
1078 /* If there is more to send, set PRUS_MORETOCOME */
1079 pru_flags = PRUS_MORETOCOME;
f2a3782e 1080 async = 1;
5bc42dd1
SZ
1081 } else {
1082 pru_flags = 0;
1083 }
1084
1af30d61 1085 if (flags & MSG_SYNC)
6d618102 1086 async = 0;
1af30d61 1087
5bc42dd1
SZ
1088 /*
1089 * XXX all the SS_CANTSENDMORE checks previously
1090 * done could be out of date. We could have recieved
1091 * a reset packet in an interrupt or maybe we slept
1092 * while doing page faults in uiomove() etc. We could
1093 * probably recheck again inside the splnet() protection
1094 * here, but there are probably other places that this
1095 * also happens. We must rethink this.
1096 */
f2a3782e 1097 if (!async) {
0ad8e15e
SZ
1098 error = so_pru_send(so, pru_flags, top,
1099 NULL, NULL, td);
1100 } else {
1101 so_pru_send_async(so, pru_flags, top,
1102 NULL, NULL, td);
1103 error = 0;
1104 }
5bc42dd1
SZ
1105
1106 top = NULL;
1107 mp = &top;
1108 if (error)
1109 goto release;
1110 } while (resid && space > 0);
1111 } while (resid);
1112
1113release:
1114 ssb_unlock(&so->so_snd);
1115out:
1116 if (top)
1117 m_freem(top);
1118 if (control)
1119 m_freem(control);
1120 return (error);
1121}
1122
6ea1e9b9 1123/*
984263bc 1124 * Implement receive operations on a socket.
6cef7136 1125 *
6d49aa6f 1126 * We depend on the way that records are added to the signalsockbuf
984263bc
MD
1127 * by sbappend*. In particular, each record (mbufs linked through m_next)
1128 * must begin with an address if the protocol so specifies,
1129 * followed by an optional mbuf or mbufs containing ancillary data,
1130 * and then zero or more mbufs of data.
6cef7136
MD
1131 *
1132 * Although the signalsockbuf is locked, new data may still be appended.
1133 * A token inside the ssb_lock deals with MP issues and still allows
1134 * the network to access the socket if we block in a uio.
984263bc
MD
1135 *
1136 * The caller may receive the data as a single mbuf chain by supplying
1137 * an mbuf **mp0 for use in returning the chain. The uio is then used
1138 * only for the count in uio_resid.
1139 */
1140int
c972a82f 1141soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
6d49aa6f 1142 struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
984263bc 1143{
d8a9a23b 1144 struct mbuf *m, *n;
857caa4a 1145 struct mbuf *free_chain = NULL;
e43a034f 1146 int flags, len, error, offset;
984263bc 1147 struct protosw *pr = so->so_proto;
984263bc 1148 int moff, type = 0;
e54488bb 1149 size_t resid, orig_resid;
d8a9a23b
MD
1150
1151 if (uio)
1152 resid = uio->uio_resid;
1153 else
e54488bb 1154 resid = (size_t)(sio->sb_climit - sio->sb_cc);
d8a9a23b 1155 orig_resid = resid;
984263bc 1156
984263bc 1157 if (psa)
857caa4a 1158 *psa = NULL;
984263bc 1159 if (controlp)
857caa4a 1160 *controlp = NULL;
984263bc
MD
1161 if (flagsp)
1162 flags = *flagsp &~ MSG_EOR;
1163 else
1164 flags = 0;
1165 if (flags & MSG_OOB) {
74f1caca 1166 m = m_get(MB_WAIT, MT_DATA);
984263bc
MD
1167 if (m == NULL)
1168 return (ENOBUFS);
6b6e0885 1169 error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
984263bc
MD
1170 if (error)
1171 goto bad;
d8a9a23b
MD
1172 if (sio) {
1173 do {
6d49aa6f 1174 sbappend(sio, m);
e54488bb
MD
1175 KKASSERT(resid >= (size_t)m->m_len);
1176 resid -= (size_t)m->m_len;
d8a9a23b
MD
1177 } while (resid > 0 && m);
1178 } else {
1179 do {
1180 uio->uio_resid = resid;
1181 error = uiomove(mtod(m, caddr_t),
e54488bb
MD
1182 (int)szmin(resid, m->m_len),
1183 uio);
d8a9a23b
MD
1184 resid = uio->uio_resid;
1185 m = m_free(m);
1186 } while (uio->uio_resid && error == 0 && m);
1187 }
984263bc
MD
1188bad:
1189 if (m)
1190 m_freem(m);
1191 return (error);
1192 }
e54488bb 1193 if ((so->so_state & SS_ISCONFIRMING) && resid)
6b6e0885 1194 so_pru_rcvd(so, 0);
984263bc 1195
20156c7a
MD
1196 /*
1197 * The token interlocks against the protocol thread while
1198 * ssb_lock is a blocking lock against other userland entities.
1199 */
1200 lwkt_gettoken(&so->so_rcv.ssb_token);
984263bc 1201restart:
6d49aa6f 1202 error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
984263bc 1203 if (error)
857caa4a 1204 goto done;
984263bc 1205
6d49aa6f 1206 m = so->so_rcv.ssb_mb;
984263bc
MD
1207 /*
1208 * If we have less data than requested, block awaiting more
1209 * (subject to any timeout) if:
1210 * 1. the current count is less than the low water mark, or
1211 * 2. MSG_WAITALL is set, and it is possible to do the entire
1212 * receive operation at once if we block (resid <= hiwat).
1213 * 3. MSG_DONTWAIT is not set
1214 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1215 * we have to do the receive in sections, and thus risk returning
1216 * a short count if a timeout or signal occurs after we start.
1217 */
857caa4a 1218 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
e54488bb 1219 (size_t)so->so_rcv.ssb_cc < resid) &&
6d49aa6f 1220 (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
e54488bb 1221 ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
984263bc 1222 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
6d49aa6f 1223 KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
984263bc
MD
1224 if (so->so_error) {
1225 if (m)
1226 goto dontblock;
1227 error = so->so_error;
1228 if ((flags & MSG_PEEK) == 0)
1229 so->so_error = 0;
1230 goto release;
1231 }
1232 if (so->so_state & SS_CANTRCVMORE) {
1233 if (m)
1234 goto dontblock;
1235 else
1236 goto release;
1237 }
857caa4a 1238 for (; m; m = m->m_next) {
984263bc 1239 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
6d49aa6f 1240 m = so->so_rcv.ssb_mb;
984263bc
MD
1241 goto dontblock;
1242 }
857caa4a 1243 }
984263bc 1244 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
6b6e0885 1245 (pr->pr_flags & PR_CONNREQUIRED)) {
984263bc
MD
1246 error = ENOTCONN;
1247 goto release;
1248 }
d8a9a23b 1249 if (resid == 0)
984263bc 1250 goto release;
9ba76b73 1251 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
984263bc
MD
1252 error = EWOULDBLOCK;
1253 goto release;
1254 }
6d49aa6f
MD
1255 ssb_unlock(&so->so_rcv);
1256 error = ssb_wait(&so->so_rcv);
984263bc 1257 if (error)
857caa4a 1258 goto done;
984263bc
MD
1259 goto restart;
1260 }
1261dontblock:
d8a9a23b 1262 if (uio && uio->uio_td && uio->uio_td->td_proc)
fde7ac71 1263 uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;
857caa4a
MD
1264
1265 /*
1266 * note: m should be == sb_mb here. Cache the next record while
1267 * cleaning up. Note that calling m_free*() will break out critical
1268 * section.
1269 */
6d49aa6f 1270 KKASSERT(m == so->so_rcv.ssb_mb);
857caa4a
MD
1271
1272 /*
1273 * Skip any address mbufs prepending the record.
1274 */
984263bc
MD
1275 if (pr->pr_flags & PR_ADDR) {
1276 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1277 orig_resid = 0;
1278 if (psa)
cfa2ba21 1279 *psa = dup_sockaddr(mtod(m, struct sockaddr *));
857caa4a 1280 if (flags & MSG_PEEK)
984263bc 1281 m = m->m_next;
857caa4a 1282 else
6d49aa6f 1283 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
984263bc 1284 }
857caa4a
MD
1285
1286 /*
1287 * Skip any control mbufs prepending the record.
1288 */
78812139
EN
1289#ifdef SCTP
1290 if (pr->pr_flags & PR_ADDR_OPT) {
1291 /*
1292 * For SCTP we may be getting a
1293 * whole message OR a partial delivery.
1294 */
857caa4a 1295 if (m && m->m_type == MT_SONAME) {
78812139
EN
1296 orig_resid = 0;
1297 if (psa)
1298 *psa = dup_sockaddr(mtod(m, struct sockaddr *));
857caa4a 1299 if (flags & MSG_PEEK)
78812139 1300 m = m->m_next;
857caa4a 1301 else
6d49aa6f 1302 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
78812139
EN
1303 }
1304 }
1305#endif /* SCTP */
984263bc
MD
1306 while (m && m->m_type == MT_CONTROL && error == 0) {
1307 if (flags & MSG_PEEK) {
1308 if (controlp)
1309 *controlp = m_copy(m, 0, m->m_len);
857caa4a 1310 m = m->m_next; /* XXX race */
984263bc 1311 } else {
984263bc 1312 if (controlp) {
6d49aa6f 1313 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
984263bc
MD
1314 if (pr->pr_domain->dom_externalize &&
1315 mtod(m, struct cmsghdr *)->cmsg_type ==
1316 SCM_RIGHTS)
1317 error = (*pr->pr_domain->dom_externalize)(m);
1318 *controlp = m;
857caa4a 1319 m = n;
984263bc 1320 } else {
6d49aa6f 1321 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
984263bc
MD
1322 }
1323 }
857caa4a 1324 if (controlp && *controlp) {
984263bc
MD
1325 orig_resid = 0;
1326 controlp = &(*controlp)->m_next;
1327 }
1328 }
857caa4a
MD
1329
1330 /*
1331 * flag OOB data.
1332 */
984263bc 1333 if (m) {
984263bc
MD
1334 type = m->m_type;
1335 if (type == MT_OOBDATA)
1336 flags |= MSG_OOB;
1337 }
857caa4a
MD
1338
1339 /*
1340 * Copy to the UIO or mbuf return chain (*mp).
1341 */
984263bc
MD
1342 moff = 0;
1343 offset = 0;
d8a9a23b 1344 while (m && resid > 0 && error == 0) {
984263bc
MD
1345 if (m->m_type == MT_OOBDATA) {
1346 if (type != MT_OOBDATA)
1347 break;
1348 } else if (type == MT_OOBDATA)
1349 break;
1350 else
1351 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1352 ("receive 3"));
6cef7136 1353 soclrstate(so, SS_RCVATMARK);
e54488bb 1354 len = (resid > INT_MAX) ? INT_MAX : resid;
984263bc
MD
1355 if (so->so_oobmark && len > so->so_oobmark - offset)
1356 len = so->so_oobmark - offset;
1357 if (len > m->m_len - moff)
1358 len = m->m_len - moff;
d8a9a23b 1359
984263bc 1360 /*
d8a9a23b
MD
1361 * Copy out to the UIO or pass the mbufs back to the SIO.
1362 * The SIO is dealt with when we eat the mbuf, but deal
1363 * with the resid here either way.
984263bc 1364 */
d8a9a23b 1365 if (uio) {
d8a9a23b
MD
1366 uio->uio_resid = resid;
1367 error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1368 resid = uio->uio_resid;
984263bc
MD
1369 if (error)
1370 goto release;
857caa4a 1371 } else {
e54488bb 1372 resid -= (size_t)len;
857caa4a
MD
1373 }
1374
1375 /*
1376 * Eat the entire mbuf or just a piece of it
1377 */
984263bc
MD
1378 if (len == m->m_len - moff) {
1379 if (m->m_flags & M_EOR)
1380 flags |= MSG_EOR;
78812139
EN
1381#ifdef SCTP
1382 if (m->m_flags & M_NOTIFICATION)
1383 flags |= MSG_NOTIFICATION;
1384#endif /* SCTP */
984263bc
MD
1385 if (flags & MSG_PEEK) {
1386 m = m->m_next;
1387 moff = 0;
1388 } else {
d8a9a23b 1389 if (sio) {
6d49aa6f
MD
1390 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1391 sbappend(sio, m);
857caa4a 1392 m = n;
984263bc 1393 } else {
6d49aa6f 1394 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
984263bc 1395 }
984263bc
MD
1396 }
1397 } else {
857caa4a 1398 if (flags & MSG_PEEK) {
984263bc 1399 moff += len;
857caa4a 1400 } else {
d8a9a23b 1401 if (sio) {
6d49aa6f
MD
1402 n = m_copym(m, 0, len, MB_WAIT);
1403 if (n)
1404 sbappend(sio, n);
d8a9a23b 1405 }
984263bc
MD
1406 m->m_data += len;
1407 m->m_len -= len;
6d49aa6f 1408 so->so_rcv.ssb_cc -= len;
984263bc
MD
1409 }
1410 }
1411 if (so->so_oobmark) {
1412 if ((flags & MSG_PEEK) == 0) {
1413 so->so_oobmark -= len;
1414 if (so->so_oobmark == 0) {
6cef7136 1415 sosetstate(so, SS_RCVATMARK);
984263bc
MD
1416 break;
1417 }
1418 } else {
1419 offset += len;
1420 if (offset == so->so_oobmark)
1421 break;
1422 }
1423 }
1424 if (flags & MSG_EOR)
1425 break;
1426 /*
1427 * If the MSG_WAITALL flag is set (for non-atomic socket),
d8a9a23b 1428 * we must not quit until resid == 0 or an error
984263bc
MD
1429 * termination. If a signal/timeout occurs, return
1430 * with a short count but without error.
6d49aa6f 1431 * Keep signalsockbuf locked against other readers.
984263bc 1432 */
d8a9a23b
MD
1433 while ((flags & MSG_WAITALL) && m == NULL &&
1434 resid > 0 && !sosendallatonce(so) &&
6d49aa6f 1435 so->so_rcv.ssb_mb == NULL) {
984263bc
MD
1436 if (so->so_error || so->so_state & SS_CANTRCVMORE)
1437 break;
1438 /*
1439 * The window might have closed to zero, make
1440 * sure we send an ack now that we've drained
1441 * the buffer or we might end up blocking until
1442 * the idle takes over (5 seconds).
1443 */
1444 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
6b6e0885 1445 so_pru_rcvd(so, flags);
6d49aa6f 1446 error = ssb_wait(&so->so_rcv);
984263bc 1447 if (error) {
6d49aa6f 1448 ssb_unlock(&so->so_rcv);
857caa4a
MD
1449 error = 0;
1450 goto done;
984263bc 1451 }
6d49aa6f 1452 m = so->so_rcv.ssb_mb;
984263bc
MD
1453 }
1454 }
1455
857caa4a
MD
1456 /*
1457 * If an atomic read was requested but unread data still remains
1458 * in the record, set MSG_TRUNC.
1459 */
bf8a9a6f 1460 if (m && pr->pr_flags & PR_ATOMIC)
984263bc 1461 flags |= MSG_TRUNC;
857caa4a
MD
1462
1463 /*
1464 * Cleanup. If an atomic read was requested drop any unread data.
1465 */
1466 if ((flags & MSG_PEEK) == 0) {
1467 if (m && (pr->pr_flags & PR_ATOMIC))
6d49aa6f 1468 sbdroprecord(&so->so_rcv.sb);
857caa4a 1469 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
6b6e0885 1470 so_pru_rcvd(so, flags);
984263bc 1471 }
bf8a9a6f 1472
d8a9a23b 1473 if (orig_resid == resid && orig_resid &&
984263bc 1474 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
6d49aa6f 1475 ssb_unlock(&so->so_rcv);
984263bc
MD
1476 goto restart;
1477 }
1478
1479 if (flagsp)
1480 *flagsp |= flags;
1481release:
6d49aa6f 1482 ssb_unlock(&so->so_rcv);
857caa4a 1483done:
20156c7a 1484 lwkt_reltoken(&so->so_rcv.ssb_token);
857caa4a
MD
1485 if (free_chain)
1486 m_freem(free_chain);
984263bc
MD
1487 return (error);
1488}
1489
edf5c732
MD
1490/*
1491 * Shut a socket down. Note that we do not get a frontend lock as we
1492 * want to be able to shut the socket down even if another thread is
1493 * blocked in a read(), thus waking it up.
1494 */
984263bc 1495int
c972a82f 1496soshutdown(struct socket *so, int how)
984263bc 1497{
984263bc
MD
1498 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1499 return (EINVAL);
1500
ff518922 1501 if (how != SHUT_WR) {
edf5c732 1502 /*ssb_lock(&so->so_rcv, M_WAITOK);*/
984263bc 1503 sorflush(so);
edf5c732 1504 /*ssb_unlock(&so->so_rcv);*/
ff518922 1505 }
0cd6e642 1506 if (how != SHUT_RD)
6b6e0885 1507 return (so_pru_shutdown(so));
984263bc
MD
1508 return (0);
1509}
1510
1511void
c972a82f 1512sorflush(struct socket *so)
984263bc 1513{
6d49aa6f 1514 struct signalsockbuf *ssb = &so->so_rcv;
1fd87d54 1515 struct protosw *pr = so->so_proto;
6d49aa6f 1516 struct signalsockbuf asb;
984263bc 1517
14343ad3 1518 atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);
e43a034f 1519
ff518922 1520 lwkt_gettoken(&ssb->ssb_token);
984263bc 1521 socantrcvmore(so);
6d49aa6f 1522 asb = *ssb;
14343ad3
MD
1523
1524 /*
1525 * Can't just blow up the ssb structure here
1526 */
ff518922 1527 bzero(&ssb->sb, sizeof(ssb->sb));
14343ad3 1528 ssb->ssb_timeo = 0;
14343ad3
MD
1529 ssb->ssb_lowat = 0;
1530 ssb->ssb_hiwat = 0;
1531 ssb->ssb_mbmax = 0;
1532 atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);
1533
edf5c732 1534 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
6d49aa6f
MD
1535 (*pr->pr_domain->dom_dispose)(asb.ssb_mb);
1536 ssb_release(&asb, so);
edf5c732
MD
1537
1538 lwkt_reltoken(&ssb->ssb_token);
984263bc
MD
1539}
1540
#ifdef INET
/*
 * Install or remove an accept filter on a listening socket.
 *
 * sopt == NULL removes the current filter (if any) and clears
 * SO_ACCEPTFILTER.  Otherwise sopt carries a struct accept_filter_arg
 * naming the filter to install; an already installed filter must be
 * removed first.
 *
 * Returns 0 on success, EINVAL if the socket is not listening, a filter
 * is already installed or the filter's create hook fails, ENOENT if the
 * named filter is unknown, or the error from sooptcopyin().
 */
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg	*afap = NULL;
	struct accept_filter	*afp;
	struct so_accf	*af = so->so_accf;
	int	error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
				af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}

	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}

	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	if (error)
		goto out;

	/* Force termination of both caller-supplied strings. */
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';

	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			/*
			 * so_accept_filter_str is still NULL when af_name
			 * was empty (af is zero-filled), so guard the free
			 * the same way the removal path does.
			 */
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	/* The temporary argument copy is released on every exit path. */
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */
1616
1617/*
1618 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1619 * an additional variant to handle the case where the option value needs
1620 * to be some kind of integer, but not a specific size.
1621 * In addition to their use here, these functions are also called by the
1622 * protocol-level pr_ctloutput() routines.
1623 */
1624int
c972a82f 1625sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
984263bc 1626{
de0003fe
AE
1627 return soopt_to_kbuf(sopt, buf, len, minlen);
1628}
1629
1630int
1631soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1632{
984263bc
MD
1633 size_t valsize;
1634
792239df 1635 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
de0003fe
AE
1636 KKASSERT(kva_p(buf));
1637
984263bc
MD
1638 /*
1639 * If the user gives us more than we wanted, we ignore it,
1640 * but if we don't get the minimum length the caller
1641 * wants, we return EINVAL. On success, sopt->sopt_valsize
1642 * is set to however much we actually retrieved.
1643 */
1644 if ((valsize = sopt->sopt_valsize) < minlen)
1645 return EINVAL;
1646 if (valsize > len)
1647 sopt->sopt_valsize = valsize = len;
1648
984263bc
MD
1649 bcopy(sopt->sopt_val, buf, valsize);
1650 return 0;
1651}
1652
e71a125f
AE
1653
1654int
c972a82f 1655sosetopt(struct socket *so, struct sockopt *sopt)
984263bc
MD
1656{
1657 int error, optval;
1658 struct linger l;
1659 struct timeval tv;
1660 u_long val;
14343ad3 1661 struct signalsockbuf *sotmp;
984263bc
MD
1662
1663 error = 0;
e79d388f 1664 sopt->sopt_dir = SOPT_SET;
984263bc 1665 if (sopt->sopt_level != SOL_SOCKET) {
6b6e0885 1666 if (so->so_proto && so->so_proto->pr_ctloutput) {
002c1265 1667 return (so_pr_ctloutput(so, sopt));
6b6e0885 1668 }
984263bc
MD
1669 error = ENOPROTOOPT;
1670 } else {
1671 switch (sopt->sopt_name) {
1672#ifdef INET
1673 case SO_ACCEPTFILTER:
1674 error = do_setopt_accept_filter(so, sopt);
1675 if (error)
1676 goto bad;
1677 break;
1678#endif /* INET */
1679 case SO_LINGER:
1680 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1681 if (error)
1682 goto bad;
1683
1684 so->so_linger = l.l_linger;
1685 if (l.l_onoff)
1686 so->so_options |= SO_LINGER;
1687 else
1688 so->so_options &= ~SO_LINGER;
1689 break;
1690
1691 case SO_DEBUG:
1692 case SO_KEEPALIVE:
1693 case SO_DONTROUTE:
1694 case SO_USELOOPBACK:
1695 case SO_BROADCAST:
1696 case SO_REUSEADDR:
1697 case SO_REUSEPORT:
1698 case SO_OOBINLINE:
1699 case SO_TIMESTAMP:
1700 error = sooptcopyin(sopt, &optval, sizeof optval,
1701 sizeof optval);
1702 if (error)
1703 goto bad;
1704 if (optval)
1705 so->so_options |= sopt->sopt_name;
1706 else
1707 so->so_options &= ~sopt->sopt_name;
1708 break;
1709
1710 case SO_SNDBUF:
1711 case SO_RCVBUF:
1712 case SO_SNDLOWAT:
1713 case SO_RCVLOWAT:
1714 error = sooptcopyin(sopt, &optval, sizeof optval,
1715 sizeof optval);
1716 if (error)
1717 goto bad;
1718
1719 /*
1720 * Values < 1 make no sense for any of these
1721 * options, so disallow them.
1722 */
1723 if (optval < 1) {
1724 error = EINVAL;
1725 goto bad;
1726 }
1727
1728 switch (sopt->sopt_name) {
1729 case SO_SNDBUF:
1730 case SO_RCVBUF:
6d49aa6f 1731 if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
984263bc 1732 &so->so_snd : &so->so_rcv, (u_long)optval,
e4700d00
JH
1733 so,
1734 &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
984263bc
MD
1735 error = ENOBUFS;
1736 goto bad;
1737 }
14343ad3
MD
1738 sotmp = (sopt->sopt_name == SO_SNDBUF) ?
1739 &so->so_snd : &so->so_rcv;
1740 atomic_clear_int(&sotmp->ssb_flags,
1741 SSB_AUTOSIZE);
984263bc
MD
1742 break;
1743
1744 /*
1745 * Make sure the low-water is never greater than
1746 * the high-water.
1747 */
1748 case SO_SNDLOWAT:
6d49aa6f
MD
1749 so->so_snd.ssb_lowat =
1750 (optval > so->so_snd.ssb_hiwat) ?
1751 so->so_snd.ssb_hiwat : optval;
14343ad3
MD
1752 atomic_clear_int(&so->so_snd.ssb_flags,
1753 SSB_AUTOLOWAT);
984263bc
MD
1754 break;
1755 case SO_RCVLOWAT:
6d49aa6f
MD
1756 so->so_rcv.ssb_lowat =
1757 (optval > so->so_rcv.ssb_hiwat) ?
1758 so->so_rcv.ssb_hiwat : optval;
14343ad3
MD
1759 atomic_clear_int(&so->so_rcv.ssb_flags,
1760 SSB_AUTOLOWAT);
984263bc
MD
1761 break;
1762 }
1763 break;
1764
1765 case SO_SNDTIMEO:
1766 case SO_RCVTIMEO:
1767 error = sooptcopyin(sopt, &tv, sizeof tv,
1768 sizeof tv);
1769 if (error)
1770 goto bad;
1771
1772 /* assert(hz > 0); */
45546849 1773 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
984263bc
MD
1774 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1775 error = EDOM;
1776 goto bad;
1777 }
1778 /* assert(tick > 0); */
45546849 1779 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
a591f597 1780 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
45546849 1781 if (val > INT_MAX) {
984263bc
MD
1782 error = EDOM;
1783 goto bad;
1784 }
1785 if (val == 0 && tv.tv_usec != 0)
1786 val = 1;
1787
1788 switch (sopt->sopt_name) {
1789 case SO_SNDTIMEO:
6d49aa6f 1790 so->so_snd.ssb_timeo = val;
984263bc
MD
1791 break;
1792 case SO_RCVTIMEO:
6d49aa6f 1793 so->so_rcv.ssb_timeo = val;
984263bc
MD
1794 break;
1795 }
1796 break;
1797 default:
1798 error = ENOPROTOOPT;
1799 break;
1800 }
1801 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
002c1265 1802 (void) so_pr_ctloutput(so, sopt);
984263bc
MD
1803 }
1804 }
1805bad:
1806 return (error);
1807}
1808
1809/* Helper routine for getsockopt */
1810int
f1f552f6 1811sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
984263bc 1812{
de0003fe
AE
1813 soopt_from_kbuf(sopt, buf, len);
1814 return 0;
1815}
1816
1817void
1818soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
1819{
984263bc
MD
1820 size_t valsize;
1821
565d9f6f
SZ
1822 if (len == 0) {
1823 sopt->sopt_valsize = 0;
1824 return;
1825 }
1826
792239df 1827 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
de0003fe 1828 KKASSERT(kva_p(buf));
984263bc
MD
1829
1830 /*
1831 * Documented get behavior is that we always return a value,
1832 * possibly truncated to fit in the user's buffer.
1833 * Traditional behavior is that we always tell the user
1834 * precisely how much we copied, rather than something useful
1835 * like the total amount we had available for her.
1836 * Note that this interface is not idempotent; the entire answer must
1837 * generated ahead of time.
1838 */
231d276b 1839 valsize = szmin(len, sopt->sopt_valsize);
984263bc
MD
1840 sopt->sopt_valsize = valsize;
1841 if (sopt->sopt_val != 0) {
de0003fe 1842 bcopy(buf, sopt->sopt_val, valsize);
984263bc 1843 }
e71a125f
AE
1844}
1845
984263bc 1846int
c972a82f 1847sogetopt(struct socket *so, struct sockopt *sopt)
984263bc
MD
1848{
1849 int error, optval;
755c519c 1850 long optval_l;
984263bc
MD
1851 struct linger l;
1852 struct timeval tv;
51f4ca92 1853#ifdef INET
984263bc 1854 struct accept_filter_arg *afap;
51f4ca92 1855#endif
984263bc
MD
1856
1857 error = 0;
e79d388f 1858 sopt->sopt_dir = SOPT_GET;
984263bc
MD
1859 if (sopt->sopt_level != SOL_SOCKET) {
1860 if (so->so_proto && so->so_proto->pr_ctloutput) {
002c1265 1861 return (so_pr_ctloutput(so, sopt));
984263bc
MD
1862 } else
1863 return (ENOPROTOOPT);
1864 } else {
1865 switch (sopt->sopt_name) {
1866#ifdef INET
1867 case SO_ACCEPTFILTER:
1868 if ((so->so_options & SO_ACCEPTCONN) == 0)
1869 return (EINVAL);
1870 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
e7b4468c 1871 M_TEMP, M_WAITOK | M_ZERO);
984263bc
MD
1872 if ((so->so_options & SO_ACCEPTFILTER) != 0) {
1873 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
1874 if (so->so_accf->so_accept_filter_str != NULL)
1875 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
1876 }
1877 error = sooptcopyout(sopt, afap, sizeof(*afap));
1878 FREE(afap, M_TEMP);
1879 break;
1880#endif /* INET */
1881
1882 case SO_LINGER:
1883 l.l_onoff = so->so_options & SO_LINGER;
1884 l.l_linger = so->so_linger;
1885 error = sooptcopyout(sopt, &l, sizeof l);
1886 break;
1887
1888 case SO_USELOOPBACK:
1889 case SO_DONTROUTE:
1890 case SO_DEBUG:
1891 case SO_KEEPALIVE:
1892 case SO_REUSEADDR:
1893 case SO_REUSEPORT:
1894 case SO_BROADCAST:
1895 case SO_OOBINLINE:
1896 case SO_TIMESTAMP:
1897 optval = so->so_options & sopt->sopt_name;
1898integer:
1899 error = sooptcopyout(sopt, &optval, sizeof optval);
1900 break;
1901
1902 case SO_TYPE:
1903 optval = so->so_type;
1904 goto integer;
1905
1906 case SO_ERROR:
1907 optval = so->so_error;
1908 so->so_error = 0;
1909 goto integer;
1910
1911 case SO_SNDBUF:
6d49aa6f 1912 optval = so->so_snd.ssb_hiwat;
984263bc
MD
1913 goto integer;
1914
1915 case SO_RCVBUF:
6d49aa6f 1916 optval = so->so_rcv.ssb_hiwat;
984263bc
MD
1917 goto integer;
1918
1919 case SO_SNDLOWAT:
6d49aa6f 1920 optval = so->so_snd.ssb_lowat;
984263bc
MD
1921 goto integer;
1922
1923 case SO_RCVLOWAT:
6d49aa6f 1924 optval = so->so_rcv.ssb_lowat;
984263bc
MD
1925 goto integer;
1926
1927 case SO_SNDTIMEO:
1928 case SO_RCVTIMEO:
1929 optval = (sopt->sopt_name == SO_SNDTIMEO ?
6d49aa6f 1930 so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);
984263bc
MD
1931
1932 tv.tv_sec = optval / hz;
a591f597 1933 tv.tv_usec = (optval % hz) * ustick;
984263bc
MD
1934 error = sooptcopyout(sopt, &tv, sizeof tv);
1935 break;
1936
755c519c
SZ
1937 case SO_SNDSPACE:
1938 optval_l = ssb_space(&so->so_snd);
1939 error = sooptcopyout(sopt, &optval_l, sizeof(optval_l));
1940 break;
1941
984263bc
MD
1942 default:
1943 error = ENOPROTOOPT;
1944 break;
1945 }
1946 return (error);
1947 }
1948}
1949
1950/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1951int
1952soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1953{
1954 struct mbuf *m, *m_prev;
bf6ac9fa
JH
1955 int sopt_size = sopt->sopt_valsize, msize;
1956
1957 m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
1958 0, &msize);
1959 if (m == NULL)
1960 return (ENOBUFS);
1961 m->m_len = min(msize, sopt_size);
984263bc
MD
1962 sopt_size -= m->m_len;
1963 *mp = m;
1964 m_prev = m;
1965
bf6ac9fa
JH
1966 while (sopt_size > 0) {
1967 m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
1968 MT_DATA, 0, &msize);
1969 if (m == NULL) {
984263bc 1970 m_freem(*mp);
bf6ac9fa 1971 return (ENOBUFS);
984263bc 1972 }
bf6ac9fa 1973 m->m_len = min(msize, sopt_size);
984263bc
MD
1974 sopt_size -= m->m_len;
1975 m_prev->m_next = m;
1976 m_prev = m;
1977 }
bf6ac9fa 1978 return (0);
984263bc
MD
1979}
1980
/*
 * XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines.
 *
 * Wrapper around soopt_to_mbuf().  Always returns 0.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return (0);
}
1988
1989void
1990soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
1991{
c3e742f9
NT
1992 size_t valsize;
1993 void *val;
984263bc 1994
792239df 1995 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
de0003fe 1996 KKASSERT(kva_p(m));
984263bc 1997 if (sopt->sopt_val == NULL)
792239df 1998 return;
c3e742f9
NT
1999 val = sopt->sopt_val;
2000 valsize = sopt->sopt_valsize;
2001 while (m != NULL && valsize >= m->m_len) {
de0003fe 2002 bcopy(val, mtod(m, char *), m->m_len);
c3e742f9
NT
2003 valsize -= m->m_len;
2004 val = (caddr_t)val + m->m_len;
984263bc
MD
2005 m = m->m_next;
2006 }
2007 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2008 panic("ip6_sooptmcopyin");
984263bc
MD
2009}
2010
de0003fe
AE
/*
 * XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines.
 *
 * Wrapper around soopt_from_mbuf(), whose result is returned unchanged.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	int error;

	error = soopt_from_mbuf(sopt, m);
	return (error);
}
2017
984263bc 2018int
de0003fe 2019soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
984263bc
MD
2020{
2021 struct mbuf *m0 = m;
2022 size_t valsize = 0;
c3e742f9
NT
2023 size_t maxsize;
2024 void *val;
984263bc 2025
792239df 2026 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
de0003fe 2027 KKASSERT(kva_p(m));
984263bc
MD
2028 if (sopt->sopt_val == NULL)
2029 return 0;
c3e742f9
NT
2030 val = sopt->sopt_val;
2031 maxsize = sopt->sopt_valsize;
2032 while (m != NULL && maxsize >= m->m_len) {
de0003fe 2033 bcopy(mtod(m, char *), val, m->m_len);
c3e742f9
NT
2034 maxsize -= m->m_len;
2035 val = (caddr_t)val + m->m_len;
984263bc
MD
2036 valsize += m->m_len;
2037 m = m->m_next;
2038 }
2039 if (m != NULL) {
2040 /* enough soopt buffer should be given from user-land */
2041 m_freem(m0);
bf6ac9fa 2042 return (EINVAL);
984263bc
MD
2043 }
2044 sopt->sopt_valsize = valsize;
2045 return 0;
2046}
2047
2048void
c972a82f 2049sohasoutofband(struct socket *so)
984263bc
MD
2050{
2051 if (so->so_sigio != NULL)
2052 pgsigio(so->so_sigio, SIGURG, 0);
5b22f1a7 2053 KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
984263bc
MD
2054}
2055
2056int
984263bc
MD
2057sokqfilter(struct file *fp, struct knote *kn)
2058{
2059 struct socket *so = (struct socket *)kn->kn_fp->f_data;
6d49aa6f 2060 struct signalsockbuf *ssb;
984263bc
MD
2061
2062 switch (kn->kn_filter) {
2063 case EVFILT_READ:
2064 if (so->so_options & SO_ACCEPTCONN)
2065 kn->kn_fop = &solisten_filtops;
2066 else
2067 kn->kn_fop = &soread_filtops;
6d49aa6f 2068 ssb = &so->so_rcv;
984263bc
MD
2069 break;
2070 case EVFILT_WRITE:
2071 kn->kn_fop = &sowrite_filtops;
6d49aa6f 2072 ssb = &so->so_snd;
984263bc 2073 break;
73c344d3
SG
2074 case EVFILT_EXCEPT:
2075 kn->kn_fop = &soexcept_filtops;
2076 ssb = &so->so_rcv;
2077 break;
984263bc 2078 default:
b287d649 2079 return (EOPNOTSUPP);
984263bc
MD
2080 }
2081
5b22f1a7 2082 knote_insert(&ssb->ssb_kq.ki_note, kn);
14343ad3 2083 atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
984263bc
MD
2084 return (0);
2085}
2086
2087static void
2088filt_sordetach(struct knote *kn)
2089{
2090 struct socket *so = (struct socket *)kn->kn_fp->f_data;
984263bc 2091
5b22f1a7
SG
2092 knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
2093 if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
14343ad3 2094 atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
984263bc
MD
2095}
2096
2097/*ARGSUSED*/
2098static int
2099filt_soread(struct knote *kn, long hint)
2100{
2101 struct socket *so = (struct socket *)kn->kn_fp->f_data;
2102
73c344d3
SG
2103 if (kn->kn_sfflags & NOTE_OOB) {
2104 if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
2105 kn->kn_fflags |= NOTE_OOB;
2106 return (1);
2107 }
2108 return (0);
70a4a30f 2109 }
6d49aa6f 2110 kn->kn_data = so->so_rcv.ssb_cc;
8c4ed426 2111
3bcb6e5e
SZ
2112 if (so->so_state & SS_CANTRCVMORE) {
2113 /*
2114 * Only set NODATA if all data has been exhausted.
2115 */
2116 if (kn->kn_data == 0)
2117 kn->kn_flags |= EV_NODATA;
984263bc
MD
2118 kn->kn_flags |= EV_EOF;
2119 kn->kn_fflags = so->so_error;
2120 return (1);
2121 }
2122 if (so->so_error) /* temporary udp error */
2123 return (1);
2124 if (kn->kn_sfflags & NOTE_LOWAT)
2125 return (kn->kn_data >= kn->kn_sdata);
e5857bf7 2126 return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
6cef7136 2127 !TAILQ_EMPTY(&so->so_comp));
984263bc
MD
2128}
2129
2130static void
2131filt_sowdetach(struct knote *kn)
2132{
2133 struct socket *so = (struct socket *)kn->kn_fp->f_data;
984263bc 2134
5b22f1a7
SG
2135 knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
2136 if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
14343ad3 2137 atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
984263bc
MD
2138}
2139
2140/*ARGSUSED*/
2141static int
2142filt_sowrite(struct knote *kn, long hint)
2143{
2144 struct socket *so = (struct socket *)kn->kn_fp->f_data;
2145
6d49aa6f 2146 kn->kn_data = ssb_space(&so->so_snd);
984263bc 2147 if (so->so_state & SS_CANTSENDMORE) {
3bcb6e5e 2148 kn->kn_flags |= (EV_EOF | EV_NODATA);
984263bc
MD
2149 kn->kn_fflags = so->so_error;
2150 return (1);
2151 }
2152 if (so->so_error) /* temporary udp error */
2153 return (1);
2154 if (((so->so_state & SS_ISCONNECTED) == 0) &&
2155 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2156 return (0);
2157 if (kn->kn_sfflags & NOTE_LOWAT)
2158 return (kn->kn_data >= kn->kn_sdata);
6d49aa6f 2159 return (kn->kn_data >= so->so_snd.ssb_lowat);
984263bc
MD
2160}
2161
2162/*ARGSUSED*/
2163static int
2164filt_solisten(struct knote *kn, long hint)
2165{
2166 struct socket *so = (struct socket *)kn->kn_fp->f_data;
2167
2168 kn->kn_data = so->so_qlen;
2169 return (! TAILQ_EMPTY(&so->so_comp));
2170}