/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 * $DragonFly: src/sys/kern/uipc_socket.c,v 1.19 2004/05/12 20:21:21 hmp Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>

#include <vm/vm_zone.h>

#include <machine/limits.h>
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

struct	vm_zone *socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
static struct socket *
soalloc(int waitok)
{
	struct socket *so;

	so = zalloc(socket_zone);
	if (so) {
		/* XXX race condition for reentrant kernel */
		bzero(so, sizeof *so);
		so->so_gencnt = ++so_gencnt;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.sb_sel.si_mlist);
		TAILQ_INIT(&so->so_snd.sb_sel.si_mlist);
	}
	return (so);
}
int
socreate(int dom, struct socket **aso, int type,
	int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	so->so_proto = prp;
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		/* attach failed; mark the socket so sofree() releases it */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s = splnet();
	int error;

	error = so_pru_bind(so, nam, td);
	splx(s);
	return (error);
}
void
sodealloc(struct socket *so)
{
	so->so_gencnt = ++so_gencnt;
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);

	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
	crfree(so->so_cred);
	zfree(socket_zone, so);
}
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int s, error;

	s = splnet();
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) {
		splx(s);
		return (EINVAL);
	}
	error = so_pru_listen(so, td);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
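
/*
 * Illustrative userland sketch (assumption: the standard sockets API; not
 * part of the original file): the backlog passed to listen(2) lands in
 * so_qlimit above, silently clamped to the kern.ipc.somaxconn sysctl, so
 * an out-of-range request is not an error.
 *
 *	int s = socket(AF_INET, SOCK_STREAM, 0);
 *	// ... bind(s, ...) ...
 *	if (listen(s, 100000) == 0) {
 *		// effective queue limit is kern.ipc.somaxconn,
 *		// not the 100000 that was requested
 *	}
 */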
void
sofree(struct socket *so)
{
	struct socket *head = so->so_head;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			return;
		} else {
			panic("sofree: not queued");
		}
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
	}
	sbrelease(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	int s = splnet();		/* conservative */
	int error = 0;

	funsetown(so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		sp = TAILQ_FIRST(&so->so_incomp);
		for (; sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			(void) soabort(sp);
		}
		for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_state &= ~SS_COMP;
			sp->so_head = NULL;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2 = so_pru_detach(so);

		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
/*
 * Must be called at splnet...
 */
int
soabort(struct socket *so)
{
	int error;

	error = so_pru_abort(so);
	if (error) {
		sofree(so);
		return (error);
	}
	return (0);
}
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = so_pru_accept(so, nam);
	splx(s);
	return (error);
}
int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = so_pru_connect(so, nam, td);
	splx(s);
	return (error);
}
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s = splnet();
	int error;

	error = so_pru_connect2(so1, so2);
	splx(s);
	return (error);
}
int
sodisconnect(struct socket *so)
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	splx(s);
	return (error);
}
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
	struct mbuf *top, struct mbuf *control, int flags,
	struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_proc && td->td_proc->p_stats)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == 0)
				gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			gotoerr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				gotoerr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						goto release;
					}
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						goto release;
					}
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE) {
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					len = min(min(mlen, resid), space);
				} else {
nopages:
					len = min(min(mlen, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag and nothing left to
				 * send then use PRU_SEND_EOF instead of PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			s = splnet();				/* XXX */
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control, td);
			splx(s);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
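
/*
 * Illustrative userland sketch (assumption: the standard sockets API; not
 * part of the original file): the contract documented above means that a
 * non-blocking writer must cope with EWOULDBLOCK and short counts, and
 * that an atomic (datagram) sender gets EMSGSIZE if the message can never
 * fit in the send buffer.
 *
 *	ssize_t n = send(s, buf, len, MSG_DONTWAIT);
 *	if (n < 0 && errno == EWOULDBLOCK) {
 *		// sosend() found no buffer space while SS_NBIO was set
 *	} else if (n >= 0 && (size_t)n < len) {
 *		// short count: the caller must resume at buf + n
 *	}
 */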
/*
 * A specialization of sosend() for UDP based on protocol-specific knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int resid, error, s;
	boolean_t dontroute;		/* temporary SO_DONTROUTE setting */

	if (td->td_proc && td->td_proc->p_stats)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : top->m_pkthdr.len;

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	s = splnet();
	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		splx(s);
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.sb_hiwat)
		gotoerr(EMSGSIZE);
	if (uio && sbspace(&so->so_snd) < resid) {
		if (so->so_state & SS_NBIO)
			gotoerr(EWOULDBLOCK);
		sbunlock(&so->so_snd);
		error = sbwait(&so->so_snd);
		splx(s);
		if (error)
			goto out;
		goto restart;
	}
	splx(s);

	if (uio) {
		top = m_uiomove(uio, M_WAIT, 0);
		if (top == NULL)
			goto release;
	}

	dontroute = (flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE);
	if (dontroute)
		so->so_options |= SO_DONTROUTE;

	error = so_pru_send(so, 0, top, addr, NULL, td);
	top = NULL;		/* sent or freed in lower layer */

	if (dontroute)
		so->so_options &= ~SO_DONTROUTE;

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}
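
/*
 * Illustrative userland sketch (assumption: the standard sockets API; not
 * part of the original file): the checks above surface as errno values.
 * An unconnected UDP socket with no destination gets EDESTADDRREQ, and a
 * datagram larger than the send buffer high-water mark gets EMSGSIZE.
 *
 *	int u = socket(AF_INET, SOCK_DGRAM, 0);
 *	if (send(u, buf, len, 0) < 0 && errno == EDESTADDRREQ)
 *		sendto(u, buf, len, 0, (struct sockaddr *)&dst, sizeof(dst));
 */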
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		so_pru_rcvd(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
			    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else {
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("receive 3"));
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else {
			uio->uio_resid -= len;
		}
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					so->so_rcv.sb_mb = m = m_free(m);
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic sockets),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			so_pru_rcvd(so, flags);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
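
/*
 * Illustrative userland sketch (assumption: the standard sockets API; the
 * variables rec/reclen are hypothetical): the MSG_PEEK branches above
 * leave the record queued, and MSG_WAITALL keeps soreceive() looping
 * until the request is filled, EOF, or an error.
 *
 *	char hdr[4];
 *	// look at a length prefix without consuming it
 *	recv(s, hdr, sizeof(hdr), MSG_PEEK);
 *	// then block until the full record is available
 *	recv(s, rec, reclen, MSG_WAITALL);
 */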
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
	int s;

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK);
	bzero(af, sizeof(*af));
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			FREE(af->so_accept_filter_str, M_ACCF);
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
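
/*
 * Illustrative userland sketch (assumption: an accept filter module such
 * as FreeBSD's accf_data is loaded; the filter name is module-specific):
 * installing a filter defers accept(2) wakeups on a listening socket
 * until the filter is satisfied, e.g. until data has arrived.
 *
 *	struct accept_filter_arg afa;
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
 */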
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return (EINVAL);
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return (0);
}
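
/*
 * Illustrative sketch (hypothetical protocol code, not part of this
 * file): a pr_ctloutput() handler typically pulls a fixed-size integer
 * option with sooptcopyin(), passing the same value for len and minlen
 * so that anything smaller than an int is rejected with EINVAL.
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 *	// optval now holds exactly sizeof(int) bytes from the caller
 */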
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
			    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}
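
/*
 * Worked example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above
 * (assumption: hz = 100, so tick = 1000000 / hz = 10000 microseconds):
 * a timeout of { tv_sec = 2, tv_usec = 500000 } becomes
 *
 *	val = 2 * 100 + 500000 / 10000 = 250 ticks
 *
 * and the request is rejected with EDOM once tv_sec * hz would exceed
 * SHRT_MAX, since sb_timeo is a short.
 */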
/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != 0) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}
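
/*
 * Illustrative userland consequence (assumption: the standard
 * getsockopt(2) interface): because sooptcopyout() truncates to the
 * caller's buffer, a too-small buffer yields a partial value and a
 * value-length reporting only how much was copied, not how much was
 * available.
 *
 *	int v;
 *	socklen_t vlen = sizeof(v);
 *
 *	getsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, &vlen);
 *	// vlen is min(sizeof(v), size of the kernel's answer)
 */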
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	struct accept_filter_arg *afap;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK);
			bzero(afap, sizeof(*afap));
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
	if (m == 0)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
		if (m == 0) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (caddr_t)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)	/* enough mbufs should have been allocated above */
		panic("ip6_sooptmcopyin");
	return (0);
}
/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (caddr_t)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* userland did not supply enough soopt buffer */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
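
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * three helpers above are used together to shuttle an option value
 * through an mbuf chain for the older (__FreeBSD__ < 3) ctloutput
 * interface.
 *
 *	struct mbuf *m;
 *	int error;
 *
 *	if ((error = soopt_getm(sopt, &m)) != 0)	// size the chain
 *		return (error);
 *	if ((error = soopt_mcopyin(sopt, m)) != 0)	// fill from sopt
 *		return (error);				// chain already freed
 *	// ... protocol consumes or overwrites the chain ...
 *	error = soopt_mcopyout(sopt, m);		// copy answer back
 */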
void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}
int
sopoll(struct socket *so, int events, struct ucred *cred, struct thread *td)
{
	int revents = 0;
	int s = splnet();

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	splx(s);
	return (revents);
}
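
/*
 * Illustrative userland sketch (assumption: the standard poll(2) API; not
 * part of the original file): POLLPRI above corresponds to out-of-band
 * data, so a poller watching both normal and urgent readiness would do:
 *
 *	struct pollfd pfd = { .fd = s, .events = POLLIN | POLLPRI };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
 *		// at the oob mark: the so_oobmark/SS_RCVATMARK case above
 *	}
 */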
int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	s = splnet();
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	splx(s);
	return (0);
}
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	splx(s);
}
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	splx(s);
}
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (! TAILQ_EMPTY(&so->so_comp));
}
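
/*
 * Illustrative userland sketch (assumption: the standard kqueue(2)/
 * kevent(2) API; not part of the original file): an EVFILT_READ knote on
 * a listening socket is routed to filt_solisten() above, so kevent()
 * reports readiness when so_comp is non-empty and returns the number of
 * completed connections in the event's data field.
 *
 *	struct kevent kev, ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
 *		// ev.data == number of connections ready for accept(2)
 *	}
 */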