2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
37 * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $
38 * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.14 2003/09/12 00:43:30 daver Exp $
41 #include "opt_compat.h"
42 #include "opt_ktrace.h"
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/sysproto.h>
48 #include <sys/malloc.h>
49 #include <sys/filedesc.h>
50 #include <sys/event.h>
52 #include <sys/fcntl.h>
54 #include <sys/filio.h>
55 #include <sys/kern_syscall.h>
57 #include <sys/protosw.h>
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/signalvar.h>
62 #include <sys/vnode.h>
64 #include <sys/mount.h>
66 #include <sys/ktrace.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_extern.h>
74 #include <sys/file2.h>
76 #if defined(COMPAT_43)
77 #include <emulation/43bsd/43bsd_socket.h>
78 #endif /* COMPAT_43 */
/*
 * File-scope declarations: the SYSINIT hook that registers sf_buf_init()
 * at SI_SUB_MBUF / SI_ORDER_ANY, prototypes for the static helpers
 * sendit(), recvit() and do_sendfile(), and the sendfile buffer pool state.
 */
80 static void sf_buf_init(void *arg);
81 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
83 static int sendit(int s, struct msghdr *mp, int flags, int *res);
84 static int recvit(int s, struct msghdr *mp, caddr_t namelenp, int *res);
86 static int do_sendfile(struct sendfile_args *uap, int compat);
/*
 * sf_buf pool state, all set up by sf_buf_init():
 *   sf_freelist       - list of currently free sf_bufs
 *   sf_base           - base kernel virtual address of the mapping window
 *   sf_bufs           - array of sf_buf descriptors, one per window page
 *   sf_buf_alloc_want - set to 1 by an allocator about to sleep on an empty
 *                       free list; cleared by sf_buf_free() before wakeup()
 */
88 static SLIST_HEAD(, sf_buf) sf_freelist;
89 static vm_offset_t sf_base;
90 static struct sf_buf *sf_bufs;
91 static int sf_buf_alloc_want;
94 * System call interface to the socket abstraction.
/*
 * COMPAT_OLDSOCK is defined whenever COMPAT_43 or COMPAT_SUNOS is
 * configured; it gates the old-style (4.3BSD) socket call paths below.
 * socketops is the file-operations vector this file installs in f_ops
 * for every socket-backed struct file it creates.
 * NOTE(review): the matching #endif is not visible in this listing.
 */
96 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
97 #define COMPAT_OLDSOCK
100 extern struct fileops socketops;
103 * socket_args(int domain, int type, int protocol)
/*
 * socket system call: create a socket and return a descriptor for it.
 *
 * Visible steps: allocate a struct file and descriptor with falloc(),
 * create the socket with socreate() from uap->domain/type/protocol, then
 * attach the socket to the file (f_data, FREAD|FWRITE, socketops,
 * DTYPE_SOCKET) and report the descriptor through uap->sysmsg_result.
 *
 * NOTE(review): this listing is partial (declarations, error checks and
 * returns are not visible); only the visible statements are described.
 */
106 socket(struct socket_args *uap)
108 struct thread *td = curthread;
109 struct proc *p = td->td_proc;
110 struct filedesc *fdp;
118 error = falloc(p, &fp, &fd);
122 error = socreate(uap->domain, &so, uap->type, uap->protocol, td);
/*
 * Undo the descriptor-table entry only if it still points at our file.
 * Presumably this is the socreate() failure path; the enclosing
 * conditional is not visible here.
 */
124 if (fdp->fd_ofiles[fd] == fp) {
125 fdp->fd_ofiles[fd] = NULL;
129 fp->f_data = (caddr_t)so;
130 fp->f_flag = FREAD|FWRITE;
131 fp->f_ops = &socketops;
132 fp->f_type = DTYPE_SOCKET;
133 uap->sysmsg_result = fd;
/*
 * Kernel-side bind: resolve descriptor s of the current process to a
 * struct file with holdsock(), then call sobind() with the kernel-space
 * address sa on the socket stored in fp->f_data.
 * NOTE(review): listing is partial; error handling is not visible.
 */
140 kern_bind(int s, struct sockaddr *sa)
142 struct thread *td = curthread;
143 struct proc *p = td->td_proc;
148 error = holdsock(p->p_fd, s, &fp);
151 error = sobind((struct socket *)fp->f_data, sa, td);
157 * bind_args(int s, caddr_t name, int namelen)
/*
 * bind system call: fetch the user-supplied address (uap->name,
 * uap->namelen) into a kernel sockaddr with getsockaddr(), then hand it
 * to kern_bind() for descriptor uap->s.
 * NOTE(review): listing is partial; error checks and the release of sa
 * are not visible.
 */
160 bind(struct bind_args *uap)
165 error = getsockaddr(&sa, uap->name, uap->namelen);
168 error = kern_bind(uap->s, sa);
/*
 * Kernel-side listen: resolve descriptor s of the current process with
 * holdsock(), then call solisten() with the requested backlog on the
 * socket stored in fp->f_data.
 * NOTE(review): listing is partial; error handling is not visible.
 */
175 kern_listen(int s, int backlog)
177 struct thread *td = curthread;
178 struct proc *p = td->td_proc;
183 error = holdsock(p->p_fd, s, &fp);
186 error = solisten((struct socket *)fp->f_data, backlog, td);
192 * listen_args(int s, int backlog)
/*
 * listen system call: thin wrapper that forwards uap->s and uap->backlog
 * to kern_listen().
 */
195 listen(struct listen_args *uap)
199 error = kern_listen(uap->s, uap->backlog);
204 * The second argument to kern_accept() is a handle to a struct sockaddr.
205 * This allows kern_accept() to return a pointer to an allocated struct
206 * sockaddr which must be freed later with FREE(). The caller must
207 * initialize *name to NULL.
/*
 * Kernel-side accept on listening descriptor s.
 *
 * Parameters (as used by the visible code):
 *   s       - descriptor of the listening socket
 *   name    - optional handle for the peer address
 *   namelen - optional in/out length; a negative input is tested for, and
 *             on output it is clamped down to sa->sa_len
 *   res     - result slot (written on lines not visible here)
 *
 * Visible flow: hold the listening file, require SO_ACCEPTCONN, wait for a
 * completed connection, dequeue it, allocate a new file/descriptor, copy
 * SIGIO ownership, initialise the new file as a socket, sync the
 * non-blocking/async state, and call soaccept().
 *
 * NOTE(review): this listing is partial (braces, returns, several
 * assignments such as fflag are not visible); only visible statements are
 * described.
 */
210 kern_accept(int s, struct sockaddr **name, int *namelen, int *res)
212 struct thread *td = curthread;
213 struct proc *p = td->td_proc;
214 struct filedesc *fdp = p->p_fd;
215 struct file *lfp = NULL;
216 struct file *nfp = NULL;
219 struct socket *head, *so;
221 u_int fflag; /* type must match fp->f_flag */
224 if (name && namelen && *namelen < 0)
227 error = holdsock(fdp, s, &lfp);
231 head = (struct socket *)lfp->f_data;
232 if ((head->so_options & SO_ACCEPTCONN) == 0) {
/*
 * Wait until the completed-connection queue is non-empty or an error is
 * pending. A socket that can no longer receive gets ECONNABORTED, a
 * non-blocking socket gets EWOULDBLOCK; otherwise sleep interruptibly
 * (PCATCH) on so_timeo.
 */
237 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
238 if (head->so_state & SS_CANTRCVMORE) {
239 head->so_error = ECONNABORTED;
242 if ((head->so_state & SS_NBIO) != 0) {
243 head->so_error = EWOULDBLOCK;
246 error = tsleep((caddr_t)&head->so_timeo, PCATCH, "accept", 0);
252 if (head->so_error) {
253 error = head->so_error;
260 * At this point we know that there is at least one connection
261 * ready to be accepted. Remove it from the queue prior to
262 * allocating the file descriptor for it since falloc() may
263 * block allowing another process to accept the connection
266 so = TAILQ_FIRST(&head->so_comp);
267 TAILQ_REMOVE(&head->so_comp, so, so_list);
271 error = falloc(p, &nfp, &fd);
274 * Probably ran out of file descriptors. Put the
275 * unaccepted connection back onto the queue and
276 * do another wakeup so some other process might
277 * have a chance at it.
279 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
281 wakeup_one(&head->so_timeo);
288 /* connection has been removed from the listen queue */
289 KNOTE(&head->so_rcv.sb_sel.si_note, 0);
291 so->so_state &= ~SS_COMP;
293 if (head->so_sigio != NULL)
294 fsetown(fgetown(head->so_sigio), &so->so_sigio);
296 nfp->f_data = (caddr_t)so;
298 nfp->f_ops = &socketops;
299 nfp->f_type = DTYPE_SOCKET;
300 /* Sync socket nonblocking/async state with file flags */
301 tmp = fflag & FNONBLOCK;
302 (void) fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, td);
303 tmp = fflag & FASYNC;
304 (void) fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, td);
307 error = soaccept(so, &sa);
310 * Set the returned name and namelen as applicable. Set the returned
311 * namelen to 0 for older code which might ignore the return value
315 if (sa && name && namelen) {
316 if (*namelen > sa->sa_len)
317 *namelen = sa->sa_len;
326 * close the new descriptor, assuming someone hasn't ripped it
327 * out from under us. Note that *res is normally ignored if an
328 * error is returned but a syscall message will still have access
329 * to the result code.
333 if (fdp->fd_ofiles[fd] == nfp) {
334 fdp->fd_ofiles[fd] = NULL;
341 * Release explicitly held references before returning.
351 * accept_args(int s, caddr_t name, int *anamelen)
/*
 * accept system call.
 *
 * Visible flow when an address is wanted: copy the caller's length in from
 * uap->anamelen, call kern_accept() with &sa and &sa_len, then copy the
 * address out to uap->name and the resulting length back to uap->anamelen.
 * The alternative path calls kern_accept() with no name at all.
 *
 * NOTE(review): that second call passes literal 0 for the int *namelen
 * parameter (i.e. a null pointer). The branch conditions, error checks
 * and the release of sa are not visible in this listing.
 */
354 accept(struct accept_args *uap)
356 struct sockaddr *sa = NULL;
361 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len));
365 error = kern_accept(uap->s, &sa, &sa_len, &uap->sysmsg_result);
368 error = copyout(sa, uap->name, sa_len);
370 error = copyout(&sa_len, uap->anamelen,
371 sizeof(*uap->anamelen));
376 error = kern_accept(uap->s, NULL, 0, &uap->sysmsg_result);
/*
 * Kernel-side connect of descriptor s to the kernel-space address sa.
 *
 * Visible flow: hold the socket, test for a non-blocking socket that is
 * already connecting, call soconnect(), test the same condition again
 * after the call, then sleep interruptibly (PCATCH) on so_timeo while the
 * socket is still connecting and no so_error is pending. Afterwards the
 * result is taken from so->so_error, SS_ISCONNECTING is cleared, and
 * ERESTART is special-cased.
 *
 * NOTE(review): the bodies of the two SS_NBIO tests and of the ERESTART
 * test are not visible in this listing, so the error codes they produce
 * are not documented here.
 */
382 kern_connect(int s, struct sockaddr *sa)
384 struct thread *td = curthread;
385 struct proc *p = td->td_proc;
390 error = holdsock(p->p_fd, s, &fp);
393 so = (struct socket *)fp->f_data;
394 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
398 error = soconnect(so, sa, td);
401 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
406 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
407 error = tsleep((caddr_t)&so->so_timeo, PCATCH, "connec", 0);
412 error = so->so_error;
417 so->so_state &= ~SS_ISCONNECTING;
418 if (error == ERESTART)
426 * connect_args(int s, caddr_t name, int namelen)
/*
 * connect system call: fetch the user-supplied address (uap->name,
 * uap->namelen) into a kernel sockaddr with getsockaddr(), then pass it
 * to kern_connect() for descriptor uap->s.
 * NOTE(review): listing is partial; error checks and the release of sa
 * are not visible.
 */
429 connect(struct connect_args *uap)
434 error = getsockaddr(&sa, uap->name, uap->namelen);
437 error = kern_connect(uap->s, sa);
/*
 * Kernel-side socketpair: create two sockets of the given domain, type
 * and protocol, give each its own struct file/descriptor, and connect
 * them to each other with soconnect2().
 *
 * sv is indexed as sv[0] / sv[1] by the cleanup code below, so it is
 * expected to be a two-element array receiving the descriptors (the
 * stores into sv are on lines not visible in this listing).
 *
 * NOTE(review): listing is partial; error checks, labels and returns are
 * not visible.
 */
444 kern_socketpair(int domain, int type, int protocol, int *sv)
446 struct thread *td = curthread;
447 struct proc *p = td->td_proc;
448 struct filedesc *fdp;
449 struct file *fp1, *fp2;
450 struct socket *so1, *so2;
455 error = socreate(domain, &so1, type, protocol, td);
458 error = socreate(domain, &so2, type, protocol, td);
461 error = falloc(p, &fp1, &fd);
466 fp1->f_data = (caddr_t)so1;
467 error = falloc(p, &fp2, &fd);
471 fp2->f_data = (caddr_t)so2;
473 error = soconnect2(so1, so2);
/* Datagram pairs need the reverse connection made explicitly as well. */
476 if (type == SOCK_DGRAM) {
478 * Datagram socket connection is asymmetric.
480 error = soconnect2(so2, so1);
484 fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
485 fp1->f_ops = fp2->f_ops = &socketops;
486 fp1->f_type = fp2->f_type = DTYPE_SOCKET;
/*
 * Cleanup: each descriptor-table slot is cleared only if it still refers
 * to the file allocated above. Presumably reached on the failure paths;
 * the labels are not visible here.
 */
491 if (fdp->fd_ofiles[sv[1]] == fp2) {
492 fdp->fd_ofiles[sv[1]] = NULL;
497 if (fdp->fd_ofiles[sv[0]] == fp1) {
498 fdp->fd_ofiles[sv[0]] = NULL;
510 * socketpair_args(int domain, int type, int protocol, int *rsv)
/*
 * socketpair system call: let kern_socketpair() fill the local sockv
 * array, then copy the whole array out to the user buffer uap->rsv.
 * NOTE(review): listing is partial; the declaration of sockv and the
 * error check between the two calls are not visible.
 */
513 socketpair(struct socketpair_args *uap)
517 error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv);
520 error = copyout(sockv, uap->rsv, sizeof(sockv));
/*
 * Common send path behind the send-family system calls in this file.
 *
 * Parameters (as used by the visible code):
 *   s     - socket descriptor, resolved with holdsock()
 *   mp    - message header: msg_iov/msg_iovlen describe user-space data,
 *           msg_name/msg_namelen the destination, msg_control/
 *           msg_controllen optional control data, msg_flags the
 *           MSG_COMPAT marker
 *   flags - send flags (passed on lines not visible here)
 *   res   - receives the number of bytes transferred
 *           (len - auio.uio_resid)
 *
 * Visible flow: build a UIO_USERSPACE / UIO_WRITE uio over the iovecs and
 * total their lengths, fetch the destination with getsockaddr(), fetch
 * control data into an MT_CONTROL mbuf with sockargs(), optionally
 * snapshot the iovecs for ktrace, and call the protocol's pru_sosend.
 *
 * NOTE(review): listing is partial; braces, error returns and several
 * conditions are not visible.
 */
525 sendit(int s, struct msghdr *mp, int flags, int *res)
527 struct thread *td = curthread;
528 struct proc *p = td->td_proc;
533 struct mbuf *control;
538 struct iovec *ktriov = NULL;
542 error = holdsock(p->p_fd, s, &fp);
545 auio.uio_iov = mp->msg_iov;
546 auio.uio_iovcnt = mp->msg_iovlen;
547 auio.uio_segflg = UIO_USERSPACE;
548 auio.uio_rw = UIO_WRITE;
550 auio.uio_offset = 0; /* XXX */
/* Sum the iovec lengths; a running total that goes negative is trapped. */
553 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
554 if ((auio.uio_resid += iov->iov_len) < 0) {
560 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
/*
 * Control data shorter than a cmsghdr is tested for here; with
 * COMPAT_OLDSOCK that test is skipped for MSG_COMPAT callers.
 */
568 if (mp->msg_control) {
569 if (mp->msg_controllen < sizeof(struct cmsghdr)
570 #ifdef COMPAT_OLDSOCK
571 && mp->msg_flags != MSG_COMPAT
577 error = sockargs(&control, mp->msg_control,
578 mp->msg_controllen, MT_CONTROL);
/*
 * Old-style callers supply bare access rights: prepend a cmsghdr and
 * label the data as SOL_SOCKET / SCM_RIGHTS.
 */
581 #ifdef COMPAT_OLDSOCK
582 if (mp->msg_flags == MSG_COMPAT) {
585 M_PREPEND(control, sizeof(*cm), M_WAIT);
590 cm = mtod(control, struct cmsghdr *);
591 cm->cmsg_len = control->m_len;
592 cm->cmsg_level = SOL_SOCKET;
593 cm->cmsg_type = SCM_RIGHTS;
/* When KTR_GENIO tracing is on, keep a private copy of the iovec array. */
601 if (KTRPOINT(td, KTR_GENIO)) {
602 int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
604 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
605 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
609 len = auio.uio_resid;
610 so = (struct socket *)fp->f_data;
611 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
/*
 * Partial transfer that ended in ERESTART, EINTR or EWOULDBLOCK; the
 * action taken is on a line not visible here.
 */
614 if (auio.uio_resid != len && (error == ERESTART ||
615 error == EINTR || error == EWOULDBLOCK))
621 *res = len - auio.uio_resid;
623 if (ktriov != NULL) {
625 ktruio.uio_iov = ktriov;
626 ktruio.uio_resid = *res;
627 ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
629 FREE(ktriov, M_TEMP);
640 * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen)
/*
 * sendto system call: describe the request as a msghdr whose destination
 * is uap->to / uap->tolen and whose data is the single iovec
 * uap->buf / uap->len, then call sendit() with uap->flags. The byte count
 * is returned through uap->sysmsg_result.
 * NOTE(review): listing is partial; the remaining msghdr field
 * assignments are not visible.
 */
643 sendto(struct sendto_args *uap)
648 msg.msg_name = uap->to;
649 msg.msg_namelen = uap->tolen;
653 #ifdef COMPAT_OLDSOCK
656 aiov.iov_base = uap->buf;
657 aiov.iov_len = uap->len;
658 return (sendit(uap->s, &msg, uap->flags, &uap->sysmsg_result));
661 #ifdef COMPAT_OLDSOCK
663 * osend_args(int s, caddr_t buf, int len, int flags)
/*
 * Old-style send (compiled under COMPAT_OLDSOCK): wrap uap->buf / uap->len
 * in a single iovec and call sendit() with uap->flags. The byte count is
 * returned through uap->sysmsg_result.
 * NOTE(review): listing is partial; the msghdr setup is not visible.
 */
666 osend(struct osend_args *uap)
675 aiov.iov_base = uap->buf;
676 aiov.iov_len = uap->len;
679 return (sendit(uap->s, &msg, uap->flags, &uap->sysmsg_result));
683 * osendmsg_args(int s, caddr_t msg, int flags)
/*
 * Old-style sendmsg (compiled under COMPAT_OLDSOCK).
 *
 * Visible flow: copy in only sizeof(struct omsghdr) bytes of the user
 * header, pick an iovec buffer (the on-stack aiov[UIO_SMALLIOV] array is
 * declared; a MALLOC from M_IOV is used when msg_iovlen >= UIO_SMALLIOV,
 * with a further test against UIO_MAXIOV), copy the iovec array in, mark
 * the message MSG_COMPAT and call sendit().
 *
 * NOTE(review): listing is partial; the result of the UIO_MAXIOV test,
 * the else branch and the freeing of iov are not visible.
 */
686 osendmsg(struct osendmsg_args *uap)
689 struct iovec aiov[UIO_SMALLIOV], *iov;
692 error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
695 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
696 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
698 MALLOC(iov, struct iovec *,
699 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
703 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
704 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
707 msg.msg_flags = MSG_COMPAT;
709 error = sendit(uap->s, &msg, uap->flags, &uap->sysmsg_result);
718 * sendmsg_args(int s, caddr_t msg, int flags)
/*
 * sendmsg system call.
 *
 * Visible flow: copy in the full user msghdr, pick an iovec buffer (the
 * on-stack aiov[UIO_SMALLIOV] array is declared; a MALLOC from M_IOV is
 * used when msg_iovlen >= UIO_SMALLIOV, with a further test against
 * UIO_MAXIOV), copy the iovec array in only when msg_iovlen is non-zero,
 * and call sendit().
 *
 * NOTE(review): listing is partial; the result of the UIO_MAXIOV test,
 * the COMPAT_OLDSOCK-only statement and the freeing of iov are not
 * visible.
 */
721 sendmsg(struct sendmsg_args *uap)
724 struct iovec aiov[UIO_SMALLIOV], *iov;
727 error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
730 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
731 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
733 MALLOC(iov, struct iovec *,
734 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
738 if (msg.msg_iovlen &&
739 (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
740 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
743 #ifdef COMPAT_OLDSOCK
746 error = sendit(uap->s, &msg, uap->flags, &uap->sysmsg_result);
/*
 * Common receive path behind the recv-family system calls in this file.
 *
 * Parameters (as used by the visible code):
 *   s        - socket descriptor, resolved with holdsock()
 *   mp       - message header: msg_iov/msg_iovlen describe the user-space
 *              destination, msg_name/msg_namelen where to put the source
 *              address, msg_control/msg_controllen where to put control
 *              data; msg_namelen, msg_controllen and msg_flags are
 *              updated in place
 *   namelenp - user address that receives the final address length
 *              (callers pass 0 when not wanted)
 *   res      - receives the number of bytes transferred
 *              (len - auio.uio_resid)
 *
 * Visible flow: build a UIO_USERSPACE / UIO_READ uio and total the iovec
 * lengths, optionally snapshot the iovecs for ktrace, call the protocol's
 * pru_soreceive (asking for control mbufs only when msg_control is set),
 * then copy out the source address (truncated to the smaller of the
 * caller's length and sa_len), the address length, and the control data
 * mbuf by mbuf, and finally free the source address.
 *
 * NOTE(review): listing is partial; braces, error returns and several
 * conditions are not visible.
 */
754 recvit(int s, struct msghdr *mp, caddr_t namelenp, int *res)
756 struct thread *td = curthread;
757 struct proc *p = td->td_proc;
763 struct mbuf *m, *control = 0;
766 struct sockaddr *fromsa = 0;
768 struct iovec *ktriov = NULL;
772 error = holdsock(p->p_fd, s, &fp);
775 auio.uio_iov = mp->msg_iov;
776 auio.uio_iovcnt = mp->msg_iovlen;
777 auio.uio_segflg = UIO_USERSPACE;
778 auio.uio_rw = UIO_READ;
780 auio.uio_offset = 0; /* XXX */
/* Sum the iovec lengths; a running total that goes negative is trapped. */
783 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
784 if ((auio.uio_resid += iov->iov_len) < 0) {
/* When KTR_GENIO tracing is on, keep a private copy of the iovec array. */
790 if (KTRPOINT(td, KTR_GENIO)) {
791 int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
793 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
794 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
798 len = auio.uio_resid;
799 so = (struct socket *)fp->f_data;
800 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
801 (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
/*
 * Partial transfer that ended in ERESTART, EINTR or EWOULDBLOCK; the
 * action taken is on a line not visible here.
 */
804 if (auio.uio_resid != len && (error == ERESTART ||
805 error == EINTR || error == EWOULDBLOCK))
809 if (ktriov != NULL) {
811 ktruio.uio_iov = ktriov;
812 ktruio.uio_resid = len - auio.uio_resid;
813 ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
815 FREE(ktriov, M_TEMP);
820 *res = len - auio.uio_resid;
822 len = mp->msg_namelen;
823 if (len <= 0 || fromsa == 0)
826 /* save sa_len before it is destroyed by MSG_COMPAT */
827 len = MIN(len, fromsa->sa_len);
828 #ifdef COMPAT_OLDSOCK
829 if (mp->msg_flags & MSG_COMPAT)
830 ((struct osockaddr *)fromsa)->sa_family =
833 error = copyout(fromsa,
834 (caddr_t)mp->msg_name, (unsigned)len);
838 mp->msg_namelen = len;
840 (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
841 #ifdef COMPAT_OLDSOCK
842 if (mp->msg_flags & MSG_COMPAT)
843 error = 0; /* old recvfrom didn't check */
849 if (mp->msg_control) {
850 #ifdef COMPAT_OLDSOCK
852 * We assume that old recvmsg calls won't receive access
853 * rights and other control info, esp. as control info
854 * is always optional and those options didn't exist in 4.3.
855 * If we receive rights, trim the cmsghdr; anything else
858 if (control && mp->msg_flags & MSG_COMPAT) {
859 if (mtod(control, struct cmsghdr *)->cmsg_level !=
861 mtod(control, struct cmsghdr *)->cmsg_type !=
863 mp->msg_controllen = 0;
866 control->m_len -= sizeof (struct cmsghdr);
867 control->m_data += sizeof (struct cmsghdr);
870 len = mp->msg_controllen;
872 mp->msg_controllen = 0;
873 ctlbuf = (caddr_t) mp->msg_control;
875 while (m && len > 0) {
881 mp->msg_flags |= MSG_CTRUNC;
885 if ((error = copyout((caddr_t)mtod(m, caddr_t),
886 ctlbuf, tocopy)) != 0)
893 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
898 FREE(fromsa, M_SONAME);
905 * recvfrom_args(int s, caddr_t buf, size_t len, int flags,
906 * caddr_t from, int *fromlenaddr)
/*
 * recvfrom system call.
 *
 * Visible flow: when uap->fromlenaddr is non-null, copy the caller's
 * address-buffer length in to msg.msg_namelen; point msg_name at
 * uap->from, describe the data buffer as the single iovec
 * uap->buf / uap->len, pass uap->flags through msg_flags, and call
 * recvit() with uap->fromlenaddr as the place to write the final length.
 *
 * NOTE(review): listing is partial; the else branch and the error check
 * after copyin are not visible.
 */
909 recvfrom(struct recvfrom_args *uap)
915 if (uap->fromlenaddr) {
916 error = copyin((caddr_t)uap->fromlenaddr,
917 (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
922 msg.msg_name = uap->from;
925 aiov.iov_base = uap->buf;
926 aiov.iov_len = uap->len;
928 msg.msg_flags = uap->flags;
929 return (recvit(uap->s, &msg, (caddr_t)uap->fromlenaddr, &uap->sysmsg_result));
932 #ifdef COMPAT_OLDSOCK
/*
 * Old-style recvfrom (compiled under COMPAT_OLDSOCK): set MSG_COMPAT in
 * the caller's flags (this modifies uap in place) and forward to
 * recvfrom().
 */
934 orecvfrom(struct recvfrom_args *uap)
936 uap->flags |= MSG_COMPAT;
937 return (recvfrom(uap));
941 #ifdef COMPAT_OLDSOCK
943 * struct orecv_args(int s, caddr_t buf, int len, int flags)
/*
 * Old-style recv (compiled under COMPAT_OLDSOCK): wrap uap->buf / uap->len
 * in a single iovec, pass uap->flags through msg_flags, and call recvit()
 * with a null namelenp. The byte count is returned through
 * uap->sysmsg_result.
 * NOTE(review): listing is partial; the rest of the msghdr setup is not
 * visible.
 */
946 orecv(struct orecv_args *uap)
955 aiov.iov_base = uap->buf;
956 aiov.iov_len = uap->len;
958 msg.msg_flags = uap->flags;
959 return (recvit(uap->s, &msg, (caddr_t)0, &uap->sysmsg_result));
963 * Old recvmsg. This code takes advantage of the fact that the old msghdr
964 * overlays the new one, missing only the flags, and with the (old) access
965 * rights where the control fields are now.
967 * orecvmsg_args(int s, struct omsghdr *msg, int flags)
/*
 * Old-style recvmsg (compiled under COMPAT_OLDSOCK).
 *
 * Visible flow: copy in only sizeof(struct omsghdr) bytes of the user
 * header, pick an iovec buffer (on-stack aiov[UIO_SMALLIOV] is declared;
 * a MALLOC from M_IOV is used when msg_iovlen >= UIO_SMALLIOV, with a
 * further test against UIO_MAXIOV), set msg_flags to uap->flags with
 * MSG_COMPAT added, copy the iovec array in, and call recvit() with the
 * user header's msg_namelen field as the place to write the address
 * length. On success with a non-zero msg_controllen, that length is
 * copied out into the user header's msg_accrightslen field.
 *
 * NOTE(review): listing is partial; the result of the UIO_MAXIOV test,
 * the else branch and the freeing of iov are not visible.
 */
970 orecvmsg(struct orecvmsg_args *uap)
973 struct iovec aiov[UIO_SMALLIOV], *iov;
976 error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
977 sizeof (struct omsghdr));
980 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
981 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
983 MALLOC(iov, struct iovec *,
984 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
988 msg.msg_flags = uap->flags | MSG_COMPAT;
989 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
990 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
994 error = recvit(uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, &uap->sysmsg_result);
996 if (msg.msg_controllen && error == 0)
997 error = copyout((caddr_t)&msg.msg_controllen,
998 (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1007 * recvmsg_args(int s, struct msghdr *msg, int flags)
/*
 * recvmsg system call.
 *
 * Visible flow: copy in the full user msghdr, pick an iovec buffer
 * (on-stack aiov[UIO_SMALLIOV] is declared; a MALLOC from M_IOV is used
 * when msg_iovlen >= UIO_SMALLIOV, with a further test against
 * UIO_MAXIOV), take msg_flags from uap->flags (with MSG_COMPAT masked off
 * when COMPAT_OLDSOCK is defined), copy the iovec array in from uiov,
 * call recvit() with a null namelenp, and copy the updated msghdr back
 * out to uap->msg.
 *
 * NOTE(review): listing is partial; the assignment of uiov, the result of
 * the UIO_MAXIOV test, the #else/#endif lines and the freeing of iov are
 * not visible.
 */
1010 recvmsg(struct recvmsg_args *uap)
1013 struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1016 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1019 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1020 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
1022 MALLOC(iov, struct iovec *,
1023 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1027 #ifdef COMPAT_OLDSOCK
1028 msg.msg_flags = uap->flags &~ MSG_COMPAT;
1030 msg.msg_flags = uap->flags;
1034 error = copyin((caddr_t)uiov, (caddr_t)iov,
1035 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1038 error = recvit(uap->s, &msg, (caddr_t)0, &uap->sysmsg_result);
1041 error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1050 * shutdown_args(int s, int how)
/*
 * shutdown system call: resolve uap->s with holdsock() and call
 * soshutdown() with uap->how on the socket stored in fp->f_data.
 * NOTE(review): listing is partial; error handling is not visible.
 */
1054 shutdown(struct shutdown_args *uap)
1056 struct thread *td = curthread;
1057 struct proc *p = td->td_proc;
1062 error = holdsock(p->p_fd, uap->s, &fp);
1065 error = soshutdown((struct socket *)fp->f_data, uap->how);
1071 * setsockopt_args(int s, int level, int name, caddr_t val, int valsize)
/*
 * setsockopt system call.
 *
 * Visible flow: test for a null value pointer paired with a non-zero size
 * and for a negative size, resolve uap->s with holdsock(), fill a
 * struct sockopt (direction SOPT_SET, level, name, user value pointer and
 * size) and call sosetopt() on the socket.
 *
 * NOTE(review): listing is partial; the results of the two argument tests
 * and any remaining sockopt fields are not visible.
 */
1075 setsockopt(struct setsockopt_args *uap)
1077 struct thread *td = curthread;
1078 struct proc *p = td->td_proc;
1080 struct sockopt sopt;
1083 if (uap->val == 0 && uap->valsize != 0)
1085 if (uap->valsize < 0)
1088 error = holdsock(p->p_fd, uap->s, &fp);
1092 sopt.sopt_dir = SOPT_SET;
1093 sopt.sopt_level = uap->level;
1094 sopt.sopt_name = uap->name;
1095 sopt.sopt_val = uap->val;
1096 sopt.sopt_valsize = uap->valsize;
1098 error = sosetopt((struct socket *)fp->f_data, &sopt);
1104 * getsockopt_args(int s, int level, int name, caddr_t val, int *avalsize)
/*
 * getsockopt system call.
 *
 * Visible flow: resolve uap->s with holdsock(), copy the caller's buffer
 * size in from uap->avalsize, fill a struct sockopt (direction SOPT_GET,
 * level, name, user value pointer and size), call sogetopt(), then copy
 * the size reported back in sopt_valsize out to uap->avalsize.
 *
 * NOTE(review): listing is partial; the non-negative check that the
 * existing comment on the sopt_valsize line refers to is not visible
 * here.
 */
1108 getsockopt(struct getsockopt_args *uap)
1110 struct thread *td = curthread;
1111 struct proc *p = td->td_proc;
1114 struct sockopt sopt;
1116 error = holdsock(p->p_fd, uap->s, &fp);
1120 error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1134 sopt.sopt_dir = SOPT_GET;
1135 sopt.sopt_level = uap->level;
1136 sopt.sopt_name = uap->name;
1137 sopt.sopt_val = uap->val;
1138 sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1141 error = sogetopt((struct socket *)fp->f_data, &sopt);
1143 valsize = sopt.sopt_valsize;
1144 error = copyout((caddr_t)&valsize,
1145 (caddr_t)uap->avalsize, sizeof (valsize));
1152 * The second argument to kern_getsockname() is a handle to a struct sockaddr.
1153 * This allows kern_getsockname() to return a pointer to an allocated struct
1154 * sockaddr which must be freed later with FREE(). The caller must
1155 * initialize *name to NULL.
/*
 * Kernel-side getsockname: resolve descriptor s with holdsock(), ask the
 * protocol for the local address through pru_sockaddr, and clamp *namelen
 * to the smaller of its input value and sa->sa_len.
 * NOTE(review): listing is partial; the store into *name and the error
 * handling are not visible.
 */
1158 kern_getsockname(int s, struct sockaddr **name, int *namelen)
1160 struct thread *td = curthread;
1161 struct proc *p = td->td_proc;
1164 struct sockaddr *sa = NULL;
1167 error = holdsock(p->p_fd, s, &fp);
1174 so = (struct socket *)fp->f_data;
1175 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1180 *namelen = MIN(*namelen, sa->sa_len);
1190 * getsockname_args(int fdes, caddr_t asa, int *alen)
/*
 * getsockname system call: copy the caller's buffer length in from
 * uap->alen, call kern_getsockname(), then copy the address out to
 * uap->asa and the resulting length back to uap->alen.
 * NOTE(review): listing is partial; error checks and the release of sa
 * are not visible.
 */
1195 getsockname(struct getsockname_args *uap)
1197 struct sockaddr *sa = NULL;
1200 error = copyin(uap->alen, &sa_len, sizeof(sa_len));
1204 error = kern_getsockname(uap->fdes, &sa, &sa_len);
1207 error = copyout(sa, uap->asa, sa_len);
1209 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
1216 * The second argument to kern_getpeername() is a handle to a struct sockaddr.
1217 * This allows kern_getpeername() to return a pointer to an allocated struct
1218 * sockaddr which must be freed later with FREE(). The caller must
1219 * initialize *name to NULL.
/*
 * Kernel-side getpeername: resolve descriptor s with holdsock(), test
 * that the socket is in SS_ISCONNECTED or SS_ISCONFIRMING state, ask the
 * protocol for the peer address through pru_peeraddr, and clamp *namelen
 * to the smaller of its input value and sa->sa_len.
 * NOTE(review): listing is partial; the result of the state test, the
 * store into *name and the error handling are not visible.
 */
1222 kern_getpeername(int s, struct sockaddr **name, int *namelen)
1224 struct thread *td = curthread;
1225 struct proc *p = td->td_proc;
1228 struct sockaddr *sa = NULL;
1231 error = holdsock(p->p_fd, s, &fp);
1238 so = (struct socket *)fp->f_data;
1239 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1243 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1248 *namelen = MIN(*namelen, sa->sa_len);
1258 * getpeername_args(int fdes, caddr_t asa, int *alen)
1260 * Get name of peer for connected socket.
/*
 * getpeername system call: copy the caller's buffer length in from
 * uap->alen, call kern_getpeername(), then copy the address out to
 * uap->asa and the resulting length back to uap->alen.
 * NOTE(review): listing is partial; error checks and the release of sa
 * are not visible.
 */
1263 getpeername(struct getpeername_args *uap)
1265 struct sockaddr *sa = NULL;
1268 error = copyin(uap->alen, &sa_len, sizeof(sa_len));
1272 error = kern_getpeername(uap->fdes, &sa, &sa_len);
1275 error = copyout(sa, uap->asa, sa_len);
1277 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
/*
 * Copy buflen bytes of user data at buf into a newly allocated mbuf of
 * the given type (old-style K&R definition; mp presumably receives the
 * mbuf on a line not visible here).
 *
 * Visible flow: a length above MLEN is tested for; under COMPAT_OLDSOCK
 * an MT_SONAME argument of up to 112 bytes is instead clamped to MLEN.
 * The mbuf comes from m_get(M_WAIT, type) and is filled with copyin().
 * For MT_SONAME the data is treated as a sockaddr: on non-big-endian
 * COMPAT_OLDSOCK builds a zero sa_family with a small sa_len is taken to
 * be an old-format address and sa_len is moved into sa_family; sa_len is
 * then set to buflen.
 *
 * NOTE(review): listing is partial; parameter declarations, error
 * returns and the m_len assignment are not visible.
 */
1284 sockargs(mp, buf, buflen, type)
1289 struct sockaddr *sa;
1293 if ((u_int)buflen > MLEN) {
1294 #ifdef COMPAT_OLDSOCK
1295 if (type == MT_SONAME && (u_int)buflen <= 112)
1296 buflen = MLEN; /* unix domain compat. hack */
1301 m = m_get(M_WAIT, type);
1305 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1310 if (type == MT_SONAME) {
1311 sa = mtod(m, struct sockaddr *);
1313 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1314 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1315 sa->sa_family = sa->sa_len;
1317 sa->sa_len = buflen;
/*
 * Copy a user-space socket address of len bytes at uaddr into a freshly
 * MALLOC'd (M_SONAME, M_WAITOK) kernel sockaddr.
 *
 * Returns ENAMETOOLONG when len exceeds SOCK_MAXADDRLEN. A len too short
 * to reach sa_data[0] is also tested for. On non-big-endian
 * COMPAT_OLDSOCK builds a zero sa_family with a small sa_len is taken to
 * be an old-format address and sa_len is moved into sa_family.
 *
 * NOTE(review): listing is partial; the result of the short-length test,
 * the copyin error path and the store into *namp are not visible.
 */
1324 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len)
1326 struct sockaddr *sa;
1330 if (len > SOCK_MAXADDRLEN)
1331 return ENAMETOOLONG;
1332 if (len < offsetof(struct sockaddr, sa_data[0]))
1334 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1335 error = copyin(uaddr, sa, len);
1339 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1340 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1341 sa->sa_family = sa->sa_len;
1350 * holdsock() - load the struct file pointer associated
1351 * with a socket into *fpp. If an error occurs, non-zero
1352 * will be returned and *fpp will be set to NULL.
/*
 * Look up descriptor fdes in the table fdp and check that it names a
 * socket (old-style K&R definition).
 *
 * The unsigned comparison against fd_nfiles rejects both negative and
 * out-of-range descriptors in one test; an empty slot and a file whose
 * f_type is not DTYPE_SOCKET are the other two failure cases.
 *
 * NOTE(review): listing is partial; the error codes, the reference taken
 * on success and the store into *fpp are not visible.
 */
1355 holdsock(fdp, fdes, fpp)
1356 struct filedesc *fdp;
1360 struct file *fp = NULL;
1363 if ((unsigned)fdes >= fdp->fd_nfiles ||
1364 (fp = fdp->fd_ofiles[fdes]) == NULL) {
1366 } else if (fp->f_type != DTYPE_SOCKET) {
1377 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
/*
 * Boot-time setup of the sf_buf pool (registered through SYSINIT near the
 * top of the file; arg is unused by the visible code).
 *
 * Reserves nsfbufs pages of pageable kernel virtual address space
 * starting at sf_base, allocates and zeroes an array of nsfbufs sf_buf
 * descriptors, assigns each descriptor its own page-sized slice of that
 * address range, and pushes every descriptor onto sf_freelist.
 *
 * NOTE(review): the array is allocated with M_NOWAIT and the result is
 * passed straight to bzero(); no NULL check is visible in this listing.
 * Confirm whether an allocation failure here is handled or considered
 * impossible at this point in boot.
 */
1380 sf_buf_init(void *arg)
1384 SLIST_INIT(&sf_freelist);
1385 sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1386 sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1387 bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1388 for (i = 0; i < nsfbufs; i++) {
1389 sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1390 SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1395 * Get an sf_buf from the freelist. Will block if none are available.
/*
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is the body of sf_buf_alloc(), which do_sendfile()
 * calls.
 *
 * While the free list is empty, record that someone is waiting
 * (sf_buf_alloc_want = 1) and sleep interruptibly (PCATCH) on
 * &sf_freelist; sf_buf_free() issues the matching wakeup(). Once an
 * entry is available it is unlinked from the head of the list.
 */
1405 while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1406 sf_buf_alloc_want = 1;
1407 error = tsleep(&sf_freelist, PCATCH, "sfbufa", 0);
1412 SLIST_REMOVE_HEAD(&sf_freelist, free_list);
/*
 * Map a kernel virtual address inside the sf_buf window to its sf_buf
 * descriptor: the byte offset from sf_base, shifted down by PAGE_SHIFT,
 * is the index into the sf_bufs array.
 */
1419 #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
/*
 * External-storage reference callback for sendfile mbufs (installed as
 * m_ext.ext_ref by do_sendfile()). Panics if the sf_buf's reference
 * count is already zero, i.e. the buffer is free.
 * NOTE(review): listing is partial; the lookup of sf from addr and the
 * reference-count increment are not visible. size is not used by the
 * visible code.
 */
1421 sf_buf_ref(caddr_t addr, u_int size)
1426 if (sf->refcnt == 0)
1427 panic("sf_buf_ref: referencing a free sf_buf");
1432 * Lose a reference to an sf_buf. When none left, detach mapped page
1433 * and release resources back to the system.
1435 * Must be called at splimp.
/*
 * External-storage free callback for sendfile mbufs (installed as
 * m_ext.ext_free by do_sendfile(), which also calls it directly on its
 * mbuf-allocation failure path).
 *
 * Panics if the sf_buf's reference count is already zero. When the count
 * is zero at the second test (presumably after a decrement on a line not
 * visible here), the page mapping at addr is removed with pmap_qremove(),
 * the page is unwired, the sf_buf goes back on sf_freelist, and any
 * sleeper flagged through sf_buf_alloc_want is woken.
 *
 * NOTE(review): listing is partial; the lookup of sf and m and the action
 * taken for an orphaned page are not visible. size is not used by the
 * visible code.
 */
1438 sf_buf_free(caddr_t addr, u_int size)
1445 if (sf->refcnt == 0)
1446 panic("sf_buf_free: freeing free sf_buf");
1448 if (sf->refcnt == 0) {
1449 pmap_qremove((vm_offset_t)addr, 1);
1452 vm_page_unwire(m, 0);
1454 * Check for the object going away on us. This can
1455 * happen since we don't hold a reference to it.
1456 * If so, we're responsible for freeing the page.
1458 if (m->wire_count == 0 && m->object == NULL)
1462 SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1463 if (sf_buf_alloc_want) {
1464 sf_buf_alloc_want = 0;
1465 wakeup(&sf_freelist);
1472 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1473 * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1475 * Send a file specified by 'fd' and starting at 'offset' to a socket
1476 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1477 * nbytes == 0. Optionally add a header and/or trailer to the socket
1478 * output. If specified, write the total number of bytes sent into *sbytes.
/*
 * sendfile system call: forwards to do_sendfile() with compat == 0.
 */
1481 sendfile(struct sendfile_args *uap)
1483 return (do_sendfile(uap, 0));
/*
 * Old sendfile entry point: copy the caller's arguments into a
 * struct sendfile_args and forward to do_sendfile() with compat == 1.
 * NOTE(review): listing is partial; only the offset, nbytes, hdtr, sbytes
 * and flags copies are visible (the fd and s copies, and how the result
 * is propagated back to uap, are not).
 */
1488 osendfile(struct osendfile_args *uap)
1490 struct sendfile_args args;
1494 args.offset = uap->offset;
1495 args.nbytes = uap->nbytes;
1496 args.hdtr = uap->hdtr;
1497 args.sbytes = uap->sbytes;
1498 args.flags = uap->flags;
1500 return (do_sendfile(&args, 1));
/*
 * Shared implementation behind sendfile() (compat == 0) and osendfile()
 * (compat == 1).
 *
 * Visible flow:
 *   1. Validate arguments: uap->fd must be a DTYPE_VNODE file open for
 *      reading whose vnode is VREG and has a VM object; uap->s must be a
 *      connected SOCK_STREAM socket; uap->offset is tested for being
 *      negative.
 *   2. If uap->hdtr is set, copy it in and send any headers via writev().
 *   3. Take the send-buffer lock and loop over the file one page (or
 *      less) at a time: look up or allocate the page, read it from the
 *      vnode if not valid, map it through an sf_buf, wrap it in an mbuf
 *      with external storage, wait for socket space, and hand it to the
 *      protocol with pru_send.
 *   4. Send any trailers via writev().
 *   5. If uap->sbytes is set, copy the byte total out to it.
 *
 * NOTE(review): this listing is partial (braces, labels, returns, many
 * error assignments and the uses of compat are not visible); only visible
 * statements are described. Two things to confirm against the full
 * source: only sbytes has an initializer on the off_t declaration line
 * while hdtr_size is later updated with "+=", and the return value of the
 * final copyout() is ignored.
 */
1505 do_sendfile(struct sendfile_args *uap, int compat)
1507 struct thread *td = curthread;
1508 struct proc *p = td->td_proc;
1510 struct filedesc *fdp;
1512 struct vm_object *obj;
1517 struct writev_args nuap;
1518 struct sf_hdtr hdtr;
1519 off_t off, xfsize, hdtr_size, sbytes = 0;
1528 * Do argument checking. Must be a regular file in, stream
1529 * type and connected socket out, positive offset.
1531 fp = holdfp(fdp, uap->fd, FREAD);
1536 if (fp->f_type != DTYPE_VNODE) {
1540 vp = (struct vnode *)fp->f_data;
1542 if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
/* fp is reused from here on for the socket side of the transfer. */
1547 error = holdsock(p->p_fd, uap->s, &fp);
1550 so = (struct socket *)fp->f_data;
1551 if (so->so_type != SOCK_STREAM) {
1555 if ((so->so_state & SS_ISCONNECTED) == 0) {
1559 if (uap->offset < 0) {
1565 * If specified, get the pointer to the sf_hdtr struct for
1566 * any headers/trailers.
1568 if (uap->hdtr != NULL) {
1569 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1573 * Send any headers. Wimp out and use writev(2).
1575 if (hdtr.headers != NULL) {
1577 nuap.iovp = hdtr.headers;
1578 nuap.iovcnt = hdtr.hdr_cnt;
1579 error = writev(&nuap);
/*
 * The header byte count is added to either sbytes or hdtr_size; the
 * condition choosing between the two is not visible here.
 */
1583 sbytes += nuap.sysmsg_result;
1585 hdtr_size += nuap.sysmsg_result;
1590 * Protect against multiple writers to the socket.
1592 (void) sblock(&so->so_snd, M_WAITOK);
1595 * Loop through the pages in the file, starting with the requested
1596 * offset. Get a file page (do I/O if necessary), map the file page
1597 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1600 for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1604 pindex = OFF_TO_IDX(off);
1607 * Calculate the amount to transfer. Not to exceed a page,
1608 * the EOF, or the passed in nbytes.
1610 xfsize = obj->un_pager.vnp.vnp_size - off;
1611 if (xfsize > PAGE_SIZE)
1613 pgoff = (vm_offset_t)(off & PAGE_MASK);
1614 if (PAGE_SIZE - pgoff < xfsize)
1615 xfsize = PAGE_SIZE - pgoff;
1616 if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1617 xfsize = uap->nbytes - sbytes;
1621 * Optimize the non-blocking case by looking at the socket space
1622 * before going to the extra work of constituting the sf_buf.
1624 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1625 if (so->so_state & SS_CANTSENDMORE)
1629 sbunlock(&so->so_snd);
1633 * Attempt to look up the page.
1635 * Allocate if not found
1637 * Wait and loop if busy.
1639 pg = vm_page_lookup(obj, pindex);
1642 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1648 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1653 * Wire the page so it does not get ripped out from under
1660 * If page is not valid for what we need, initiate I/O
1663 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1669 * Ensure that our page is still around when the I/O
1672 vm_page_io_start(pg);
1675 * Get the page from backing store.
/*
 * The read is issued with UIO_NOCOPY and IO_VMIO, starting at the page
 * boundary below off, with MAXBSIZE as both iov_len and uio_resid.
 */
1677 bsize = vp->v_mount->mnt_stat.f_iosize;
1678 auio.uio_iov = &aiov;
1679 auio.uio_iovcnt = 1;
1681 aiov.iov_len = MAXBSIZE;
1682 auio.uio_resid = MAXBSIZE;
1683 auio.uio_offset = trunc_page(off);
1684 auio.uio_segflg = UIO_NOCOPY;
1685 auio.uio_rw = UIO_READ;
1687 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1688 error = VOP_READ(vp, &auio,
1689 IO_VMIO | ((MAXBSIZE / bsize) << 16),
1691 VOP_UNLOCK(vp, 0, td);
1692 vm_page_flag_clear(pg, PG_ZERO);
1693 vm_page_io_finish(pg);
1695 vm_page_unwire(pg, 0);
1697 * See if anyone else might know about this page.
1698 * If not and it is not valid, then free it.
1700 if (pg->wire_count == 0 && pg->valid == 0 &&
1701 pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1702 pg->hold_count == 0) {
1706 sbunlock(&so->so_snd);
1713 * Get a sendfile buf. We usually wait as long as necessary,
1714 * but this wait can be interrupted.
1716 if ((sf = sf_buf_alloc()) == NULL) {
1718 vm_page_unwire(pg, 0);
1719 if (pg->wire_count == 0 && pg->object == NULL)
1722 sbunlock(&so->so_snd);
1729 * Allocate a kernel virtual page and insert the physical page
1734 pmap_qenter(sf->kva, &pg, 1);
1736 * Get an mbuf header and set it up as having external storage.
1738 MGETHDR(m, M_WAIT, MT_DATA);
1741 sf_buf_free((void *)sf->kva, PAGE_SIZE);
1742 sbunlock(&so->so_snd);
/*
 * The mbuf borrows the sf_buf mapping as external storage; sf_buf_ref()
 * and sf_buf_free() become its reference and free callbacks.
 */
1745 m->m_ext.ext_free = sf_buf_free;
1746 m->m_ext.ext_ref = sf_buf_ref;
1747 m->m_ext.ext_buf = (void *)sf->kva;
1748 m->m_ext.ext_size = PAGE_SIZE;
1749 m->m_data = (char *) sf->kva + pgoff;
1750 m->m_flags |= M_EXT;
1751 m->m_pkthdr.len = m->m_len = xfsize;
1753 * Add the buffer to the socket buffer chain.
1758 * Make sure that the socket is still able to take more data.
1759 * CANTSENDMORE being true usually means that the connection
1760 * was closed. so_error is true when an error was sensed after
1762 * The state is checked after the page mapping and buffer
1763 * allocation above since those operations may block and make
1764 * any socket checks stale. From this point forward, nothing
1765 * blocks before the pru_send (or more accurately, any blocking
1766 * results in a loop back to here to re-check).
1768 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1769 if (so->so_state & SS_CANTSENDMORE) {
1772 error = so->so_error;
1776 sbunlock(&so->so_snd);
1781 * Wait for socket space to become available. We do this just
1782 * after checking the connection state above in order to avoid
1783 * a race condition with sbwait().
1785 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1786 if (so->so_state & SS_NBIO) {
1788 sbunlock(&so->so_snd);
1793 error = sbwait(&so->so_snd);
1795 * An error from sbwait usually indicates that we've
1796 * been interrupted by a signal. If we've sent anything
1797 * then return bytes sent, otherwise return the error.
1801 sbunlock(&so->so_snd);
1808 (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1811 sbunlock(&so->so_snd);
1815 sbunlock(&so->so_snd);
1818 * Send trailers. Wimp out and use writev(2).
1820 if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1822 nuap.iovp = hdtr.trailers;
1823 nuap.iovcnt = hdtr.trl_cnt;
1824 error = writev(&nuap);
1828 sbytes += nuap.sysmsg_result;
1830 hdtr_size += nuap.sysmsg_result;
1834 if (uap->sbytes != NULL) {
1836 sbytes += hdtr_size;
1837 copyout(&sbytes, uap->sbytes, sizeof(off_t));