2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.8 2003/07/26 18:12:44 dillon Exp $
43 #include "opt_ktrace.h"
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
56 #include <sys/kernel.h>
57 #include <sys/malloc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
64 #include <sys/ktrace.h>
67 #include <vm/vm_page.h>
68 #include <sys/file2.h>
70 #include <machine/limits.h>
/*
 * Private malloc types for this file's transient allocations:
 * ioctl argument buffers, select() bit buffers, and large iovec arrays
 * (M_IOV is non-static; presumably shared with other kern files — verify).
 */
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
/*
 * Forward declarations for the file-local helpers (__P() is the old
 * K&R/ANSI prototype compatibility macro).
 * NOTE(review): the selscan prototype continuation line is elided in
 * this excerpt; comments only added, code untouched.
 */
76 static int pollscan __P((struct proc *, struct pollfd *, u_int, int *));
77 static int selscan __P((struct proc *, fd_mask **, fd_mask **,
79 static int dofileread __P((struct file *, int, void *,
80 size_t, off_t, int, int *));
81 static int dofilewrite __P((struct file *, int,
82 const void *, size_t, off_t, int, int *));
/*
 * Fragment of holdfp(): validate a file descriptor against the process
 * file table.  Rejects fd values outside the table, closed slots, and
 * files not opened with the required access flag (FREAD/FWRITE).
 * NOTE(review): the function signature and the success/failure paths
 * are elided in this excerpt — the cast to u_int makes negative fds
 * fail the range check in one comparison.
 */
91 if (((u_int)fd) >= fdp->fd_nfiles ||
92 (fp = fdp->fd_ofiles[fd]) == NULL ||
93 (fp->f_flag & flag) == 0) {
/*
 * read() system call: transfer bytes from the object behind uap->fd
 * into uap->buf.  Validates the descriptor for FREAD via holdfp() and
 * delegates to dofileread() with offset -1 and flags 0, i.e. use (and
 * advance) the file's current offset.  The byte count is returned to
 * userland through uap->lmsg.u.ms_result.
 * NOTE(review): declarations/returns elided in this excerpt.
 */
104 read(struct read_args *uap)
/* DragonFly syscalls derive the proc from the current thread. */
106 struct thread *td = curthread;
107 struct proc *p = td->td_proc;
/* holdfp() returns NULL when fd is invalid or not open for reading. */
112 if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
114 error = dofileread(fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0,
115 &uap->lmsg.u.ms_result);
/*
 * pread() system call: positional read.  Identical to read() except the
 * explicit uap->offset is used (FOF_OFFSET) and the file's seek pointer
 * is left alone.  Restricted to vnode-backed files: the DTYPE_VNODE
 * check rejects sockets/pipes/etc., which have no notion of an offset.
 * NOTE(review): the error value returned for non-vnodes is elided here.
 */
124 pread(struct pread_args *uap)
126 struct thread *td = curthread;
127 struct proc *p = td->td_proc;
132 if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
/* Positional I/O only makes sense on seekable (vnode) files. */
134 if (fp->f_type != DTYPE_VNODE) {
137 error = dofileread(fp, uap->fd, uap->buf, uap->nbyte,
138 uap->offset, FOF_OFFSET, &uap->lmsg.u.ms_result);
145 * Code common for read and pread
/*
 * Build a single-iovec userspace uio describing (buf, nbyte) and hand
 * it to the file's fo_read op.  `offset`/`flags` distinguish the
 * read (offset -1, flags 0) and pread (FOF_OFFSET) callers.  On
 * success *res receives the number of bytes actually transferred.
 * NOTE(review): K&R-style definition; the parameter declaration lines
 * and several statements are elided in this excerpt.
 */
148 dofileread(fp, fd, buf, nbyte, offset, flags, res)
156 struct thread *td = curthread;
157 struct proc *p = td->td_proc;
167 aiov.iov_base = (caddr_t)buf;
168 aiov.iov_len = nbyte;
169 auio.uio_iov = &aiov;
171 auio.uio_offset = offset;
174 auio.uio_resid = nbyte;
175 auio.uio_rw = UIO_READ;
/* The buffer lives in the caller's address space. */
176 auio.uio_segflg = UIO_USERSPACE;
180 * if tracing, save a copy of iovec
/* Snapshot the iovec before fo_read mutates it, for ktrace replay. */
182 if (KTRPOINT(td, KTR_GENIO)) {
/*
 * Partial-transfer convention: if some bytes moved before an
 * interruption (ERESTART/EINTR/EWOULDBLOCK), report the short read
 * as success rather than the error.
 */
190 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
191 if (auio.uio_resid != cnt && (error == ERESTART ||
192 error == EINTR || error == EWOULDBLOCK))
/* Bytes transferred = requested minus what fo_read left over. */
195 cnt -= auio.uio_resid;
/* Log the completed transfer to the ktrace file, if enabled. */
197 if (didktr && error == 0) {
198 ktruio.uio_iov = &ktriov;
199 ktruio.uio_resid = cnt;
200 ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
208 * Scatter read system call.
/*
 * readv(): read into multiple user buffers in one call.  Copies the
 * iovec array in from userland (heap-allocating via M_IOV when it
 * exceeds the UIO_SMALLIOV on-stack array), validates the total length
 * against INT_MAX, then performs a single fo_read.
 * NOTE(review): several lines (declarations, error labels, braces) are
 * elided in this excerpt; comments only added, code untouched.
 */
211 readv(struct readv_args *uap)
213 struct thread *td = curthread;
214 struct proc *p = td->td_proc;
216 struct filedesc *fdp = p->p_fd;
219 struct iovec *needfree;
220 struct iovec aiov[UIO_SMALLIOV];
221 long i, cnt, error = 0;
224 struct iovec *ktriov = NULL;
228 if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
230 /* note: can't use iovlen until iovcnt is validated */
231 iovlen = uap->iovcnt * sizeof (struct iovec);
/* Large vectors go on the heap; > UIO_MAXIOV is rejected outright. */
232 if (uap->iovcnt > UIO_SMALLIOV) {
233 if (uap->iovcnt > UIO_MAXIOV)
235 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
242 auio.uio_iovcnt = uap->iovcnt;
243 auio.uio_rw = UIO_READ;
244 auio.uio_segflg = UIO_USERSPACE;
/* -1: use the file's current offset, as with plain read(). */
246 auio.uio_offset = -1;
247 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
/*
 * Sum the segment lengths, guarding against the total residual
 * overflowing a signed int (uio_resid is int-sized here).
 */
250 for (i = 0; i < uap->iovcnt; i++) {
251 if (iov->iov_len > INT_MAX - auio.uio_resid) {
255 auio.uio_resid += iov->iov_len;
260 * if tracing, save a copy of iovec
262 if (KTRPOINT(td, KTR_GENIO)) {
263 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
264 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
268 cnt = auio.uio_resid;
/* Same partial-transfer masking convention as dofileread(). */
269 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
270 if (auio.uio_resid != cnt && (error == ERESTART ||
271 error == EINTR || error == EWOULDBLOCK))
274 cnt -= auio.uio_resid;
276 if (ktriov != NULL) {
278 ktruio.uio_iov = ktriov;
279 ktruio.uio_resid = cnt;
280 ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
283 FREE(ktriov, M_TEMP);
/* Return the transferred byte count via the syscall message. */
286 uap->lmsg.u.ms_result = cnt;
290 FREE(needfree, M_IOV);
/*
 * write() system call: mirror of read() for output.  Validates the
 * descriptor for FWRITE and delegates to dofilewrite() with offset -1
 * and flags 0 (use and advance the current file offset).
 * NOTE(review): declarations/returns elided in this excerpt.
 */
298 write(struct write_args *uap)
300 struct thread *td = curthread;
301 struct proc *p = td->td_proc;
307 if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
309 error = dofilewrite(fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0,
310 &uap->lmsg.u.ms_result);
/*
 * pwrite() system call: positional write at uap->offset (FOF_OFFSET),
 * leaving the file's seek pointer untouched.  Like pread(), restricted
 * to DTYPE_VNODE files since only those are seekable.
 * NOTE(review): the non-vnode error path is elided in this excerpt.
 */
319 pwrite(struct pwrite_args *uap)
321 struct thread *td = curthread;
322 struct proc *p = td->td_proc;
327 if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
329 if (fp->f_type != DTYPE_VNODE) {
332 error = dofilewrite(fp, uap->fd, uap->buf, uap->nbyte,
333 uap->offset, FOF_OFFSET, &uap->lmsg.u.ms_result);
/*
 * Body of dofilewrite(), the common helper for write() and pwrite():
 * build a single-iovec userspace uio and call fo_write.
 * NOTE(review): the function signature line is elided in this excerpt
 * (see the dofilewrite prototype near the top of the file); comments
 * only added, code untouched.
 */
349 struct thread *td = curthread;
350 struct proc *p = td->td_proc;
/* Round-trip through uintptr_t to drop const without a direct cast. */
360 aiov.iov_base = (void *)(uintptr_t)buf;
361 aiov.iov_len = nbyte;
362 auio.uio_iov = &aiov;
364 auio.uio_offset = offset;
367 auio.uio_resid = nbyte;
368 auio.uio_rw = UIO_WRITE;
369 auio.uio_segflg = UIO_USERSPACE;
373 * if tracing, save a copy of iovec and uio
375 if (KTRPOINT(td, KTR_GENIO)) {
/* Vnode-specific pre-write step; its body is elided in this excerpt. */
382 if (fp->f_type == DTYPE_VNODE)
/*
 * Same partial-transfer convention as the read path: a short write
 * interrupted by ERESTART/EINTR/EWOULDBLOCK reports the bytes moved
 * instead of the error.
 */
384 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
385 if (auio.uio_resid != cnt && (error == ERESTART ||
386 error == EINTR || error == EWOULDBLOCK))
391 cnt -= auio.uio_resid;
/* ktrace the completed transfer, as in dofileread(). */
393 if (didktr && error == 0) {
394 ktruio.uio_iov = &ktriov;
395 ktruio.uio_resid = cnt;
396 ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
404 * Gather write system call
/*
 * writev(): write from multiple user buffers in one call.  Structure
 * mirrors readv(): copy in and validate the iovec array, sum segment
 * lengths with an INT_MAX overflow guard, single fo_write, ktrace,
 * report the transferred count.
 * NOTE(review): several lines (fdp assignment, error labels, braces)
 * are elided in this excerpt; comments only added, code untouched.
 */
407 writev(struct writev_args *uap)
409 struct thread *td = curthread;
410 struct proc *p = td->td_proc;
412 struct filedesc *fdp;
415 struct iovec *needfree;
416 struct iovec aiov[UIO_SMALLIOV];
417 long i, cnt, error = 0;
420 struct iovec *ktriov = NULL;
427 if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
429 /* note: can't use iovlen until iovcnt is validated */
430 iovlen = uap->iovcnt * sizeof (struct iovec);
/* Heap-allocate large vectors; reject counts above UIO_MAXIOV. */
431 if (uap->iovcnt > UIO_SMALLIOV) {
432 if (uap->iovcnt > UIO_MAXIOV) {
437 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
444 auio.uio_iovcnt = uap->iovcnt;
445 auio.uio_rw = UIO_WRITE;
446 auio.uio_segflg = UIO_USERSPACE;
448 auio.uio_offset = -1;
449 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
/* Guard the summed residual against signed-int overflow. */
452 for (i = 0; i < uap->iovcnt; i++) {
453 if (iov->iov_len > INT_MAX - auio.uio_resid) {
457 auio.uio_resid += iov->iov_len;
462 * if tracing, save a copy of iovec and uio
464 if (KTRPOINT(td, KTR_GENIO)) {
465 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
466 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
470 cnt = auio.uio_resid;
/* Vnode-specific pre-write step; its body is elided in this excerpt. */
471 if (fp->f_type == DTYPE_VNODE)
/* Partial-transfer masking, as in the other transfer paths. */
473 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
474 if (auio.uio_resid != cnt && (error == ERESTART ||
475 error == EINTR || error == EWOULDBLOCK))
480 cnt -= auio.uio_resid;
482 if (ktriov != NULL) {
484 ktruio.uio_iov = ktriov;
485 ktruio.uio_resid = cnt;
486 ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
489 FREE(ktriov, M_TEMP);
492 uap->lmsg.u.ms_result = cnt;
496 FREE(needfree, M_IOV);
/*
 * ioctl() system call.  Decodes the command word to find the argument
 * size and direction (IOC_IN/IOC_OUT/IOC_VOID), marshals the argument
 * between userland and a kernel buffer (on-stack up to STK_PARAMS
 * bytes, malloc'd otherwise), handles a few commands inline
 * (close-on-exec flags, FIONBIO, FIOASYNC) and dispatches the rest to
 * the file's fo_ioctl op.
 * NOTE(review): many lines (declarations, returns, braces, the bzero
 * call under IOC_OUT) are elided in this excerpt.
 */
505 ioctl(struct ioctl_args *uap)
507 struct thread *td = curthread;
508 struct proc *p = td->td_proc;
510 struct filedesc *fdp;
516 #define STK_PARAMS 128
518 char stkbuf[STK_PARAMS];
/* Raw fd validation (no holdfp here): range, open slot, and access. */
524 if ((u_int)uap->fd >= fdp->fd_nfiles ||
525 (fp = fdp->fd_ofiles[uap->fd]) == NULL)
528 if ((fp->f_flag & (FREAD | FWRITE)) == 0)
/*
 * Close-on-exec ioctls only touch the per-fd flag word — handled
 * before the generic argument-marshalling path (cases elided here;
 * presumably FIONCLEX/FIOCLEX — verify against full source).
 */
531 switch (com = uap->com) {
533 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
536 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
541 * Interpret high order word to find amount of data to be
542 * copied to/from the user's address space.
544 size = IOCPARM_LEN(com);
545 if (size > IOCPARM_MAX)
/* Arguments too big for the on-stack buffer get a temporary heap one. */
551 if (size > sizeof (ubuf.stkbuf)) {
552 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
/* IOC_IN with a size: copy the argument in from userland. */
559 error = copyin(uap->data, data, (u_int)size);
562 free(memp, M_IOCTLOPS);
/* IOC_IN with size 0: the "argument" is the pointer value itself. */
567 *(caddr_t *)data = uap->data;
569 } else if ((com&IOC_OUT) && size) {
571 * Zero the buffer so the user always
572 * gets back something deterministic.
575 } else if (com&IOC_VOID) {
576 *(caddr_t *)data = uap->data;
/* FIONBIO: mirror the request into f_flag, then tell the object. */
582 if ((tmp = *(int *)data))
583 fp->f_flag |= FNONBLOCK;
585 fp->f_flag &= ~FNONBLOCK;
586 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
/* FIOASYNC: same pattern for async-I/O signalling. */
590 if ((tmp = *(int *)data))
591 fp->f_flag |= FASYNC;
593 fp->f_flag &= ~FASYNC;
594 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
/* Everything else goes straight to the file object. */
598 error = fo_ioctl(fp, com, data, td);
600 * Copy any data to user, size was
601 * already set and checked above.
603 if (error == 0 && (com&IOC_OUT) && size)
604 error = copyout(data, uap->data, (u_int)size);
608 free(memp, M_IOCTLOPS);
/*
 * Count of select/poll wakeup collisions since boot; incremented
 * elsewhere and compared against a per-call snapshot to detect that
 * another wakeup raced in while scanning.  Exported read-only as
 * kern.nselcoll.
 */
613 static int nselcoll; /* Select collisions since boot */
615 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
618 * Select system call.
/*
 * select(): wait until any fd in the in/out/except sets is ready, or
 * the timeout expires.  Bit buffers for the three input and three
 * output sets are carved out of one allocation (on-stack s_selbits
 * when it fits).  The scan/sleep loop uses the classic 4.4BSD scheme:
 * set P_SELECT, scan with selscan(), and sleep on the global selwait
 * channel unless a collision (nselcoll changed or P_SELECT cleared)
 * forces a rescan.
 * NOTE(review): numerous lines (retry label, loop braces, putbits
 * expansion, returns) are elided in this excerpt.
 */
621 select(struct select_args *uap)
623 struct proc *p = curproc;
626 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
627 * infds with the new FD_SETSIZE of 1024, and more than enough for
628 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
631 fd_mask s_selbits[howmany(2048, NFDBITS)];
632 fd_mask *ibits[3], *obits[3], *selbits, *sbp;
633 struct timeval atv, rtv, ttv;
634 int s, ncoll, error, timo;
635 u_int nbufbytes, ncpbytes, nfdbits;
/* Clamp nd to the table size rather than failing outright. */
639 if (uap->nd > p->p_fd->fd_nfiles)
640 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
643 * Allocate just enough bits for the non-null fd_sets. Use the
644 * preallocated auto buffer if possible.
646 nfdbits = roundup(uap->nd, NFDBITS);
647 ncpbytes = nfdbits / NBBY;
/* Each non-NULL set needs input + output copies (2 * ncpbytes). */
650 nbufbytes += 2 * ncpbytes;
652 nbufbytes += 2 * ncpbytes;
654 nbufbytes += 2 * ncpbytes;
655 if (nbufbytes <= sizeof s_selbits)
656 selbits = &s_selbits[0];
658 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
661 * Assign pointers into the bit buffers and fetch the input bits.
662 * Put the output buffers together so that they can be bzeroed
/* Macro sets ibits/obits[x] and copies the user's set in. */
666 #define getbits(name, x) \
668 if (uap->name == NULL) \
671 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
673 sbp += ncpbytes / sizeof *sbp; \
674 error = copyin(uap->name, ibits[x], ncpbytes); \
/* Output halves are contiguous, so one bzero clears them all. */
684 bzero(selbits, nbufbytes / 2);
687 error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
691 if (itimerfix(&atv)) {
/* Convert the relative timeout to an absolute uptime deadline. */
695 getmicrouptime(&rtv);
696 timevaladd(&atv, &rtv);
704 p->p_flag |= P_SELECT;
705 error = selscan(p, ibits, obits, uap->nd, &uap->lmsg.u.ms_result);
706 if (error || uap->lmsg.u.ms_result)
/* Recompute the remaining sleep time from the absolute deadline. */
708 if (atv.tv_sec || atv.tv_usec) {
709 getmicrouptime(&rtv);
710 if (timevalcmp(&rtv, &atv, >=))
713 timevalsub(&ttv, &rtv);
/* Cap the tick conversion at 24h to avoid tvtohz overflow. */
714 timo = ttv.tv_sec > 24 * 60 * 60 ?
715 24 * 60 * 60 * hz : tvtohz(&ttv);
/* Collision detected: another wakeup raced us; rescan immediately. */
718 if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
722 p->p_flag &= ~P_SELECT;
/* All selecting processes share the selwait channel (PCATCH: signals). */
724 error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);
730 p->p_flag &= ~P_SELECT;
731 /* select is not restarted after signals... */
732 if (error == ERESTART)
734 if (error == EWOULDBLOCK)
/* Copy each requested output set back to userland. */
736 #define putbits(name, x) \
737 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
747 if (selbits != &s_selbits[0])
748 free(selbits, M_SELECT);
/*
 * Scan the three select bit sets, polling each set fd via fo_poll with
 * the poll-event equivalent of its set (read/write/except), and set
 * the corresponding output bit for each ready descriptor.  *res
 * receives the ready count (increment elided in this excerpt).
 */
753 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
755 struct thread *td = p->p_thread;
756 struct filedesc *fdp = p->p_fd;
761 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
/* Map set index 0/1/2 -> read/write/exceptional poll events. */
762 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
764 for (msk = 0; msk < 3; msk++) {
765 if (ibits[msk] == NULL)
/* Walk the set one fd_mask word at a time. */
767 for (i = 0; i < nfd; i += NFDBITS) {
768 bits = ibits[msk][i/NFDBITS];
769 /* ffs(int mask) not portable, fd_mask is long */
/* Shift through the word; stop early once no bits remain. */
770 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
773 fp = fdp->fd_ofiles[fd];
/* Nonzero fo_poll result means the fd is ready for this event. */
776 if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
777 obits[msk][(fd)/NFDBITS] |=
778 ((fd_mask)1 << ((fd) % NFDBITS));
/*
 * poll() system call.  Copies the pollfd array in (on-stack smallbits
 * for up to 32 entries, malloc'd otherwise), converts the millisecond
 * timeout to an absolute deadline, then runs the same
 * scan/collision/tsleep loop as select(), scanning with pollscan().
 * The (possibly updated) array is copied back so revents reaches
 * userland.
 * NOTE(review): several lines (retry label, braces, returns, the final
 * free) are elided in this excerpt.
 */
792 poll(struct poll_args *uap)
795 char smallbits[32 * sizeof(struct pollfd)];
796 struct timeval atv, rtv, ttv;
797 int s, ncoll, error = 0, timo;
800 struct proc *p = curproc;
802 nfds = SCARG(uap, nfds);
804 * This is kinda bogus. We have fd limits, but that is not
805 * really related to the size of the pollfd array. Make sure
806 * we let the process use at least FD_SETSIZE entries and at
807 * least enough for the current limits. We want to be reasonably
808 * safe, but not overly restrictive.
810 if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
812 ni = nfds * sizeof(struct pollfd);
813 if (ni > sizeof(smallbits))
814 bits = malloc(ni, M_TEMP, M_WAITOK);
817 error = copyin(SCARG(uap, fds), bits, ni);
/* INFTIM (-1) means wait indefinitely; otherwise ms -> timeval. */
820 if (SCARG(uap, timeout) != INFTIM) {
821 atv.tv_sec = SCARG(uap, timeout) / 1000;
822 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
823 if (itimerfix(&atv)) {
/* Convert to an absolute uptime deadline, as in select(). */
827 getmicrouptime(&rtv);
828 timevaladd(&atv, &rtv);
836 p->p_flag |= P_SELECT;
837 error = pollscan(p, (struct pollfd *)bits, nfds, &uap->lmsg.u.ms_result);
838 if (error || uap->lmsg.u.ms_result)
840 if (atv.tv_sec || atv.tv_usec) {
841 getmicrouptime(&rtv);
842 if (timevalcmp(&rtv, &atv, >=))
845 timevalsub(&ttv, &rtv);
/* Cap at 24h of ticks to avoid tvtohz overflow, as in select(). */
846 timo = ttv.tv_sec > 24 * 60 * 60 ?
847 24 * 60 * 60 * hz : tvtohz(&ttv);
/* Collision: a wakeup raced the scan; go around again. */
850 if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
854 p->p_flag &= ~P_SELECT;
855 error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
860 p->p_flag &= ~P_SELECT;
861 /* poll is not restarted after signals... */
862 if (error == ERESTART)
864 if (error == EWOULDBLOCK)
/* Write revents back to the caller's array. */
867 error = copyout(bits, SCARG(uap, fds), ni);
872 if (ni > sizeof(smallbits))
/*
 * Poll each entry of the pollfd array once.  Out-of-range or closed
 * descriptors get POLLNVAL in revents (negative fds are skipped);
 * valid ones are polled via fo_poll with the caller's event mask.
 * *res accumulates the count of entries with nonzero revents
 * (increment elided in this excerpt).
 */
878 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
880 struct thread *td = p->p_thread;
881 struct filedesc *fdp = p->p_fd;
886 for (i = 0; i < nfd; i++, fds++) {
887 if (fds->fd >= fdp->fd_nfiles) {
888 fds->revents = POLLNVAL;
/* Negative fd: per POSIX, ignore the entry (handled in elided body). */
890 } else if (fds->fd < 0) {
893 fp = fdp->fd_ofiles[fds->fd];
895 fds->revents = POLLNVAL;
899 * Note: backend also returns POLLHUP and
900 * POLLERR if appropriate.
902 fds->revents = fo_poll(fp, fds->events,
904 if (fds->revents != 0)
914 * OpenBSD poll system call.
915 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
/* Thin compatibility shim: the argument structs are layout-compatible,
 * so just forward to the native poll(). */
918 openbsd_poll(struct openbsd_poll_args *uap)
920 return (poll((struct poll_args *)uap));
/*
 * Generic "always ready" poll backend for devices with no blocking
 * semantics: report every requested read/write event as ready.
 */
925 seltrue(dev_t dev, int events, struct thread *td)
927 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
931 * Record a select request. A global wait must be used since a process/thread
932 * might go away after recording its request.
/*
 * Register the selecting thread's pid in the selinfo.  If another
 * process already holds the slot and is itself sleeping in select
 * (wchan == &selwait), mark the selinfo SI_COLL so selwakeup() knows
 * to broadcast.  Recording the new pid is elided in this excerpt.
 */
935 selrecord(struct thread *selector, struct selinfo *sip)
/* Only full processes can select; bare kernel threads cannot. */
940 if ((p = selector->td_proc) == NULL)
941 panic("selrecord: thread needs a process");
/* Already recorded for this process: nothing to do. */
944 if (sip->si_pid == mypid)
946 if (sip->si_pid && (p = pfind(sip->si_pid)) &&
947 p->p_wchan == (caddr_t)&selwait) {
948 sip->si_flags |= SI_COLL;
955 * Do a wakeup when a selectable event occurs.
/*
 * Wake whoever selrecord()ed on this selinfo.  A collision (SI_COLL)
 * means multiple selectors are involved, so broadcast on the shared
 * selwait channel; otherwise wake only the recorded pid, either by
 * unsleeping it off selwait or by clearing its P_SELECT flag so its
 * scan loop rescans.
 * NOTE(review): clearing si_pid and the setrunnable path are elided in
 * this excerpt; the tail of the function may extend past it.
 */
958 selwakeup(struct selinfo *sip)
/* Nobody recorded: nothing to wake. */
963 if (sip->si_pid == 0)
965 if (sip->si_flags & SI_COLL) {
967 sip->si_flags &= ~SI_COLL;
968 wakeup((caddr_t)&selwait); /* YYY fixable */
970 p = pfind(sip->si_pid);
/* Only pull the thread off the queue if it is truly asleep there. */
974 if (p->p_wchan == (caddr_t)&selwait) {
975 if (p->p_stat == SSLEEP)
978 unsleep(p->p_thread);
/* Not asleep yet: clearing P_SELECT forces its loop to rescan. */
979 } else if (p->p_flag & P_SELECT)
980 p->p_flag &= ~P_SELECT;