/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_fp.c,v 1.6 2004/07/16 05:51:10 dillon Exp $
 */
/*
 * Direct file pointer API functions for in-kernel operations on files.  These
 * functions provide an open/read/write/close like interface within the kernel
 * for operating on files that are not necessarily associated with processes
 * and which do not (typically) have descriptors.
 *
 * FUTURE: file handle conversion routines to support checkpointing,
 * and additional file operations (ioctl, fcntl).
 */
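/*
 * Illustrative only: a hypothetical kernel consumer writing a kernel-space
 * buffer to a file through this API.  The helper name, path and flag
 * choices below are examples, not something this file mandates.
 *
 *	static int
 *	example_write_file(const char *path, void *buf, size_t len)
 *	{
 *		struct file *fp;
 *		ssize_t res;
 *		int error;
 *
 *		error = fp_open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600, &fp);
 *		if (error == 0) {
 *			error = fp_write(fp, buf, len, &res);
 *			fp_close(fp);
 *		}
 *		return(error);
 *	}
 */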
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/event.h>

#include <vm/vm_param.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <machine/limits.h>
typedef struct file *file_t;
/*
 * Open a file as specified.  Use O_* flags for flags.
 *
 * NOTE! O_ROOTCRED not quite working yet, vn_open() asserts that the
 * cred must match the process's cred.
 */
int
fp_open(const char *path, int flags, int mode, file_t *fpp)
{
        if ((error = falloc(NULL, fpp, NULL)) != 0)
                return(error);
        if ((flags & O_ROOTCRED) == 0 && td->td_proc)
                fsetcred(fp, td->td_proc->p_ucred);

        NDINIT(&nd, NAMEI_LOOKUP, 0, UIO_SYSSPACE, path, td);
        flags = FFLAGS(flags);
        if ((error = vn_open(&nd, flags, mode)) == 0) {
                NDFREE(&nd, NDF_ONLY_PNBUF);
                fp->f_data = (caddr_t)nd.ni_vp;
                fp->f_type = DTYPE_VNODE;
                VOP_UNLOCK(nd.ni_vp, NULL, 0, td);
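/*
 * Illustrative only: a sketch of the intended credential selection.  By
 * default the new file pointer takes the opening process's ucred via
 * fsetcred(); a caller that wants the kernel's root cred would pass
 * O_ROOTCRED (which, per the NOTE above, is not fully functional yet).
 * The path shown is hypothetical.
 *
 *	struct file *fp;
 *	int error;
 *
 *	error = fp_open("/var/run/example.pid",
 *	    O_RDWR | O_CREAT | O_ROOTCRED, 0644, &fp);
 *	if (error == 0)
 *		fp_close(fp);
 */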
/*
 * fp_vpopen(): open a file pointer given a vnode.  The vnode must be locked.
 * The vnode will be returned unlocked whether an error occurs or not.
 */
int
fp_vpopen(struct vnode *vp, int flags, file_t *fpp)
{
        /*
         * Vnode checks (from vn_open())
         */
        if (vp->v_type == VLNK) {
        if (vp->v_type == VSOCK) {
        flags = FFLAGS(flags);
        if (flags & (FWRITE | O_TRUNC)) {
                if (vp->v_type == VDIR) {
                error = vn_writechk(vp);
        error = VOP_ACCESS(vp, vmode, td->td_proc->p_ucred, td);
        error = VOP_OPEN(vp, flags, td->td_proc->p_ucred, td);

        /*
         * Make sure that a VM object is created for VMIO support.
         */
        if (vn_canvmio(vp) == TRUE) {
                if ((error = vfs_object_create(vp, td)) != 0)

        if ((error = falloc(NULL, fpp, NULL)) != 0)
        if ((flags & O_ROOTCRED) == 0 && td->td_proc)
                fsetcred(fp, td->td_proc->p_ucred);
        fp->f_data = (caddr_t)vp;
        fp->f_type = DTYPE_VNODE;

        /*
         * All done, set return value and update v_writecount now that no more
         * errors can occur.
         */
        VOP_UNLOCK(vp, NULL, 0, td);
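/*
 * Illustrative only: fp_vpopen() takes a locked vnode and always returns
 * with it unlocked, so a hypothetical caller that already holds a vnode
 * reference locks it just for the call.  The vn_lock arguments shown
 * mirror the VOP_UNLOCK convention used above and are illustrative.
 *
 *	struct file *fp;
 *	int error;
 *
 *	vn_lock(vp, NULL, LK_EXCLUSIVE | LK_RETRY, curthread);
 *	error = fp_vpopen(vp, FREAD, &fp);
 *	(vp is unlocked here whether or not an error was returned)
 *	if (error == 0) {
 *		... use fp_read()/fp_stat() ...
 *		fp_close(fp);
 *	}
 */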
/*
 * fp_*read() is meant to operate like the normal descriptor based syscalls
 * would.  Note that if 'buf' points to user memory a UIO_USERSPACE
 * transfer will be used.
 */
int
fp_pread(file_t fp, void *buf, size_t nbytes, off_t offset, ssize_t *res)
{
        if (nbytes > INT_MAX)
                return(EINVAL);
        bzero(&auio, sizeof(auio));
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbytes;
        auio.uio_iov = &aiov;
        auio.uio_offset = offset;
        auio.uio_resid = nbytes;
        auio.uio_rw = UIO_READ;
        if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS)
                auio.uio_segflg = UIO_USERSPACE;
        else
                auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = curthread;

        error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, auio.uio_td);
        if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR ||
            error == EWOULDBLOCK)
        )
                error = 0;
        count -= auio.uio_resid;
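/*
 * Illustrative only: because the segment flag is derived from the buffer
 * address above, the same routine serves reads into kernel memory and
 * into user memory.  A hypothetical kernel-space read at offset 0,
 * assuming 'fp' was obtained from fp_open() or fp_vpopen():
 *
 *	char kbuf[512];
 *	ssize_t res;
 *	int error;
 *
 *	error = fp_pread(fp, kbuf, sizeof(kbuf), (off_t)0, &res);
 *	(on success, res is the number of bytes actually transferred)
 */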
int
fp_read(file_t fp, void *buf, size_t nbytes, ssize_t *res)
{
        if (nbytes > INT_MAX)
                return(EINVAL);
        bzero(&auio, sizeof(auio));
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbytes;
        auio.uio_iov = &aiov;
        auio.uio_resid = nbytes;
        auio.uio_rw = UIO_READ;
        if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS)
                auio.uio_segflg = UIO_USERSPACE;
        else
                auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = curthread;

        error = fo_read(fp, &auio, fp->f_cred, 0, auio.uio_td);
        if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR ||
            error == EWOULDBLOCK)
        )
                error = 0;
        count -= auio.uio_resid;
int
fp_pwrite(file_t fp, void *buf, size_t nbytes, off_t offset, ssize_t *res)
{
        if (nbytes > INT_MAX)
                return(EINVAL);
        bzero(&auio, sizeof(auio));
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbytes;
        auio.uio_iov = &aiov;
        auio.uio_offset = offset;
        auio.uio_resid = nbytes;
        auio.uio_rw = UIO_WRITE;
        if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS)
                auio.uio_segflg = UIO_USERSPACE;
        else
                auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = curthread;

        error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, auio.uio_td);
        if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR ||
            error == EWOULDBLOCK)
        )
                error = 0;
        count -= auio.uio_resid;
int
fp_write(file_t fp, void *buf, size_t nbytes, ssize_t *res)
{
        if (nbytes > INT_MAX)
                return(EINVAL);
        bzero(&auio, sizeof(auio));
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbytes;
        auio.uio_iov = &aiov;
        auio.uio_resid = nbytes;
        auio.uio_rw = UIO_WRITE;
        if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS)
                auio.uio_segflg = UIO_USERSPACE;
        else
                auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = curthread;

        error = fo_write(fp, &auio, fp->f_cred, 0, auio.uio_td);
        if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR ||
            error == EWOULDBLOCK)
        )
                error = 0;
        count -= auio.uio_resid;
int
fp_stat(file_t fp, struct stat *ub)
{
        int error;

        error = fo_stat(fp, ub, curthread);
        return(error);
}
/*
 * non-anonymous, non-stack descriptor mappings only!
 *
 * This routine mostly snarfed from vm/vm_mmap.c
 */
int
fp_mmap(void *addr_arg, size_t size, int prot, int flags, struct file *fp,
        off_t pos, void **resp)
{
        struct thread *td = curthread;
        struct proc *p = td->td_proc;
        struct vmspace *vms = p->p_vmspace;
        int disablexworkaround;

        if ((ssize_t)size < 0 || (flags & MAP_ANON))
                return(EINVAL);

        pageoff = (pos & PAGE_MASK);

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t)round_page(size);     /* hi end */
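        /*
         * Example of the size adjustment (assuming PAGE_SIZE is 4096): for
         * pos 0x12345 and a requested size of 100 bytes, pageoff is 0x345,
         * so size becomes 100 + 0x345 = 0x3a9 and then rounds up to 0x1000,
         * i.e. one full page.
         */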
        addr = (vm_offset_t)addr_arg;

        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                if (addr & PAGE_MASK)
                        return(EINVAL);

                /* Address range must be all in user VM space. */
                if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
                        return(EINVAL);
                if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
                        return(EINVAL);
                if (addr + size < addr)
                        return(EINVAL);
        } else if (addr == 0 ||
            (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
             addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))
        ) {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
        }

        /*
         * Mapping file, get fp for validation.  Obtain vnode and make
         * sure it is of appropriate type.
         */
        if (fp->f_type != DTYPE_VNODE)
                return(EINVAL);

        /*
         * POSIX shared-memory objects are defined to have
         * kernel persistence, and are not defined to support
         * read(2)/write(2) -- or even open(2).  Thus, we can
         * use MAP_ASYNC to trade on-disk coherence for speed.
         * The shm_open(3) library routine turns on the FPOSIXSHM
         * flag to request this behavior.
         */
        if (fp->f_flag & FPOSIXSHM)
                flags |= MAP_NOSYNC;
        vp = (struct vnode *)fp->f_data;
        if (vp->v_type != VREG && vp->v_type != VCHR)
                return(EINVAL);

        /*
         * Get the proper underlying object
         */
        if (vp->v_type == VREG) {
                if (VOP_GETVOBJECT(vp, &obj) != 0)
                        return(EINVAL);
                vp = (struct vnode *)obj->handle;
        }

        /*
         * XXX hack to handle use of /dev/zero to map anon memory (ala
         * SunOS).
         */
        if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
                maxprot = VM_PROT_ALL;
        } else {
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                /*
                 * However, for XIG X server to continue to work,
                 * we should allow the superuser to do it anyway.
                 * We only allow it at securelevel < 1.
                 * (Because the XIG X server writes directly to video
                 * memory via /dev/mem, it should never work at any
                 * other securelevel.
                 * XXX this will have to go
                 */
                if (securelevel >= 1)
                        disablexworkaround = 1;
                else
                        disablexworkaround = suser(td);
                if (vp->v_type == VCHR && disablexworkaround &&
                    (flags & (MAP_PRIVATE|MAP_COPY))) {

                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                maxprot = VM_PROT_EXECUTE;      /* ??? */
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {

                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.  Check for superuser, only if
                 * we're at securelevel < 1, to allow the XIG X server
                 * to continue to work.
                 */
                if ((flags & MAP_SHARED) != 0 ||
                    (vp->v_type == VCHR && disablexworkaround)
                ) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                if ((error = VOP_GETATTR(vp, &va, td))) {
                                if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) {
                                        maxprot |= VM_PROT_WRITE;
                                } else if (prot & PROT_WRITE) {
                        } else if ((prot & PROT_WRITE) != 0) {
                } else {
                        maxprot |= VM_PROT_WRITE;
                }

        error = vm_mmap(&vms->vm_map, &addr, size, prot,
            maxprot, flags, handle, pos);
        if (error == 0 && addr_arg)
                *resp = (void *)addr;
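/*
 * Illustrative only: a hypothetical caller mapping the first 64KB of an
 * already-opened regular file read-only into the current process's VM
 * space.  Note that, as coded above, *resp is only filled in when a
 * non-NULL address hint (addr_arg) is supplied.
 *
 *	void *base;
 *	int error;
 *
 *	base = (void *)PAGE_SIZE;	(hint only; MAP_FIXED is not set)
 *	error = fp_mmap(base, 65536, PROT_READ, MAP_SHARED, fp,
 *	    (off_t)0, &base);
 *	if (error == 0)
 *		... read the file contents through 'base' ...
 */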
int
fp_close(file_t fp)
{
        return(fdrop(fp, curthread));
}