Add kernel-layer support for chflags checks, remove (most) from the VFS layer.
[dragonfly.git] / sys / kern / vfs_vnops.c
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
40 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
41 */
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/fcntl.h>
46#include <sys/file.h>
47#include <sys/stat.h>
48#include <sys/proc.h>
49#include <sys/priv.h>
50#include <sys/mount.h>
51#include <sys/nlookup.h>
52#include <sys/vnode.h>
53#include <sys/buf.h>
54#include <sys/filio.h>
55#include <sys/ttycom.h>
56#include <sys/conf.h>
57#include <sys/syslog.h>
58
59static int vn_closefile (struct file *fp);
60static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
61 struct ucred *cred);
62static int vn_read (struct file *fp, struct uio *uio,
63 struct ucred *cred, int flags);
64static int svn_read (struct file *fp, struct uio *uio,
65 struct ucred *cred, int flags);
66static int vn_poll (struct file *fp, int events, struct ucred *cred);
67static int vn_kqfilter (struct file *fp, struct knote *kn);
68static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
69static int vn_write (struct file *fp, struct uio *uio,
70 struct ucred *cred, int flags);
71static int svn_write (struct file *fp, struct uio *uio,
72 struct ucred *cred, int flags);
73
74struct fileops vnode_fileops = {
75 .fo_read = vn_read,
76 .fo_write = vn_write,
77 .fo_ioctl = vn_ioctl,
78 .fo_poll = vn_poll,
79 .fo_kqfilter = vn_kqfilter,
80 .fo_stat = vn_statfile,
81 .fo_close = vn_closefile,
82 .fo_shutdown = nofo_shutdown
83};
84
85struct fileops specvnode_fileops = {
86 .fo_read = svn_read,
87 .fo_write = svn_write,
88 .fo_ioctl = vn_ioctl,
89 .fo_poll = vn_poll,
90 .fo_kqfilter = vn_kqfilter,
91 .fo_stat = vn_statfile,
92 .fo_close = vn_closefile,
93 .fo_shutdown = nofo_shutdown
94};
95
96/*
97 * Shortcut the device read/write. This avoids a lot of vnode junk.
98 * Basically the specfs vnops for read and write take the locked vnode,
99 * unlock it (because we can't hold the vnode locked while reading or writing
100 * a device which may block indefinitely), issue the device operation, then
101 * relock the vnode before returning, plus other junk. This bypasses all
102 * of that and just does the device operation.
103 */
104void
105vn_setspecops(struct file *fp)
106{
107 if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
108 fp->f_ops = &specvnode_fileops;
109 }
110}
111
112/*
113 * Common code for vnode open operations. Check permissions, and call
114 * the VOP_OPEN or VOP_NCREATE routine.
115 *
116 * The caller is responsible for setting up nd with nlookup_init() and
117 * for cleaning it up with nlookup_done(), whether we return an error
118 * or not.
119 *
120 * On success nd->nl_open_vp will hold a referenced and, if requested,
121 * locked vnode. A locked vnode is requested via NLC_LOCKVP. If fp
122 * is non-NULL the vnode will be installed in the file pointer.
123 *
124 * NOTE: The vnode is referenced just once on return whether or not it
125 * is also installed in the file pointer.
126 */
127int
128vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
129{
130 struct vnode *vp;
131 struct ucred *cred = nd->nl_cred;
132 struct vattr vat;
133 struct vattr *vap = &vat;
134 int error;
135
136 /*
137 * Lookup the path and create or obtain the vnode. After a
138 * successful lookup a locked nd->nl_nch will be returned.
139 *
140 * The result of this section should be a locked vnode.
141 *
142 * XXX with only a little work we should be able to avoid locking
143 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
144 */
145 nd->nl_flags |= NLC_OPEN;
146 if (fmode & O_APPEND)
147 nd->nl_flags |= NLC_APPEND;
148 if (fmode & O_TRUNC)
149 nd->nl_flags |= NLC_TRUNCATE;
150 if (fmode & FREAD)
151 nd->nl_flags |= NLC_READ;
152 if (fmode & FWRITE)
153 nd->nl_flags |= NLC_WRITE;
154
155 if (fmode & O_CREAT) {
156 /*
157 * CONDITIONAL CREATE FILE CASE
158 *
159 * Setting NLC_CREATE causes a negative hit to store
160 * the negative hit ncp and not return an error. Then
161 * nc_error or nc_vp may be checked to see if the ncp
162 * represents a negative hit. NLC_CREATE also requires
163 * write permission on the governing directory or EPERM
164 * is returned.
165 */
166 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
167 nd->nl_flags |= NLC_FOLLOW;
168 nd->nl_flags |= NLC_CREATE;
169 nd->nl_flags |= NLC_REFDVP;
170 bwillinode(1);
171 error = nlookup(nd);
172 } else {
173 /*
174 * NORMAL OPEN FILE CASE
175 */
176 error = nlookup(nd);
177 }
178
179 if (error)
180 return (error);
181
182 /*
183 * split case to allow us to re-resolve and retry the ncp in case
184 * we get ESTALE.
185 */
186again:
187 if (fmode & O_CREAT) {
188 if (nd->nl_nch.ncp->nc_vp == NULL) {
189 if ((error = ncp_writechk(&nd->nl_nch)) != 0)
190 return (error);
191 VATTR_NULL(vap);
192 vap->va_type = VREG;
193 vap->va_mode = cmode;
194 if (fmode & O_EXCL)
195 vap->va_vaflags |= VA_EXCLUSIVE;
196 error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
197 nd->nl_cred, vap);
198 if (error)
199 return (error);
200 fmode &= ~O_TRUNC;
201 /* locked vnode is returned */
202 } else {
203 if (fmode & O_EXCL) {
204 error = EEXIST;
205 } else {
206 error = cache_vget(&nd->nl_nch, cred,
207 LK_EXCLUSIVE, &vp);
208 }
209 if (error)
210 return (error);
211 fmode &= ~O_CREAT;
212 }
213 } else {
214 error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
215 if (error)
216 return (error);
217 }
218
219 /*
220 * We have a locked vnode and ncp now. Note that the ncp will
221 * be cleaned up by the caller if nd->nl_nch is left intact.
222 */
223 if (vp->v_type == VLNK) {
224 error = EMLINK;
225 goto bad;
226 }
227 if (vp->v_type == VSOCK) {
228 error = EOPNOTSUPP;
229 goto bad;
230 }
231 if ((fmode & O_CREAT) == 0) {
232 if (fmode & (FWRITE | O_TRUNC)) {
233 if (vp->v_type == VDIR) {
234 error = EISDIR;
235 goto bad;
236 }
237 error = vn_writechk(vp, &nd->nl_nch);
238 if (error) {
239 /*
240 * Special stale handling, re-resolve the
241 * vnode.
242 */
243 if (error == ESTALE) {
244 vput(vp);
245 vp = NULL;
246 cache_setunresolved(&nd->nl_nch);
247 error = cache_resolve(&nd->nl_nch, cred);
248 if (error == 0)
249 goto again;
250 }
251 goto bad;
252 }
253 }
254 }
255 if (fmode & O_TRUNC) {
256 vn_unlock(vp); /* XXX */
257 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
258 VATTR_NULL(vap);
259 vap->va_size = 0;
260 error = VOP_SETATTR(vp, vap, cred);
261 if (error)
262 goto bad;
263 }
264
265 /*
266 * Setup the fp so VOP_OPEN can override it. No descriptor has been
267 * associated with the fp yet so we own it clean.
268 *
269 * f_nchandle inherits nl_nch. This used to be necessary only for
270 * directories but now we do it unconditionally so f*() ops
271 * such as fchmod() can access the actual namespace that was
272 * used to open the file.
273 */
274 if (fp) {
275 if (nd->nl_flags & NLC_APPENDONLY)
276 fmode |= FAPPENDONLY;
277 fp->f_nchandle = nd->nl_nch;
278 cache_zero(&nd->nl_nch);
279 cache_unlock(&fp->f_nchandle);
280 }
281
282 /*
283 * Get rid of nl_nch. vn_open does not return it (it returns the
284 * vnode or the file pointer). Note: we can't leave nl_nch locked
285 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
286 * on /dev/ttyd0
287 */
288 if (nd->nl_nch.ncp)
289 cache_put(&nd->nl_nch);
290
291 error = VOP_OPEN(vp, fmode, cred, fp);
292 if (error) {
293 /*
294 * setting f_ops to &badfileops will prevent the descriptor
295 * code from trying to close and release the vnode, since
296 * the open failed we do not want to call close.
297 */
298 if (fp) {
299 fp->f_data = NULL;
300 fp->f_ops = &badfileops;
301 }
302 goto bad;
303 }
304
305#if 0
306 /*
307 * Assert that VREG files have been setup for vmio.
308 */
309 KASSERT(vp->v_type != VREG || vp->v_object != NULL,
310 ("vn_open: regular file was not VMIO enabled!"));
311#endif
312
313 /*
314 * Return the vnode. XXX needs some cleaning up. The vnode is
315 * only returned in the fp == NULL case.
316 */
317 if (fp == NULL) {
318 nd->nl_open_vp = vp;
319 nd->nl_vp_fmode = fmode;
320 if ((nd->nl_flags & NLC_LOCKVP) == 0)
321 vn_unlock(vp);
322 } else {
323 vput(vp);
324 }
325 return (0);
326bad:
327 if (vp)
328 vput(vp);
329 return (error);
330}
331
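/*
 * Illustrative sketch (not compiled): a typical kernel caller of
 * vn_open() with fp == NULL, following the contract described above.
 * The path, open flags and error policy below are example values only;
 * the caller always owns the nlookupdata and must nlookup_done() it.
 */
#if 0
static int
vn_open_example(void)
{
	struct nlookupdata nd;
	struct vnode *vp;
	int error;

	error = nlookup_init(&nd, "/tmp/example", UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	if (error == 0) {
		/* referenced, unlocked vnode (NLC_LOCKVP was not set) */
		vp = nd.nl_open_vp;
		nd.nl_open_vp = NULL;
		/* ... use vp, e.g. via vn_rdwr() ... */
		error = vn_close(vp, FREAD);
	}
	nlookup_done(&nd);
	return (error);
}
#endif
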
332int
333vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
334{
335 struct vnode *vp;
336 int error;
337
338 if (strncmp(devname, "/dev/", 5) == 0)
339 devname += 5;
340 if ((vp = getsynthvnode(devname)) == NULL) {
341 error = ENODEV;
342 } else {
343 error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
344 vn_unlock(vp);
345 if (error) {
346 vrele(vp);
347 vp = NULL;
348 }
349 }
350 *vpp = vp;
351 return (error);
352}
353
354/*
355 * Check for write permissions on the specified vnode. nch may be NULL.
356 */
357int
358vn_writechk(struct vnode *vp, struct nchandle *nch)
359{
360 /*
361 * If there's shared text associated with
362 * the vnode, try to free it up once. If
363 * we fail, we can't allow writing.
364 */
365 if (vp->v_flag & VTEXT)
366 return (ETXTBSY);
367
368 /*
369 * If the vnode represents a regular file, check the mount
370 * point via the nch. This may be a different mount point
371 * then the one embedded in the vnode (e.g. nullfs).
372 *
373 * We can still write to non-regular files (e.g. devices)
374 * via read-only mounts.
375 */
376 if (nch && nch->ncp && vp->v_type == VREG)
377 return (ncp_writechk(nch));
378 return (0);
379}
380
381/*
382 * Check whether the underlying mount is read-only. The mount point
383 * referenced by the namecache may be different from the mount point
384 * used by the underlying vnode in the case of NULLFS, so a separate
385 * check is needed.
386 */
387int
388ncp_writechk(struct nchandle *nch)
389{
390 if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
391 return (EROFS);
392 return(0);
393}
394
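/*
 * For example, with a nullfs mount of /usr exported read-only at
 * /ro-usr, a write attempted through a /ro-usr namecache handle must
 * fail with EROFS even though the underlying /usr vnode belongs to a
 * read-write mount, which is why this check uses nch->mount rather
 * than vp->v_mount.
 */
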
395/*
396 * Vnode close call
397 */
398int
399vn_close(struct vnode *vp, int flags)
400{
401 int error;
402
403 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
404 if (error == 0) {
405 error = VOP_CLOSE(vp, flags);
406 vn_unlock(vp);
407 }
408 vrele(vp);
409 return (error);
410}
411
412static __inline
413int
414sequential_heuristic(struct uio *uio, struct file *fp)
415{
416 /*
417 * Sequential heuristic - detect sequential operation
418 */
419 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
420 uio->uio_offset == fp->f_nextoff) {
421 int tmpseq = fp->f_seqcount;
422 /*
423 * XXX we assume that the filesystem block size is
424 * the default. Not true, but still gives us a pretty
425 * good indicator of how sequential the read operations
426 * are.
427 */
428 tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
429 if (tmpseq > IO_SEQMAX)
430 tmpseq = IO_SEQMAX;
431 fp->f_seqcount = tmpseq;
432 return(fp->f_seqcount << IO_SEQSHIFT);
433 }
434
435 /*
436 * Not sequential, quick draw-down of seqcount
437 */
438 if (fp->f_seqcount > 1)
439 fp->f_seqcount = 1;
440 else
441 fp->f_seqcount = 0;
442 return(0);
443}
444
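/*
 * Worked example of the heuristic above: a process issuing 64KB reads
 * at sequentially increasing offsets adds 64KB / BKVASIZE to
 * f_seqcount on each call (4 per call if BKVASIZE is 16KB), so the
 * count saturates at IO_SEQMAX after a handful of reads and the value
 * returned here, shifted by IO_SEQSHIFT, becomes a progressively
 * stronger read-ahead hint for the filesystem.
 */
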
445/*
446 * Package up an I/O request on a vnode into a uio and do it.
447 */
448int
449vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
450 off_t offset, enum uio_seg segflg, int ioflg,
451 struct ucred *cred, int *aresid)
452{
453 struct uio auio;
454 struct iovec aiov;
455 struct ccms_lock ccms_lock;
456 int error;
457
458 if ((ioflg & IO_NODELOCKED) == 0)
459 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
460 auio.uio_iov = &aiov;
461 auio.uio_iovcnt = 1;
462 aiov.iov_base = base;
463 aiov.iov_len = len;
464 auio.uio_resid = len;
465 auio.uio_offset = offset;
466 auio.uio_segflg = segflg;
467 auio.uio_rw = rw;
468 auio.uio_td = curthread;
469 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
470 if (rw == UIO_READ) {
471 error = VOP_READ(vp, &auio, ioflg, cred);
472 } else {
473 error = VOP_WRITE(vp, &auio, ioflg, cred);
474 }
475 ccms_lock_put(&vp->v_ccms, &ccms_lock);
476 if (aresid)
477 *aresid = auio.uio_resid;
478 else
479 if (auio.uio_resid && error == 0)
480 error = EIO;
481 if ((ioflg & IO_NODELOCKED) == 0)
482 vn_unlock(vp);
483 return (error);
484}
485
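/*
 * Illustrative sketch (not compiled): reading a 512-byte header from a
 * vnode into a kernel buffer with vn_rdwr().  The vnode is assumed to
 * be referenced but unlocked; since IO_NODELOCKED is not passed,
 * vn_rdwr() locks and unlocks it around the VOP_READ.  The buffer
 * size, offset and short-read policy are example values only.
 */
#if 0
static int
read_header_example(struct vnode *vp, struct ucred *cred)
{
	char buf[512];
	int resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
			UIO_SYSSPACE, 0, cred, &resid);
	if (error == 0 && resid != 0)
		error = EINVAL;		/* example policy: demand a full read */
	return (error);
}
#endif
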
486/*
487 * Package up an I/O request on a vnode into a uio and do it. The I/O
488 * request is split up into smaller chunks and we try to avoid saturating
489 * the buffer cache while potentially holding a vnode locked, so we
490 * call bwillread()/bwillwrite() before each vn_rdwr() chunk. We also call uio_yield()
491 * to give other processes a chance to lock the vnode (either other processes
492 * core'ing the same binary, or unrelated processes scanning the directory).
493 */
494int
495vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
496 off_t offset, enum uio_seg segflg, int ioflg,
497 struct ucred *cred, int *aresid)
498{
499 int error = 0;
500
501 do {
502 int chunk;
503
504 /*
505 * Force `offset' to a multiple of MAXBSIZE except possibly
506 * for the first chunk, so that filesystems only need to
507 * write full blocks except possibly for the first and last
508 * chunks.
509 */
510 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
511
512 if (chunk > len)
513 chunk = len;
514 if (vp->v_type == VREG) {
515 switch(rw) {
516 case UIO_READ:
517 bwillread(chunk);
518 break;
519 case UIO_WRITE:
520 bwillwrite(chunk);
521 break;
522 }
523 }
524 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
525 ioflg, cred, aresid);
526 len -= chunk; /* aresid calc already includes length */
527 if (error)
528 break;
529 offset += chunk;
530 base += chunk;
531 uio_yield();
532 } while (len);
533 if (aresid)
534 *aresid += len;
535 return (error);
536}
537
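/*
 * Example of the chunking above: if MAXBSIZE is 64KB and the transfer
 * starts at offset 1000, the first chunk is 64536 bytes (64KB - 1000),
 * so every subsequent chunk begins on a MAXBSIZE boundary and only the
 * first and last chunks can be partial blocks.
 */
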
538/*
539 * MPALMOSTSAFE - acquires mplock
540 */
541static int
542vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
543{
544 struct ccms_lock ccms_lock;
545 struct vnode *vp;
546 int error, ioflag;
547
548 get_mplock();
549 KASSERT(uio->uio_td == curthread,
550 ("uio_td %p is not td %p", uio->uio_td, curthread));
551 vp = (struct vnode *)fp->f_data;
552
553 ioflag = 0;
554 if (flags & O_FBLOCKING) {
555 /* ioflag &= ~IO_NDELAY; */
556 } else if (flags & O_FNONBLOCKING) {
557 ioflag |= IO_NDELAY;
558 } else if (fp->f_flag & FNONBLOCK) {
559 ioflag |= IO_NDELAY;
560 }
561 if (flags & O_FBUFFERED) {
562 /* ioflag &= ~IO_DIRECT; */
563 } else if (flags & O_FUNBUFFERED) {
564 ioflag |= IO_DIRECT;
565 } else if (fp->f_flag & O_DIRECT) {
566 ioflag |= IO_DIRECT;
567 }
568 vn_lock(vp, LK_SHARED | LK_RETRY);
569 if ((flags & O_FOFFSET) == 0)
570 uio->uio_offset = fp->f_offset;
571 ioflag |= sequential_heuristic(uio, fp);
572
573 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
574 error = VOP_READ(vp, uio, ioflag, cred);
575 ccms_lock_put(&vp->v_ccms, &ccms_lock);
576 if ((flags & O_FOFFSET) == 0)
577 fp->f_offset = uio->uio_offset;
578 fp->f_nextoff = uio->uio_offset;
579 vn_unlock(vp);
580 rel_mplock();
581 return (error);
582}
583
584/*
585 * Device-optimized file table vnode read routine.
586 *
587 * This bypasses the VOP table and talks directly to the device. Most
588 * filesystems just route to specfs and can make this optimization.
589 *
590 * MPALMOSTSAFE - acquires mplock
591 */
592static int
593svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
594{
595 struct vnode *vp;
596 int ioflag;
597 int error;
598 cdev_t dev;
599
600 get_mplock();
601 KASSERT(uio->uio_td == curthread,
602 ("uio_td %p is not td %p", uio->uio_td, curthread));
603
604 vp = (struct vnode *)fp->f_data;
605 if (vp == NULL || vp->v_type == VBAD) {
606 error = EBADF;
607 goto done;
608 }
609
610 if ((dev = vp->v_rdev) == NULL) {
611 error = EBADF;
612 goto done;
613 }
614 reference_dev(dev);
615
616 if (uio->uio_resid == 0) {
617 error = 0;
618 goto done;
619 }
620 if ((flags & O_FOFFSET) == 0)
621 uio->uio_offset = fp->f_offset;
622
623 ioflag = 0;
624 if (flags & O_FBLOCKING) {
625 /* ioflag &= ~IO_NDELAY; */
626 } else if (flags & O_FNONBLOCKING) {
627 ioflag |= IO_NDELAY;
628 } else if (fp->f_flag & FNONBLOCK) {
629 ioflag |= IO_NDELAY;
630 }
631 if (flags & O_FBUFFERED) {
632 /* ioflag &= ~IO_DIRECT; */
633 } else if (flags & O_FUNBUFFERED) {
634 ioflag |= IO_DIRECT;
635 } else if (fp->f_flag & O_DIRECT) {
636 ioflag |= IO_DIRECT;
637 }
638 ioflag |= sequential_heuristic(uio, fp);
639
640 error = dev_dread(dev, uio, ioflag);
641
642 release_dev(dev);
643 if ((flags & O_FOFFSET) == 0)
644 fp->f_offset = uio->uio_offset;
645 fp->f_nextoff = uio->uio_offset;
646done:
647 rel_mplock();
648 return (error);
649}
650
651/*
652 * MPALMOSTSAFE - acquires mplock
653 */
654static int
655vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
656{
657 struct ccms_lock ccms_lock;
658 struct vnode *vp;
659 int error, ioflag;
660
661 get_mplock();
662 KASSERT(uio->uio_td == curthread,
663 ("uio_td %p is not p %p", uio->uio_td, curthread));
664 vp = (struct vnode *)fp->f_data;
665#if 0
666 /* VOP_WRITE should handle this now */
667 if (vp->v_type == VREG || vp->v_type == VDATABASE)
668 bwillwrite();
669#endif
670 vp = (struct vnode *)fp->f_data; /* XXX needed? */
671
672 ioflag = IO_UNIT;
673 if (vp->v_type == VREG &&
674 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
675 ioflag |= IO_APPEND;
676 }
677
678 if (flags & O_FBLOCKING) {
679 /* ioflag &= ~IO_NDELAY; */
680 } else if (flags & O_FNONBLOCKING) {
681 ioflag |= IO_NDELAY;
682 } else if (fp->f_flag & FNONBLOCK) {
683 ioflag |= IO_NDELAY;
684 }
685 if (flags & O_FBUFFERED) {
686 /* ioflag &= ~IO_DIRECT; */
687 } else if (flags & O_FUNBUFFERED) {
688 ioflag |= IO_DIRECT;
689 } else if (fp->f_flag & O_DIRECT) {
690 ioflag |= IO_DIRECT;
691 }
692 if (flags & O_FASYNCWRITE) {
693 /* ioflag &= ~IO_SYNC; */
694 } else if (flags & O_FSYNCWRITE) {
695 ioflag |= IO_SYNC;
696 } else if (fp->f_flag & O_FSYNC) {
697 ioflag |= IO_SYNC;
698 }
699
700 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
701 ioflag |= IO_SYNC;
702 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
703 if ((flags & O_FOFFSET) == 0)
704 uio->uio_offset = fp->f_offset;
705 ioflag |= sequential_heuristic(uio, fp);
706 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
707 error = VOP_WRITE(vp, uio, ioflag, cred);
708 ccms_lock_put(&vp->v_ccms, &ccms_lock);
709 if ((flags & O_FOFFSET) == 0)
710 fp->f_offset = uio->uio_offset;
711 fp->f_nextoff = uio->uio_offset;
712 vn_unlock(vp);
713 rel_mplock();
714 return (error);
715}
716
717/*
718 * Device-optimized file table vnode write routine.
719 *
720 * This bypasses the VOP table and talks directly to the device. Most
721 * filesystems just route to specfs and can make this optimization.
722 *
723 * MPALMOSTSAFE - acquires mplock
724 */
725static int
726svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
727{
728 struct vnode *vp;
729 int ioflag;
730 int error;
731 cdev_t dev;
732
733 get_mplock();
734 KASSERT(uio->uio_td == curthread,
735 ("uio_td %p is not p %p", uio->uio_td, curthread));
736
737 vp = (struct vnode *)fp->f_data;
738 if (vp == NULL || vp->v_type == VBAD) {
739 error = EBADF;
740 goto done;
741 }
742 if (vp->v_type == VREG)
743 bwillwrite(uio->uio_resid);
744 vp = (struct vnode *)fp->f_data; /* XXX needed? */
745
746 if ((dev = vp->v_rdev) == NULL) {
747 error = EBADF;
748 goto done;
749 }
750 reference_dev(dev);
751
752 if ((flags & O_FOFFSET) == 0)
753 uio->uio_offset = fp->f_offset;
754
755 ioflag = IO_UNIT;
756 if (vp->v_type == VREG &&
757 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
758 ioflag |= IO_APPEND;
759 }
760
761 if (flags & O_FBLOCKING) {
762 /* ioflag &= ~IO_NDELAY; */
763 } else if (flags & O_FNONBLOCKING) {
764 ioflag |= IO_NDELAY;
765 } else if (fp->f_flag & FNONBLOCK) {
766 ioflag |= IO_NDELAY;
767 }
768 if (flags & O_FBUFFERED) {
769 /* ioflag &= ~IO_DIRECT; */
770 } else if (flags & O_FUNBUFFERED) {
771 ioflag |= IO_DIRECT;
772 } else if (fp->f_flag & O_DIRECT) {
773 ioflag |= IO_DIRECT;
774 }
775 if (flags & O_FASYNCWRITE) {
776 /* ioflag &= ~IO_SYNC; */
777 } else if (flags & O_FSYNCWRITE) {
778 ioflag |= IO_SYNC;
779 } else if (fp->f_flag & O_FSYNC) {
780 ioflag |= IO_SYNC;
781 }
782
783 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
784 ioflag |= IO_SYNC;
785 ioflag |= sequential_heuristic(uio, fp);
786
787 error = dev_dwrite(dev, uio, ioflag);
788
789 release_dev(dev);
790 if ((flags & O_FOFFSET) == 0)
791 fp->f_offset = uio->uio_offset;
792 fp->f_nextoff = uio->uio_offset;
793done:
794 rel_mplock();
795 return (error);
796}
797
798/*
799 * MPALMOSTSAFE - acquires mplock
800 */
801static int
802vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
803{
804 struct vnode *vp;
805 int error;
806
807 get_mplock();
808 vp = (struct vnode *)fp->f_data;
809 error = vn_stat(vp, sb, cred);
810 rel_mplock();
811 return (error);
812}
813
814int
815vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
816{
817 struct vattr vattr;
818 struct vattr *vap;
819 int error;
820 u_short mode;
821 cdev_t dev;
822
823 vap = &vattr;
824 error = VOP_GETATTR(vp, vap);
825 if (error)
826 return (error);
827
828 /*
829 * Zero the spare stat fields
830 */
831 sb->st_lspare = 0;
832 sb->st_qspare = 0;
833
834 /*
835 * Copy from vattr table
836 */
837 if (vap->va_fsid != VNOVAL)
838 sb->st_dev = vap->va_fsid;
839 else
840 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
841 sb->st_ino = vap->va_fileid;
842 mode = vap->va_mode;
843 switch (vap->va_type) {
844 case VREG:
845 mode |= S_IFREG;
846 break;
847 case VDATABASE:
848 mode |= S_IFDB;
849 break;
850 case VDIR:
851 mode |= S_IFDIR;
852 break;
853 case VBLK:
854 mode |= S_IFBLK;
855 break;
856 case VCHR:
857 mode |= S_IFCHR;
858 break;
859 case VLNK:
860 mode |= S_IFLNK;
861 /* This is a cosmetic change, symlinks do not have a mode. */
862 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
863 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
864 else
865 sb->st_mode |= ACCESSPERMS; /* 0777 */
866 break;
867 case VSOCK:
868 mode |= S_IFSOCK;
869 break;
870 case VFIFO:
871 mode |= S_IFIFO;
872 break;
873 default:
874 return (EBADF);
875 }
876 sb->st_mode = mode;
877 if (vap->va_nlink > (nlink_t)-1)
878 sb->st_nlink = (nlink_t)-1;
879 else
880 sb->st_nlink = vap->va_nlink;
881 sb->st_uid = vap->va_uid;
882 sb->st_gid = vap->va_gid;
883 sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
884 sb->st_size = vap->va_size;
885 sb->st_atimespec = vap->va_atime;
886 sb->st_mtimespec = vap->va_mtime;
887 sb->st_ctimespec = vap->va_ctime;
888
889 /*
890 * A VCHR and VBLK device may track the last access and last modified
891 * time independently of the filesystem. This is particularly true
892 * because device read and write calls may bypass the filesystem.
893 */
894 if (vp->v_type == VCHR || vp->v_type == VBLK) {
895 dev = vp->v_rdev;
896 if (dev != NULL) {
897 if (dev->si_lastread) {
898 sb->st_atimespec.tv_sec = dev->si_lastread;
899 sb->st_atimespec.tv_nsec = 0;
900 }
901 if (dev->si_lastwrite) {
902 sb->st_mtimespec.tv_sec = dev->si_lastwrite;
903 sb->st_mtimespec.tv_nsec = 0;
904 }
905 }
906 }
907
908 /*
909 * According to www.opengroup.org, the meaning of st_blksize is
910 * "a filesystem-specific preferred I/O block size for this
911 * object. In some filesystem types, this may vary from file
912 * to file"
913 * Default to PAGE_SIZE after much discussion.
914 */
915
916 if (vap->va_type == VREG) {
917 sb->st_blksize = vap->va_blocksize;
918 } else if (vn_isdisk(vp, NULL)) {
919 /*
920 * XXX this is broken. If the device is not yet open (aka
921 * stat() call, aka v_rdev == NULL), how are we supposed
922 * to get a valid block size out of it?
923 */
924 dev = vp->v_rdev;
925 if (dev == NULL && vp->v_type == VCHR) {
926 dev = get_dev(vp->v_umajor, vp->v_uminor);
927 }
928 sb->st_blksize = dev->si_bsize_best;
929 if (sb->st_blksize < dev->si_bsize_phys)
930 sb->st_blksize = dev->si_bsize_phys;
931 if (sb->st_blksize < BLKDEV_IOSIZE)
932 sb->st_blksize = BLKDEV_IOSIZE;
933 } else {
934 sb->st_blksize = PAGE_SIZE;
935 }
936
937 sb->st_flags = vap->va_flags;
938
939 error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
940 if (error)
941 sb->st_gen = 0;
942 else
943 sb->st_gen = (u_int32_t)vap->va_gen;
944
945 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
946 sb->st_fsmid = vap->va_fsmid;
947 return (0);
948}
949
950/*
951 * MPALMOSTSAFE - acquires mplock
952 */
953static int
954vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
955{
956 struct vnode *vp = ((struct vnode *)fp->f_data);
957 struct vnode *ovp;
958 struct vattr vattr;
959 int error;
960
961 get_mplock();
962
963 switch (vp->v_type) {
964 case VREG:
965 case VDIR:
966 if (com == FIONREAD) {
967 error = VOP_GETATTR(vp, &vattr);
968 if (error)
969 break;
970 *(int *)data = vattr.va_size - fp->f_offset;
971 error = 0;
972 break;
973 }
974 if (com == FIOASYNC) { /* XXX */
975 error = 0; /* XXX */
976 break;
977 }
978 /* fall into ... */
979 default:
980#if 0
981 return (ENOTTY);
982#endif
983 case VFIFO:
984 case VCHR:
985 case VBLK:
986 if (com == FIODTYPE) {
987 if (vp->v_type != VCHR && vp->v_type != VBLK) {
988 error = ENOTTY;
989 break;
990 }
991 *(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
992 error = 0;
993 break;
994 }
995 error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
996 if (error == 0 && com == TIOCSCTTY) {
997 struct proc *p = curthread->td_proc;
998 struct session *sess;
999
1000 if (p == NULL) {
1001 error = ENOTTY;
1002 break;
1003 }
1004
1005 sess = p->p_session;
1006 /* Do nothing if reassigning same control tty */
1007 if (sess->s_ttyvp == vp) {
1008 error = 0;
1009 break;
1010 }
1011
1012 /* Get rid of reference to old control tty */
1013 ovp = sess->s_ttyvp;
1014 vref(vp);
1015 sess->s_ttyvp = vp;
1016 if (ovp)
1017 vrele(ovp);
1018 }
1019 break;
1020 }
1021 rel_mplock();
1022 return (error);
1023}
1024
1025/*
1026 * MPALMOSTSAFE - acquires mplock
1027 */
1028static int
1029vn_poll(struct file *fp, int events, struct ucred *cred)
1030{
1031 int error;
1032
1033 get_mplock();
1034 error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
1035 rel_mplock();
1036 return (error);
1037}
1038
1039/*
1040 * Check that the vnode is still valid, and if so
1041 * acquire requested lock.
1042 */
1043int
1044#ifndef DEBUG_LOCKS
1045vn_lock(struct vnode *vp, int flags)
1046#else
1047debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
1048#endif
1049{
1050 int error;
1051
1052 do {
1053#ifdef DEBUG_LOCKS
1054 vp->filename = filename;
1055 vp->line = line;
1056 error = debuglockmgr(&vp->v_lock, flags,
1057 "vn_lock", filename, line);
1058#else
1059 error = lockmgr(&vp->v_lock, flags);
1060#endif
1061 if (error == 0)
1062 break;
1063 } while (flags & LK_RETRY);
1064
1065 /*
1066 * Because we (had better!) have a ref on the vnode, once it
1067 * goes to VRECLAIMED state it will not be recycled until all
1068 * refs go away. So we can just check the flag.
1069 */
1070 if (error == 0 && (vp->v_flag & VRECLAIMED)) {
1071 lockmgr(&vp->v_lock, LK_RELEASE);
1072 error = ENOENT;
1073 }
1074 return (error);
1075}
1076
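/*
 * Illustrative sketch (not compiled): the usual pattern for locking a
 * vnode with vn_lock().  The caller must already hold a reference;
 * with LK_RETRY the only error is ENOENT, meaning the vnode was
 * reclaimed while we waited for the lock.
 */
#if 0
static void
vnode_lock_example(struct vnode *vp)
{
	vref(vp);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_RETRY) == 0) {
		/* vp is locked and was not reclaimed; operate on it here */
		vn_unlock(vp);
	}
	vrele(vp);
}
#endif
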
1077void
1078vn_unlock(struct vnode *vp)
1079{
1080 lockmgr(&vp->v_lock, LK_RELEASE);
1081}
1082
1083int
1084vn_islocked(struct vnode *vp)
1085{
1086 return (lockstatus(&vp->v_lock, curthread));
1087}
1088
1089/*
1090 * MPALMOSTSAFE - acquires mplock
1091 */
1092static int
1093vn_closefile(struct file *fp)
1094{
1095 int error;
1096
1097 get_mplock();
1098 fp->f_ops = &badfileops;
1099 error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
1100 rel_mplock();
1101 return (error);
1102}
1103
1104/*
1105 * MPALMOSTSAFE - acquires mplock
1106 */
1107static int
1108vn_kqfilter(struct file *fp, struct knote *kn)
1109{
1110 int error;
1111
1112 get_mplock();
1113 error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
1114 rel_mplock();
1115 return (error);
1116}