Add kernel-layer support for chflags checks, remove (most) from the VFS layer.
[dragonfly.git] / sys / kern / vfs_vnops.c
CommitLineData
984263bc
MD
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
c4df9635 40 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
984263bc
MD
41 */
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/fcntl.h>
46#include <sys/file.h>
47#include <sys/stat.h>
48#include <sys/proc.h>
895c1f85 49#include <sys/priv.h>
984263bc 50#include <sys/mount.h>
fad57d0e 51#include <sys/nlookup.h>
984263bc
MD
52#include <sys/vnode.h>
53#include <sys/buf.h>
54#include <sys/filio.h>
55#include <sys/ttycom.h>
56#include <sys/conf.h>
57#include <sys/syslog.h>
58
87de5057
MD
59static int vn_closefile (struct file *fp);
60static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
61 struct ucred *cred);
402ed7e1 62static int vn_read (struct file *fp, struct uio *uio,
87de5057 63 struct ucred *cred, int flags);
fad57d0e 64static int svn_read (struct file *fp, struct uio *uio,
87de5057
MD
65 struct ucred *cred, int flags);
66static int vn_poll (struct file *fp, int events, struct ucred *cred);
402ed7e1 67static int vn_kqfilter (struct file *fp, struct knote *kn);
87de5057 68static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
402ed7e1 69static int vn_write (struct file *fp, struct uio *uio,
87de5057 70 struct ucred *cred, int flags);
fad57d0e 71static int svn_write (struct file *fp, struct uio *uio,
87de5057 72 struct ucred *cred, int flags);
984263bc 73
fad57d0e 74struct fileops vnode_fileops = {
b2d248cb
MD
75 .fo_read = vn_read,
76 .fo_write = vn_write,
77 .fo_ioctl = vn_ioctl,
78 .fo_poll = vn_poll,
79 .fo_kqfilter = vn_kqfilter,
80 .fo_stat = vn_statfile,
81 .fo_close = vn_closefile,
82 .fo_shutdown = nofo_shutdown
984263bc
MD
83};
84
fad57d0e 85struct fileops specvnode_fileops = {
b2d248cb
MD
86 .fo_read = svn_read,
87 .fo_write = svn_write,
88 .fo_ioctl = vn_ioctl,
89 .fo_poll = vn_poll,
90 .fo_kqfilter = vn_kqfilter,
91 .fo_stat = vn_statfile,
92 .fo_close = vn_closefile,
93 .fo_shutdown = nofo_shutdown
fad57d0e
MD
94};
95
96/*
97 * Shortcut the device read/write. This avoids a lot of vnode junk.
98 * Basically the specfs vnops for read and write take the locked vnode,
99 * unlock it (because we can't hold the vnode locked while reading or writing
100 * a device which may block indefinitely), issues the device operation, then
101 * relock the vnode before returning, plus other junk. This bypasses all
102 * of that and just does the device operation.
103 */
104void
105vn_setspecops(struct file *fp)
106{
107 if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
108 fp->f_ops = &specvnode_fileops;
109 }
110}
111
984263bc 112/*
fad57d0e
MD
113 * Common code for vnode open operations. Check permissions, and call
114 * the VOP_NOPEN or VOP_NCREATE routine.
115 *
116 * The caller is responsible for setting up nd with nlookup_init() and
117 * for cleaning it up with nlookup_done(), whether we return an error
118 * or not.
119 *
120 * On success nd->nl_open_vp will hold a referenced and, if requested,
121 * locked vnode. A locked vnode is requested via NLC_LOCKVP. If fp
122 * is non-NULL the vnode will be installed in the file pointer.
123 *
124 * NOTE: The vnode is referenced just once on return whether or not it
125 * is also installed in the file pointer.
984263bc
MD
126 */
127int
fad57d0e 128vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
984263bc 129{
1fd87d54 130 struct vnode *vp;
fad57d0e 131 struct ucred *cred = nd->nl_cred;
984263bc
MD
132 struct vattr vat;
133 struct vattr *vap = &vat;
3a907475 134 int error;
984263bc 135
fad57d0e
MD
136 /*
137 * Lookup the path and create or obtain the vnode. After a
28623bf9 138 * successful lookup a locked nd->nl_nch will be returned.
fad57d0e
MD
139 *
140 * The result of this section should be a locked vnode.
141 *
142 * XXX with only a little work we should be able to avoid locking
143 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
144 */
3a907475
MD
145 nd->nl_flags |= NLC_OPEN;
146 if (fmode & O_APPEND)
147 nd->nl_flags |= NLC_APPEND;
148 if (fmode & O_TRUNC)
149 nd->nl_flags |= NLC_TRUNCATE;
150 if (fmode & FREAD)
151 nd->nl_flags |= NLC_READ;
152 if (fmode & FWRITE)
153 nd->nl_flags |= NLC_WRITE;
154
984263bc 155 if (fmode & O_CREAT) {
fad57d0e
MD
156 /*
157 * CONDITIONAL CREATE FILE CASE
158 *
159 * Setting NLC_CREATE causes a negative hit to store
160 * the negative hit ncp and not return an error. Then
161 * nc_error or nc_vp may be checked to see if the ncp
162 * represents a negative hit. NLC_CREATE also requires
163 * write permission on the governing directory or EPERM
164 * is returned.
165 */
984263bc 166 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
fad57d0e
MD
167 nd->nl_flags |= NLC_FOLLOW;
168 nd->nl_flags |= NLC_CREATE;
5312fa43 169 nd->nl_flags |= NLC_REFDVP;
c4df9635 170 bwillinode(1);
fad57d0e 171 error = nlookup(nd);
806dcf9a
MD
172 } else {
173 /*
174 * NORMAL OPEN FILE CASE
175 */
176 error = nlookup(nd);
177 }
fad57d0e 178
806dcf9a
MD
179 if (error)
180 return (error);
fad57d0e 181
806dcf9a
MD
182 /*
183 * split case to allow us to re-resolve and retry the ncp in case
184 * we get ESTALE.
185 */
186again:
187 if (fmode & O_CREAT) {
28623bf9
MD
188 if (nd->nl_nch.ncp->nc_vp == NULL) {
189 if ((error = ncp_writechk(&nd->nl_nch)) != 0)
468bb1f9 190 return (error);
984263bc
MD
191 VATTR_NULL(vap);
192 vap->va_type = VREG;
193 vap->va_mode = cmode;
194 if (fmode & O_EXCL)
195 vap->va_vaflags |= VA_EXCLUSIVE;
5312fa43 196 error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
dff430ab 197 nd->nl_cred, vap);
fad57d0e 198 if (error)
984263bc 199 return (error);
984263bc 200 fmode &= ~O_TRUNC;
fad57d0e 201 /* locked vnode is returned */
984263bc 202 } else {
984263bc
MD
203 if (fmode & O_EXCL) {
204 error = EEXIST;
fad57d0e 205 } else {
28623bf9 206 error = cache_vget(&nd->nl_nch, cred,
fad57d0e 207 LK_EXCLUSIVE, &vp);
984263bc 208 }
fad57d0e
MD
209 if (error)
210 return (error);
984263bc
MD
211 fmode &= ~O_CREAT;
212 }
213 } else {
28623bf9 214 error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
984263bc
MD
215 if (error)
216 return (error);
984263bc 217 }
fad57d0e
MD
218
219 /*
806dcf9a 220 * We have a locked vnode and ncp now. Note that the ncp will
28623bf9 221 * be cleaned up by the caller if nd->nl_nch is left intact.
fad57d0e 222 */
984263bc
MD
223 if (vp->v_type == VLNK) {
224 error = EMLINK;
225 goto bad;
226 }
227 if (vp->v_type == VSOCK) {
228 error = EOPNOTSUPP;
229 goto bad;
230 }
231 if ((fmode & O_CREAT) == 0) {
984263bc
MD
232 if (fmode & (FWRITE | O_TRUNC)) {
233 if (vp->v_type == VDIR) {
234 error = EISDIR;
235 goto bad;
236 }
28623bf9 237 error = vn_writechk(vp, &nd->nl_nch);
806dcf9a
MD
238 if (error) {
239 /*
240 * Special stale handling, re-resolve the
241 * vnode.
242 */
243 if (error == ESTALE) {
244 vput(vp);
245 vp = NULL;
28623bf9
MD
246 cache_setunresolved(&nd->nl_nch);
247 error = cache_resolve(&nd->nl_nch, cred);
806dcf9a
MD
248 if (error == 0)
249 goto again;
250 }
984263bc 251 goto bad;
806dcf9a 252 }
984263bc
MD
253 }
254 }
255 if (fmode & O_TRUNC) {
a11aaa81 256 vn_unlock(vp); /* XXX */
ca466bae 257 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
984263bc
MD
258 VATTR_NULL(vap);
259 vap->va_size = 0;
87de5057 260 error = VOP_SETATTR(vp, vap, cred);
984263bc
MD
261 if (error)
262 goto bad;
263 }
fad57d0e
MD
264
265 /*
266 * Setup the fp so VOP_OPEN can override it. No descriptor has been
72310cfb
MD
267 * associated with the fp yet so we own it clean.
268 *
28623bf9 269 * f_nchandle inherits nl_nch. This used to be necessary only for
72310cfb
MD
270 * directories but now we do it unconditionally so f*() ops
271 * such as fchmod() can access the actual namespace that was
272 * used to open the file.
fad57d0e
MD
273 */
274 if (fp) {
3a907475
MD
275 if (nd->nl_flags & NLC_APPENDONLY)
276 fmode |= FAPPENDONLY;
28623bf9
MD
277 fp->f_nchandle = nd->nl_nch;
278 cache_zero(&nd->nl_nch);
279 cache_unlock(&fp->f_nchandle);
fad57d0e
MD
280 }
281
282 /*
28623bf9
MD
283 * Get rid of nl_nch. vn_open does not return it (it returns the
284 * vnode or the file pointer). Note: we can't leave nl_nch locked
fad57d0e
MD
285 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
286 * on /dev/ttyd0
287 */
28623bf9
MD
288 if (nd->nl_nch.ncp)
289 cache_put(&nd->nl_nch);
fad57d0e 290
87de5057 291 error = VOP_OPEN(vp, fmode, cred, fp);
fad57d0e
MD
292 if (error) {
293 /*
294 * setting f_ops to &badfileops will prevent the descriptor
295 * code from trying to close and release the vnode, since
296 * the open failed we do not want to call close.
297 */
675eb4c0
MD
298 if (fp) {
299 fp->f_data = NULL;
300 fp->f_ops = &badfileops;
301 }
984263bc 302 goto bad;
fad57d0e 303 }
fad57d0e 304
7540ab49 305#if 0
984263bc 306 /*
7540ab49 307 * Assert that VREG files have been setup for vmio.
984263bc 308 */
7540ab49
MD
309 KASSERT(vp->v_type != VREG || vp->v_object != NULL,
310 ("vn_open: regular file was not VMIO enabled!"));
311#endif
984263bc 312
fad57d0e
MD
313 /*
314 * Return the vnode. XXX needs some cleaning up. The vnode is
8ddc6004 315 * only returned in the fp == NULL case.
fad57d0e
MD
316 */
317 if (fp == NULL) {
318 nd->nl_open_vp = vp;
319 nd->nl_vp_fmode = fmode;
320 if ((nd->nl_flags & NLC_LOCKVP) == 0)
a11aaa81 321 vn_unlock(vp);
fad57d0e 322 } else {
8ddc6004 323 vput(vp);
fad57d0e 324 }
984263bc
MD
325 return (0);
326bad:
bb5c9c00
MD
327 if (vp)
328 vput(vp);
984263bc
MD
329 return (error);
330}
331
a8873631
MD
332int
333vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
334{
335 struct vnode *vp;
336 int error;
337
338 if (strncmp(devname, "/dev/", 5) == 0)
339 devname += 5;
340 if ((vp = getsynthvnode(devname)) == NULL) {
341 error = ENODEV;
342 } else {
343 error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
344 vn_unlock(vp);
345 if (error) {
346 vrele(vp);
347 vp = NULL;
348 }
349 }
350 *vpp = vp;
351 return (error);
352}
353
984263bc 354/*
28623bf9 355 * Check for write permissions on the specified vnode. nch may be NULL.
984263bc
MD
356 */
357int
28623bf9 358vn_writechk(struct vnode *vp, struct nchandle *nch)
984263bc 359{
984263bc
MD
360 /*
361 * If there's shared text associated with
362 * the vnode, try to free it up once. If
363 * we fail, we can't allow writing.
364 */
365 if (vp->v_flag & VTEXT)
366 return (ETXTBSY);
468bb1f9
MD
367
368 /*
369 * If the vnode represents a regular file, check the mount
28623bf9 370 * point via the nch. This may be a different mount point
468bb1f9
MD
371 * then the one embedded in the vnode (e.g. nullfs).
372 *
373 * We can still write to non-regular files (e.g. devices)
374 * via read-only mounts.
375 */
28623bf9
MD
376 if (nch && nch->ncp && vp->v_type == VREG)
377 return (ncp_writechk(nch));
984263bc
MD
378 return (0);
379}
380
381/*
468bb1f9
MD
382 * Check whether the underlying mount is read-only. The mount point
383 * referenced by the namecache may be different from the mount point
384 * used by the underlying vnode in the case of NULLFS, so a separate
385 * check is needed.
386 */
468bb1f9 387int
28623bf9 388ncp_writechk(struct nchandle *nch)
468bb1f9 389{
28623bf9 390 if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
468bb1f9
MD
391 return (EROFS);
392 return(0);
393}
394
395/*
984263bc
MD
396 * Vnode close call
397 */
398int
87de5057 399vn_close(struct vnode *vp, int flags)
984263bc
MD
400{
401 int error;
402
4698dfb3
MN
403 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
404 if (error == 0) {
87de5057 405 error = VOP_CLOSE(vp, flags);
a11aaa81 406 vn_unlock(vp);
5fd012e0 407 }
984263bc
MD
408 vrele(vp);
409 return (error);
410}
411
412static __inline
413int
414sequential_heuristic(struct uio *uio, struct file *fp)
415{
416 /*
417 * Sequential heuristic - detect sequential operation
418 */
419 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
420 uio->uio_offset == fp->f_nextoff) {
421 int tmpseq = fp->f_seqcount;
422 /*
423 * XXX we assume that the filesystem block size is
424 * the default. Not true, but still gives us a pretty
425 * good indicator of how sequential the read operations
426 * are.
427 */
428 tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
429 if (tmpseq > IO_SEQMAX)
430 tmpseq = IO_SEQMAX;
431 fp->f_seqcount = tmpseq;
432 return(fp->f_seqcount << IO_SEQSHIFT);
433 }
434
435 /*
436 * Not sequential, quick draw-down of seqcount
437 */
438 if (fp->f_seqcount > 1)
439 fp->f_seqcount = 1;
440 else
441 fp->f_seqcount = 0;
442 return(0);
443}
444
445/*
446 * Package up an I/O request on a vnode into a uio and do it.
447 */
448int
87de5057
MD
449vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
450 off_t offset, enum uio_seg segflg, int ioflg,
451 struct ucred *cred, int *aresid)
984263bc
MD
452{
453 struct uio auio;
454 struct iovec aiov;
9bfc4d6d 455 struct ccms_lock ccms_lock;
984263bc
MD
456 int error;
457
458 if ((ioflg & IO_NODELOCKED) == 0)
ca466bae 459 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
984263bc
MD
460 auio.uio_iov = &aiov;
461 auio.uio_iovcnt = 1;
462 aiov.iov_base = base;
463 aiov.iov_len = len;
464 auio.uio_resid = len;
465 auio.uio_offset = offset;
466 auio.uio_segflg = segflg;
467 auio.uio_rw = rw;
87de5057 468 auio.uio_td = curthread;
9bfc4d6d 469 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
984263bc
MD
470 if (rw == UIO_READ) {
471 error = VOP_READ(vp, &auio, ioflg, cred);
472 } else {
473 error = VOP_WRITE(vp, &auio, ioflg, cred);
474 }
9bfc4d6d 475 ccms_lock_put(&vp->v_ccms, &ccms_lock);
984263bc
MD
476 if (aresid)
477 *aresid = auio.uio_resid;
478 else
479 if (auio.uio_resid && error == 0)
480 error = EIO;
481 if ((ioflg & IO_NODELOCKED) == 0)
a11aaa81 482 vn_unlock(vp);
984263bc
MD
483 return (error);
484}
485
486/*
487 * Package up an I/O request on a vnode into a uio and do it. The I/O
488 * request is split up into smaller chunks and we try to avoid saturating
489 * the buffer cache while potentially holding a vnode locked, so we
490 * check bwillwrite() before calling vn_rdwr(). We also call uio_yield()
491 * to give other processes a chance to lock the vnode (either other processes
492 * core'ing the same binary, or unrelated processes scanning the directory).
493 */
494int
87de5057
MD
495vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
496 off_t offset, enum uio_seg segflg, int ioflg,
497 struct ucred *cred, int *aresid)
984263bc
MD
498{
499 int error = 0;
500
501 do {
9a0222ac 502 int chunk;
984263bc 503
9a0222ac
DR
504 /*
505 * Force `offset' to a multiple of MAXBSIZE except possibly
506 * for the first chunk, so that filesystems only need to
507 * write full blocks except possibly for the first and last
508 * chunks.
509 */
510 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
511
512 if (chunk > len)
513 chunk = len;
c4df9635
MD
514 if (vp->v_type == VREG) {
515 switch(rw) {
516 case UIO_READ:
517 bwillread(chunk);
518 break;
519 case UIO_WRITE:
520 bwillwrite(chunk);
521 break;
522 }
523 }
984263bc 524 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
87de5057 525 ioflg, cred, aresid);
984263bc
MD
526 len -= chunk; /* aresid calc already includes length */
527 if (error)
528 break;
529 offset += chunk;
530 base += chunk;
531 uio_yield();
532 } while (len);
533 if (aresid)
534 *aresid += len;
535 return (error);
536}
537
538/*
d9b2033e 539 * MPALMOSTSAFE - acquires mplock
984263bc
MD
540 */
541static int
87de5057 542vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
984263bc 543{
9bfc4d6d 544 struct ccms_lock ccms_lock;
984263bc
MD
545 struct vnode *vp;
546 int error, ioflag;
547
d9b2033e 548 get_mplock();
87de5057
MD
549 KASSERT(uio->uio_td == curthread,
550 ("uio_td %p is not td %p", uio->uio_td, curthread));
984263bc 551 vp = (struct vnode *)fp->f_data;
9ba76b73 552
984263bc 553 ioflag = 0;
9ba76b73
MD
554 if (flags & O_FBLOCKING) {
555 /* ioflag &= ~IO_NDELAY; */
556 } else if (flags & O_FNONBLOCKING) {
557 ioflag |= IO_NDELAY;
558 } else if (fp->f_flag & FNONBLOCK) {
984263bc 559 ioflag |= IO_NDELAY;
9ba76b73
MD
560 }
561 if (flags & O_FBUFFERED) {
562 /* ioflag &= ~IO_DIRECT; */
563 } else if (flags & O_FUNBUFFERED) {
564 ioflag |= IO_DIRECT;
565 } else if (fp->f_flag & O_DIRECT) {
984263bc 566 ioflag |= IO_DIRECT;
9ba76b73 567 }
ab6f251b 568 vn_lock(vp, LK_SHARED | LK_RETRY);
9ba76b73 569 if ((flags & O_FOFFSET) == 0)
984263bc 570 uio->uio_offset = fp->f_offset;
984263bc
MD
571 ioflag |= sequential_heuristic(uio, fp);
572
9bfc4d6d 573 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
984263bc 574 error = VOP_READ(vp, uio, ioflag, cred);
9bfc4d6d 575 ccms_lock_put(&vp->v_ccms, &ccms_lock);
9ba76b73 576 if ((flags & O_FOFFSET) == 0)
984263bc
MD
577 fp->f_offset = uio->uio_offset;
578 fp->f_nextoff = uio->uio_offset;
a11aaa81 579 vn_unlock(vp);
d9b2033e 580 rel_mplock();
984263bc
MD
581 return (error);
582}
583
584/*
fad57d0e
MD
585 * Device-optimized file table vnode read routine.
586 *
587 * This bypasses the VOP table and talks directly to the device. Most
588 * filesystems just route to specfs and can make this optimization.
d9b2033e
MD
589 *
590 * MPALMOSTSAFE - acquires mplock
fad57d0e
MD
591 */
592static int
87de5057 593svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
fad57d0e
MD
594{
595 struct vnode *vp;
596 int ioflag;
597 int error;
b13267a5 598 cdev_t dev;
fad57d0e 599
d9b2033e 600 get_mplock();
87de5057
MD
601 KASSERT(uio->uio_td == curthread,
602 ("uio_td %p is not td %p", uio->uio_td, curthread));
fad57d0e
MD
603
604 vp = (struct vnode *)fp->f_data;
d9b2033e
MD
605 if (vp == NULL || vp->v_type == VBAD) {
606 error = EBADF;
607 goto done;
608 }
fad57d0e 609
d9b2033e
MD
610 if ((dev = vp->v_rdev) == NULL) {
611 error = EBADF;
612 goto done;
613 }
fad57d0e
MD
614 reference_dev(dev);
615
d9b2033e
MD
616 if (uio->uio_resid == 0) {
617 error = 0;
618 goto done;
619 }
9ba76b73 620 if ((flags & O_FOFFSET) == 0)
fad57d0e
MD
621 uio->uio_offset = fp->f_offset;
622
623 ioflag = 0;
9ba76b73
MD
624 if (flags & O_FBLOCKING) {
625 /* ioflag &= ~IO_NDELAY; */
626 } else if (flags & O_FNONBLOCKING) {
627 ioflag |= IO_NDELAY;
628 } else if (fp->f_flag & FNONBLOCK) {
fad57d0e 629 ioflag |= IO_NDELAY;
9ba76b73
MD
630 }
631 if (flags & O_FBUFFERED) {
632 /* ioflag &= ~IO_DIRECT; */
633 } else if (flags & O_FUNBUFFERED) {
634 ioflag |= IO_DIRECT;
635 } else if (fp->f_flag & O_DIRECT) {
fad57d0e 636 ioflag |= IO_DIRECT;
9ba76b73 637 }
fad57d0e
MD
638 ioflag |= sequential_heuristic(uio, fp);
639
640 error = dev_dread(dev, uio, ioflag);
641
642 release_dev(dev);
9ba76b73 643 if ((flags & O_FOFFSET) == 0)
fad57d0e
MD
644 fp->f_offset = uio->uio_offset;
645 fp->f_nextoff = uio->uio_offset;
d9b2033e
MD
646done:
647 rel_mplock();
fad57d0e
MD
648 return (error);
649}
650
651/*
d9b2033e 652 * MPALMOSTSAFE - acquires mplock
984263bc
MD
653 */
654static int
87de5057 655vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
984263bc 656{
9bfc4d6d 657 struct ccms_lock ccms_lock;
984263bc
MD
658 struct vnode *vp;
659 int error, ioflag;
660
d9b2033e 661 get_mplock();
87de5057 662 KASSERT(uio->uio_td == curthread,
f4d08668 663 ("uio_td %p is not p %p", uio->uio_td, curthread));
984263bc 664 vp = (struct vnode *)fp->f_data;
c4df9635
MD
665#if 0
666 /* VOP_WRITE should handle this now */
2494f282 667 if (vp->v_type == VREG || vp->v_type == VDATABASE)
984263bc 668 bwillwrite();
c4df9635 669#endif
984263bc 670 vp = (struct vnode *)fp->f_data; /* XXX needed? */
9ba76b73 671
984263bc 672 ioflag = IO_UNIT;
9ba76b73
MD
673 if (vp->v_type == VREG &&
674 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
984263bc 675 ioflag |= IO_APPEND;
9ba76b73
MD
676 }
677
678 if (flags & O_FBLOCKING) {
679 /* ioflag &= ~IO_NDELAY; */
680 } else if (flags & O_FNONBLOCKING) {
984263bc 681 ioflag |= IO_NDELAY;
9ba76b73
MD
682 } else if (fp->f_flag & FNONBLOCK) {
683 ioflag |= IO_NDELAY;
684 }
685 if (flags & O_FBUFFERED) {
686 /* ioflag &= ~IO_DIRECT; */
687 } else if (flags & O_FUNBUFFERED) {
688 ioflag |= IO_DIRECT;
689 } else if (fp->f_flag & O_DIRECT) {
984263bc 690 ioflag |= IO_DIRECT;
9ba76b73
MD
691 }
692 if (flags & O_FASYNCWRITE) {
693 /* ioflag &= ~IO_SYNC; */
694 } else if (flags & O_FSYNCWRITE) {
695 ioflag |= IO_SYNC;
696 } else if (fp->f_flag & O_FSYNC) {
697 ioflag |= IO_SYNC;
698 }
699
700 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
984263bc 701 ioflag |= IO_SYNC;
ca466bae 702 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
9ba76b73 703 if ((flags & O_FOFFSET) == 0)
984263bc
MD
704 uio->uio_offset = fp->f_offset;
705 ioflag |= sequential_heuristic(uio, fp);
9bfc4d6d 706 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
984263bc 707 error = VOP_WRITE(vp, uio, ioflag, cred);
9bfc4d6d 708 ccms_lock_put(&vp->v_ccms, &ccms_lock);
9ba76b73 709 if ((flags & O_FOFFSET) == 0)
984263bc
MD
710 fp->f_offset = uio->uio_offset;
711 fp->f_nextoff = uio->uio_offset;
a11aaa81 712 vn_unlock(vp);
d9b2033e 713 rel_mplock();
984263bc
MD
714 return (error);
715}
716
717/*
fad57d0e
MD
718 * Device-optimized file table vnode write routine.
719 *
720 * This bypasses the VOP table and talks directly to the device. Most
721 * filesystems just route to specfs and can make this optimization.
d9b2033e
MD
722 *
723 * MPALMOSTSAFE - acquires mplock
fad57d0e
MD
724 */
725static int
87de5057 726svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
fad57d0e
MD
727{
728 struct vnode *vp;
729 int ioflag;
730 int error;
b13267a5 731 cdev_t dev;
fad57d0e 732
d9b2033e 733 get_mplock();
87de5057 734 KASSERT(uio->uio_td == curthread,
f4d08668 735 ("uio_td %p is not p %p", uio->uio_td, curthread));
fad57d0e
MD
736
737 vp = (struct vnode *)fp->f_data;
d9b2033e
MD
738 if (vp == NULL || vp->v_type == VBAD) {
739 error = EBADF;
740 goto done;
741 }
fad57d0e 742 if (vp->v_type == VREG)
c4df9635 743 bwillwrite(uio->uio_resid);
fad57d0e
MD
744 vp = (struct vnode *)fp->f_data; /* XXX needed? */
745
d9b2033e
MD
746 if ((dev = vp->v_rdev) == NULL) {
747 error = EBADF;
748 goto done;
749 }
fad57d0e
MD
750 reference_dev(dev);
751
9ba76b73 752 if ((flags & O_FOFFSET) == 0)
fad57d0e
MD
753 uio->uio_offset = fp->f_offset;
754
755 ioflag = IO_UNIT;
9ba76b73
MD
756 if (vp->v_type == VREG &&
757 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
fad57d0e 758 ioflag |= IO_APPEND;
9ba76b73
MD
759 }
760
761 if (flags & O_FBLOCKING) {
762 /* ioflag &= ~IO_NDELAY; */
763 } else if (flags & O_FNONBLOCKING) {
764 ioflag |= IO_NDELAY;
765 } else if (fp->f_flag & FNONBLOCK) {
fad57d0e 766 ioflag |= IO_NDELAY;
9ba76b73
MD
767 }
768 if (flags & O_FBUFFERED) {
769 /* ioflag &= ~IO_DIRECT; */
770 } else if (flags & O_FUNBUFFERED) {
771 ioflag |= IO_DIRECT;
772 } else if (fp->f_flag & O_DIRECT) {
fad57d0e 773 ioflag |= IO_DIRECT;
9ba76b73
MD
774 }
775 if (flags & O_FASYNCWRITE) {
776 /* ioflag &= ~IO_SYNC; */
777 } else if (flags & O_FSYNCWRITE) {
778 ioflag |= IO_SYNC;
779 } else if (fp->f_flag & O_FSYNC) {
780 ioflag |= IO_SYNC;
781 }
782
783 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
fad57d0e
MD
784 ioflag |= IO_SYNC;
785 ioflag |= sequential_heuristic(uio, fp);
786
787 error = dev_dwrite(dev, uio, ioflag);
788
789 release_dev(dev);
9ba76b73 790 if ((flags & O_FOFFSET) == 0)
fad57d0e
MD
791 fp->f_offset = uio->uio_offset;
792 fp->f_nextoff = uio->uio_offset;
d9b2033e
MD
793done:
794 rel_mplock();
fad57d0e
MD
795 return (error);
796}
797
798/*
d9b2033e 799 * MPALMOSTSAFE - acquires mplock
984263bc
MD
800 */
801static int
87de5057 802vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
984263bc 803{
d9b2033e
MD
804 struct vnode *vp;
805 int error;
984263bc 806
d9b2033e
MD
807 get_mplock();
808 vp = (struct vnode *)fp->f_data;
809 error = vn_stat(vp, sb, cred);
810 rel_mplock();
811 return (error);
984263bc
MD
812}
813
814int
87de5057 815vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
984263bc
MD
816{
817 struct vattr vattr;
dadab5e9 818 struct vattr *vap;
984263bc
MD
819 int error;
820 u_short mode;
b13267a5 821 cdev_t dev;
984263bc
MD
822
823 vap = &vattr;
87de5057 824 error = VOP_GETATTR(vp, vap);
984263bc
MD
825 if (error)
826 return (error);
827
828 /*
829 * Zero the spare stat fields
830 */
831 sb->st_lspare = 0;
7d15906a 832 sb->st_qspare = 0;
984263bc
MD
833
834 /*
835 * Copy from vattr table
836 */
837 if (vap->va_fsid != VNOVAL)
838 sb->st_dev = vap->va_fsid;
839 else
840 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
841 sb->st_ino = vap->va_fileid;
842 mode = vap->va_mode;
843 switch (vap->va_type) {
844 case VREG:
845 mode |= S_IFREG;
846 break;
50626622
MD
847 case VDATABASE:
848 mode |= S_IFDB;
849 break;
984263bc
MD
850 case VDIR:
851 mode |= S_IFDIR;
852 break;
853 case VBLK:
854 mode |= S_IFBLK;
855 break;
856 case VCHR:
857 mode |= S_IFCHR;
858 break;
859 case VLNK:
860 mode |= S_IFLNK;
861 /* This is a cosmetic change, symlinks do not have a mode. */
862 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
863 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
864 else
865 sb->st_mode |= ACCESSPERMS; /* 0777 */
866 break;
867 case VSOCK:
868 mode |= S_IFSOCK;
869 break;
870 case VFIFO:
871 mode |= S_IFIFO;
872 break;
873 default:
874 return (EBADF);
4698dfb3 875 }
984263bc 876 sb->st_mode = mode;
50626622
MD
877 if (vap->va_nlink > (nlink_t)-1)
878 sb->st_nlink = (nlink_t)-1;
879 else
880 sb->st_nlink = vap->va_nlink;
984263bc
MD
881 sb->st_uid = vap->va_uid;
882 sb->st_gid = vap->va_gid;
0e9b9130 883 sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
984263bc
MD
884 sb->st_size = vap->va_size;
885 sb->st_atimespec = vap->va_atime;
886 sb->st_mtimespec = vap->va_mtime;
887 sb->st_ctimespec = vap->va_ctime;
888
d8869c1b
MD
889 /*
890 * A VCHR and VBLK device may track the last access and last modified
891 * time independantly of the filesystem. This is particularly true
892 * because device read and write calls may bypass the filesystem.
893 */
894 if (vp->v_type == VCHR || vp->v_type == VBLK) {
4698dfb3
MN
895 dev = vp->v_rdev;
896 if (dev != NULL) {
d8869c1b
MD
897 if (dev->si_lastread) {
898 sb->st_atimespec.tv_sec = dev->si_lastread;
899 sb->st_atimespec.tv_nsec = 0;
900 }
901 if (dev->si_lastwrite) {
902 sb->st_atimespec.tv_sec = dev->si_lastwrite;
903 sb->st_atimespec.tv_nsec = 0;
904 }
905 }
906 }
907
984263bc
MD
908 /*
909 * According to www.opengroup.org, the meaning of st_blksize is
910 * "a filesystem-specific preferred I/O block size for this
911 * object. In some filesystem types, this may vary from file
912 * to file"
913 * Default to PAGE_SIZE after much discussion.
914 */
915
916 if (vap->va_type == VREG) {
917 sb->st_blksize = vap->va_blocksize;
918 } else if (vn_isdisk(vp, NULL)) {
e4c9c0c8
MD
919 /*
920 * XXX this is broken. If the device is not yet open (aka
921 * stat() call, aka v_rdev == NULL), how are we supposed
922 * to get a valid block size out of it?
923 */
4698dfb3
MN
924 dev = vp->v_rdev;
925 if (dev == NULL && vp->v_type == VCHR) {
926 dev = get_dev(vp->v_umajor, vp->v_uminor);
0e9b9130 927 }
e4c9c0c8
MD
928 sb->st_blksize = dev->si_bsize_best;
929 if (sb->st_blksize < dev->si_bsize_phys)
930 sb->st_blksize = dev->si_bsize_phys;
984263bc
MD
931 if (sb->st_blksize < BLKDEV_IOSIZE)
932 sb->st_blksize = BLKDEV_IOSIZE;
933 } else {
934 sb->st_blksize = PAGE_SIZE;
935 }
936
937 sb->st_flags = vap->va_flags;
f00b5e4e
MN
938
939 error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
940 if (error)
984263bc
MD
941 sb->st_gen = 0;
942 else
50626622 943 sb->st_gen = (u_int32_t)vap->va_gen;
984263bc 944
984263bc 945 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
dc1be39c 946 sb->st_fsmid = vap->va_fsmid;
984263bc
MD
947 return (0);
948}
949
950/*
d9b2033e 951 * MPALMOSTSAFE - acquires mplock
984263bc
MD
952 */
953static int
87de5057 954vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
984263bc 955{
dadab5e9 956 struct vnode *vp = ((struct vnode *)fp->f_data);
1fbb5fc0 957 struct vnode *ovp;
984263bc
MD
958 struct vattr vattr;
959 int error;
960
d9b2033e
MD
961 get_mplock();
962
dadab5e9 963 switch (vp->v_type) {
984263bc
MD
964 case VREG:
965 case VDIR:
966 if (com == FIONREAD) {
4698dfb3
MN
967 error = VOP_GETATTR(vp, &vattr);
968 if (error)
d9b2033e 969 break;
984263bc 970 *(int *)data = vattr.va_size - fp->f_offset;
d9b2033e
MD
971 error = 0;
972 break;
973 }
9ba76b73 974 if (com == FIOASYNC) { /* XXX */
d9b2033e
MD
975 error = 0; /* XXX */
976 break;
984263bc 977 }
984263bc 978 /* fall into ... */
984263bc
MD
979 default:
980#if 0
981 return (ENOTTY);
982#endif
983 case VFIFO:
984 case VCHR:
985 case VBLK:
986 if (com == FIODTYPE) {
d9b2033e
MD
987 if (vp->v_type != VCHR && vp->v_type != VBLK) {
988 error = ENOTTY;
989 break;
990 }
335dda38 991 *(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
d9b2033e
MD
992 error = 0;
993 break;
984263bc 994 }
87de5057 995 error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
984263bc 996 if (error == 0 && com == TIOCSCTTY) {
87de5057
MD
997 struct proc *p = curthread->td_proc;
998 struct session *sess;
999
d9b2033e
MD
1000 if (p == NULL) {
1001 error = ENOTTY;
1002 break;
1003 }
984263bc 1004
87de5057 1005 sess = p->p_session;
984263bc 1006 /* Do nothing if reassigning same control tty */
d9b2033e
MD
1007 if (sess->s_ttyvp == vp) {
1008 error = 0;
1009 break;
1010 }
984263bc
MD
1011
1012 /* Get rid of reference to old control tty */
1fbb5fc0 1013 ovp = sess->s_ttyvp;
597aea93 1014 vref(vp);
1fbb5fc0
MD
1015 sess->s_ttyvp = vp;
1016 if (ovp)
1017 vrele(ovp);
984263bc 1018 }
d9b2033e 1019 break;
984263bc 1020 }
d9b2033e
MD
1021 rel_mplock();
1022 return (error);
984263bc
MD
1023}
1024
1025/*
d9b2033e 1026 * MPALMOSTSAFE - acquires mplock
984263bc
MD
1027 */
1028static int
87de5057 1029vn_poll(struct file *fp, int events, struct ucred *cred)
984263bc 1030{
d9b2033e
MD
1031 int error;
1032
1033 get_mplock();
1034 error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
1035 rel_mplock();
1036 return (error);
984263bc
MD
1037}
1038
1039/*
1040 * Check that the vnode is still valid, and if so
1041 * acquire requested lock.
1042 */
1043int
1044#ifndef DEBUG_LOCKS
ca466bae 1045vn_lock(struct vnode *vp, int flags)
984263bc 1046#else
ca466bae 1047debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
984263bc
MD
1048#endif
1049{
1050 int error;
1051
1052 do {
984263bc 1053#ifdef DEBUG_LOCKS
5fd012e0
MD
1054 vp->filename = filename;
1055 vp->line = line;
a11aaa81
MD
1056 error = debuglockmgr(&vp->v_lock, flags,
1057 "vn_lock", filename, line);
1058#else
1059 error = lockmgr(&vp->v_lock, flags);
984263bc 1060#endif
5fd012e0
MD
1061 if (error == 0)
1062 break;
984263bc 1063 } while (flags & LK_RETRY);
5fd012e0
MD
1064
1065 /*
1066 * Because we (had better!) have a ref on the vnode, once it
1067 * goes to VRECLAIMED state it will not be recycled until all
1068 * refs go away. So we can just check the flag.
1069 */
1070 if (error == 0 && (vp->v_flag & VRECLAIMED)) {
a11aaa81 1071 lockmgr(&vp->v_lock, LK_RELEASE);
5fd012e0
MD
1072 error = ENOENT;
1073 }
984263bc
MD
1074 return (error);
1075}
1076
a11aaa81
MD
1077void
1078vn_unlock(struct vnode *vp)
1079{
1080 lockmgr(&vp->v_lock, LK_RELEASE);
1081}
1082
1083int
1084vn_islocked(struct vnode *vp)
1085{
1086 return (lockstatus(&vp->v_lock, curthread));
1087}
1088
984263bc 1089/*
d9b2033e 1090 * MPALMOSTSAFE - acquires mplock
984263bc
MD
1091 */
1092static int
87de5057 1093vn_closefile(struct file *fp)
984263bc 1094{
d9b2033e 1095 int error;
984263bc 1096
d9b2033e 1097 get_mplock();
984263bc 1098 fp->f_ops = &badfileops;
d9b2033e
MD
1099 error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
1100 rel_mplock();
4698dfb3 1101 return (error);
984263bc
MD
1102}
1103
d9b2033e
MD
1104/*
1105 * MPALMOSTSAFE - acquires mplock
1106 */
984263bc
MD
1107static int
1108vn_kqfilter(struct file *fp, struct knote *kn)
1109{
d9b2033e 1110 int error;
984263bc 1111
d9b2033e
MD
1112 get_mplock();
1113 error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
1114 rel_mplock();
1115 return (error);
984263bc 1116}