MFC: fstest regression fixes - POSIX error codes.
[dragonfly.git] / sys / kern / vfs_vnops.c
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>

static int vn_closefile (struct file *fp);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
		struct ucred *cred);
static int vn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int svn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int vn_poll (struct file *fp, int events, struct ucred *cred);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
static int vn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int svn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);

struct fileops vnode_fileops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

struct fileops specvnode_fileops = {
	.fo_read = svn_read,
	.fo_write = svn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

/*
 * Shortcut the device read/write.  This avoids a lot of vnode junk.
 * Basically the specfs vnops for read and write take the locked vnode,
 * unlock it (because we can't hold the vnode locked while reading or writing
 * a device which may block indefinitely), issue the device operation, then
 * relock the vnode before returning, plus other junk.  This bypasses all
 * of that and just does the device operation.
 */
void
vn_setspecops(struct file *fp)
{
	if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
		fp->f_ops = &specvnode_fileops;
	}
}
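
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller in the open path would invoke vn_setspecops() on a freshly
 * opened device file.  The vfs_fastdev tunable, tested above, gates
 * whether the fast-path fileops actually get installed.
 */
#if 0
static void
example_install_specops(struct file *fp)
{
	/* after vn_open(); fp->f_ops is &vnode_fileops by default */
	vn_setspecops(fp);	/* may switch fp to specvnode_fileops */
}
#endif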

/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_NOPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
	struct vnode *vp;
	struct ucred *cred = nd->nl_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, error;

	/*
	 * Certain combinations are illegal
	 */
	if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC)
		return(EACCES);

	/*
	 * Lookup the path and create or obtain the vnode.  After a
	 * successful lookup a locked nd->nl_nch will be returned.
	 *
	 * The result of this section should be a locked vnode.
	 *
	 * XXX with only a little work we should be able to avoid locking
	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
	 */

	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
		nd->nl_flags |= NLC_FOLLOW;

	if (fmode & O_CREAT) {
		/*
		 * CONDITIONAL CREATE FILE CASE
		 *
		 * Setting NLC_CREATE causes a negative hit to store
		 * the negative hit ncp and not return an error.  Then
		 * nc_error or nc_vp may be checked to see if the ncp
		 * represents a negative hit.  NLC_CREATE also requires
		 * write permission on the governing directory or EPERM
		 * is returned.
		 */
		nd->nl_flags |= NLC_CREATE;
		nd->nl_flags |= NLC_REFDVP;
		bwillinode(1);
		error = nlookup(nd);
	} else {
		/*
		 * NORMAL OPEN FILE CASE
		 */
		error = nlookup(nd);
	}

	if (error)
		return (error);

	/*
	 * split case to allow us to re-resolve and retry the ncp in case
	 * we get ESTALE.
	 */
again:
	if (fmode & O_CREAT) {
		if (nd->nl_nch.ncp->nc_vp == NULL) {
			if ((error = ncp_writechk(&nd->nl_nch)) != 0)
				return (error);
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
					    nd->nl_cred, vap);
			if (error)
				return (error);
			fmode &= ~O_TRUNC;
			/* locked vnode is returned */
		} else {
			if (fmode & O_EXCL) {
				error = EEXIST;
			} else {
				error = cache_vget(&nd->nl_nch, cred,
						   LK_EXCLUSIVE, &vp);
			}
			if (error)
				return (error);
			fmode &= ~O_CREAT;
		}
	} else {
		error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
		if (error)
			return (error);
	}

	/*
	 * We have a locked vnode and ncp now.  Note that the ncp will
	 * be cleaned up by the caller if nd->nl_nch is left intact.
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		mode = 0;
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp, &nd->nl_nch);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(&nd->nl_nch);
					error = cache_resolve(&nd->nl_nch, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
			mode |= VWRITE;
		}
		if (fmode & FREAD)
			mode |= VREAD;
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(&nd->nl_nch);
					error = cache_resolve(&nd->nl_nch, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
		}
	}
	if (fmode & O_TRUNC) {
		vn_unlock(vp);				/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, cred);
		if (error)
			goto bad;
	}

	/*
	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
	 * associated with the fp yet so we own it clean.
	 *
	 * f_nchandle inherits nl_nch.  This used to be necessary only for
	 * directories but now we do it unconditionally so f*() ops
	 * such as fchmod() can access the actual namespace that was
	 * used to open the file.
	 */
	if (fp) {
		fp->f_nchandle = nd->nl_nch;
		cache_zero(&nd->nl_nch);
		cache_unlock(&fp->f_nchandle);
	}

	/*
	 * Get rid of nl_nch.  vn_open does not return it (it returns the
	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
	 * on /dev/ttyd0.
	 */
	if (nd->nl_nch.ncp)
		cache_put(&nd->nl_nch);

	error = VOP_OPEN(vp, fmode, cred, fp);
	if (error) {
		/*
		 * Setting f_ops to &badfileops will prevent the descriptor
		 * code from trying to close and release the vnode; since
		 * the open failed we do not want to call close.
		 */
		if (fp) {
			fp->f_data = NULL;
			fp->f_ops = &badfileops;
		}
		goto bad;
	}

#if 0
	/*
	 * Assert that VREG files have been setup for vmio.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("vn_open: regular file was not VMIO enabled!"));
#endif

	/*
	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
	 * only returned in the fp == NULL case.
	 */
	if (fp == NULL) {
		nd->nl_open_vp = vp;
		nd->nl_vp_fmode = fmode;
		if ((nd->nl_flags & NLC_LOCKVP) == 0)
			vn_unlock(vp);
	} else {
		vput(vp);
	}
	return (0);
bad:
	if (vp)
		vput(vp);
	return (error);
}
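
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * typical in-kernel caller brackets vn_open() with nlookup_init() and
 * nlookup_done().  With fp == NULL the opened vnode comes back in
 * nd.nl_open_vp, referenced and (without NLC_LOCKVP) unlocked; the
 * caller takes it over by clearing nl_open_vp before nlookup_done().
 */
#if 0
static int
example_vn_open(const char *path)
{
	struct nlookupdata nd;
	struct vnode *vp = NULL;
	int error;

	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	if (error == 0) {
		vp = nd.nl_open_vp;		/* take over the reference */
		nd.nl_open_vp = NULL;
	}
	nlookup_done(&nd);			/* always clean up the nd */
	if (error == 0)
		error = vn_close(vp, FREAD);	/* VOP_CLOSE + vrele */
	return (error);
}
#endif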

int
vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (strncmp(devname, "/dev/", 5) == 0)
		devname += 5;
	if ((vp = getsynthvnode(devname)) == NULL) {
		error = ENODEV;
	} else {
		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
		vn_unlock(vp);
		if (error) {
			vrele(vp);
			vp = NULL;
		}
	}
	*vpp = vp;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.  nch may be NULL.
 */
int
vn_writechk(struct vnode *vp, struct nchandle *nch)
{
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);

	/*
	 * If the vnode represents a regular file, check the mount
	 * point via the nch.  This may be a different mount point
	 * than the one embedded in the vnode (e.g. nullfs).
	 *
	 * We can still write to non-regular files (e.g. devices)
	 * via read-only mounts.
	 */
	if (nch && nch->ncp && vp->v_type == VREG)
		return (ncp_writechk(nch));
	return (0);
}

/*
 * Check whether the underlying mount is read-only.  The mount point
 * referenced by the namecache may be different from the mount point
 * used by the underlying vnode in the case of NULLFS, so a separate
 * check is needed.
 */
int
ncp_writechk(struct nchandle *nch)
{
	if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	return(0);
}

/*
 * Vnode close call
 */
int
vn_close(struct vnode *vp, int flags)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0) {
		error = VOP_CLOSE(vp, flags);
		vn_unlock(vp);
	}
	vrele(vp);
	return (error);
}

static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		int tmpseq = fp->f_seqcount;
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return(fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return(0);
}
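
/*
 * Illustrative note (not from the original file): the value returned
 * above is the sequential-run count packed into the high ioflag bits,
 * so a filesystem's VOP_READ/VOP_WRITE can recover it with something
 * like "seq = ioflag >> IO_SEQSHIFT" (0..IO_SEQMAX) to scale its
 * read-ahead or write clustering.
 */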

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
	off_t offset, enum uio_seg segflg, int ioflg,
	struct ucred *cred, int *aresid)
{
	struct uio auio;
	struct iovec aiov;
	struct ccms_lock ccms_lock;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = curthread;
	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid && error == 0)
		error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0)
		vn_unlock(vp);
	return (error);
}
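
/*
 * Usage sketch (illustrative only, not part of the original file):
 * read one block of a file into a kernel buffer with vn_rdwr().
 * Passing a non-NULL aresid makes short transfers visible to the
 * caller instead of being mapped to EIO by vn_rdwr() itself.
 */
#if 0
static int
example_read_block(struct vnode *vp, caddr_t buf, int len, off_t off,
		   struct ucred *cred)
{
	int resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, buf, len, off, UIO_SYSSPACE,
			0, cred, &resid);
	if (error == 0 && resid != 0)
		error = EIO;		/* treat a short read as an error */
	return (error);
}
#endif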

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct ccms_lock ccms_lock;
	struct vnode *vp;
	int error, ioflag;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	vn_lock(vp, LK_SHARED | LK_RETRY);
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);

	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
	error = VOP_READ(vp, uio, ioflag, cred);
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	rel_mplock();
	return (error);
}

/*
 * Device-optimized file table vnode read routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD) {
		error = EBADF;
		goto done;
	}

	if ((dev = vp->v_rdev) == NULL) {
		error = EBADF;
		goto done;
	}
	reference_dev(dev);

	if (uio->uio_resid == 0) {
		error = 0;
		release_dev(dev);	/* drop the ref taken above */
		goto done;
	}
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dread(dev, uio, ioflag);

	release_dev(dev);
	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
done:
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct ccms_lock ccms_lock;
	struct vnode *vp;
	int error, ioflag;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;
#if 0
	/* VOP_WRITE should handle this now */
	if (vp->v_type == VREG || vp->v_type == VDATABASE)
		bwillwrite();
#endif
	vp = (struct vnode *)fp->f_data;	/* XXX needed? */

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	    ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	rel_mplock();
	return (error);
}

/*
 * Device-optimized file table vnode write routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD) {
		error = EBADF;
		goto done;
	}
	if (vp->v_type == VREG)
		bwillwrite(uio->uio_resid);
	vp = (struct vnode *)fp->f_data;	/* XXX needed? */

	if ((dev = vp->v_rdev) == NULL) {
		error = EBADF;
		goto done;
	}
	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	    ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag);

	release_dev(dev);
	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
done:
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	int error;

	get_mplock();
	vp = (struct vnode *)fp->f_data;
	error = vn_stat(vp, sb, cred);
	rel_mplock();
	return (error);
}

int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
{
	struct vattr vattr;
	struct vattr *vap;
	int error;
	u_short mode;
	cdev_t dev;

	vap = &vattr;
	error = VOP_GETATTR(vp, vap);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDATABASE:
		mode |= S_IFDB;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
		else
			sb->st_mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem.  This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = dev->si_lastread;
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				sb->st_mtimespec.tv_sec = dev->si_lastwrite;
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file"
	 * Default to PAGE_SIZE after much discussion.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		/*
		 * XXX this is broken.  If the device is not yet open (aka
		 * stat() call, aka v_rdev == NULL), how are we supposed
		 * to get a valid block size out of it?
		 */
		dev = vp->v_rdev;
		if (dev == NULL && vp->v_type == VCHR) {
			dev = get_dev(vp->v_umajor, vp->v_uminor);
		}
		sb->st_blksize = dev->si_bsize_best;
		if (sb->st_blksize < dev->si_bsize_phys)
			sb->st_blksize = dev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;

	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	sb->st_fsmid = vap->va_fsmid;
	return (0);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
{
	struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vnode *ovp;
	struct vattr vattr;
	int error;

	get_mplock();

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr);
			if (error)
				break;
			*(int *)data = vattr.va_size - fp->f_offset;
			error = 0;
			break;
		}
		if (com == FIOASYNC) {				/* XXX */
			error = 0;				/* XXX */
			break;
		}
		/* fall into ... */
	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK) {
				error = ENOTTY;
				break;
			}
			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
			error = 0;
			break;
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
		if (error == 0 && com == TIOCSCTTY) {
			struct proc *p = curthread->td_proc;
			struct session *sess;

			if (p == NULL) {
				error = ENOTTY;
				break;
			}

			sess = p->p_session;
			/* Do nothing if reassigning same control tty */
			if (sess->s_ttyvp == vp) {
				error = 0;
				break;
			}

			/* Get rid of reference to old control tty */
			ovp = sess->s_ttyvp;
			vref(vp);
			sess->s_ttyvp = vp;
			if (ovp)
				vrele(ovp);
		}
		break;
	}
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_poll(struct file *fp, int events, struct ucred *cred)
{
	int error;

	get_mplock();
	error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
	rel_mplock();
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags)
#else
debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
#endif
{
	int error;

	do {
#ifdef	DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
		error = debuglockmgr(&vp->v_lock, flags,
				     "vn_lock", filename, line);
#else
		error = lockmgr(&vp->v_lock, flags);
#endif
		if (error == 0)
			break;
	} while (flags & LK_RETRY);

	/*
	 * Because we (had better!) have a ref on the vnode, once it
	 * goes to VRECLAIMED state it will not be recycled until all
	 * refs go away.  So we can just check the flag.
	 */
	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
		lockmgr(&vp->v_lock, LK_RELEASE);
		error = ENOENT;
	}
	return (error);
}

void
vn_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

int
vn_islocked(struct vnode *vp)
{
	return (lockstatus(&vp->v_lock, curthread));
}
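
/*
 * Usage sketch (illustrative only, not part of the original file): the
 * usual pairing.  With LK_RETRY the lockmgr attempt is repeated until it
 * succeeds, but ENOENT is still returned if the vnode was reclaimed
 * while we waited, so callers must check the return value.
 */
#if 0
static void
example_locked_region(struct vnode *vp)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0) {
		/* ... operate on the locked, non-reclaimed vnode ... */
		vn_unlock(vp);
	}
}
#endif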

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_closefile(struct file *fp)
{
	int error;

	get_mplock();
	fp->f_ops = &badfileops;
	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	get_mplock();
	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
	rel_mplock();
	return (error);
}