kernel: Make SMP support default (and non-optional).

[dragonfly.git] / sys / kern / vfs_vnops.c
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

static int vn_closefile (struct file *fp);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
		struct ucred *cred, struct sysmsg *msg);
static int vn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
static int vn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);

struct fileops vnode_fileops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_OPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
	struct vnode *vp;
	struct ucred *cred = nd->nl_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int error;
	u_int flags;
	uint64_t osize;
	struct mount *mp;

	/*
	 * Certain combinations are illegal
	 */
	if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC)
		return(EACCES);

	/*
	 * Lookup the path and create or obtain the vnode.  After a
	 * successful lookup a locked nd->nl_nch will be returned.
	 *
	 * The result of this section should be a locked vnode.
	 *
	 * XXX with only a little work we should be able to avoid locking
	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
	 */
	nd->nl_flags |= NLC_OPEN;
	if (fmode & O_APPEND)
		nd->nl_flags |= NLC_APPEND;
	if (fmode & O_TRUNC)
		nd->nl_flags |= NLC_TRUNCATE;
	if (fmode & FREAD)
		nd->nl_flags |= NLC_READ;
	if (fmode & FWRITE)
		nd->nl_flags |= NLC_WRITE;
	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
		nd->nl_flags |= NLC_FOLLOW;

	if (fmode & O_CREAT) {
		/*
		 * CONDITIONAL CREATE FILE CASE
		 *
		 * Setting NLC_CREATE causes a negative hit to store
		 * the negative hit ncp and not return an error.  Then
		 * nc_error or nc_vp may be checked to see if the ncp
		 * represents a negative hit.  NLC_CREATE also requires
		 * write permission on the governing directory or EPERM
		 * is returned.
		 */
		nd->nl_flags |= NLC_CREATE;
		nd->nl_flags |= NLC_REFDVP;
		bwillinode(1);
		error = nlookup(nd);
	} else {
		/*
		 * NORMAL OPEN FILE CASE
		 */
		error = nlookup(nd);
	}

	if (error)
		return (error);

	/*
	 * split case to allow us to re-resolve and retry the ncp in case
	 * we get ESTALE.
	 */
again:
	if (fmode & O_CREAT) {
		if (nd->nl_nch.ncp->nc_vp == NULL) {
			if ((error = ncp_writechk(&nd->nl_nch)) != 0)
				return (error);
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
					    nd->nl_cred, vap);
			if (error)
				return (error);
			fmode &= ~O_TRUNC;
			/* locked vnode is returned */
		} else {
			if (fmode & O_EXCL) {
				error = EEXIST;
			} else {
				error = cache_vget(&nd->nl_nch, cred,
						   LK_EXCLUSIVE, &vp);
			}
			if (error)
				return (error);
			fmode &= ~O_CREAT;
		}
	} else {
		error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
		if (error)
			return (error);
	}

	/*
	 * We have a locked vnode and ncp now.  Note that the ncp will
	 * be cleaned up by the caller if nd->nl_nch is left intact.
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (vp->v_type != VDIR && (fmode & O_DIRECTORY)) {
		error = ENOTDIR;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp, &nd->nl_nch);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(&nd->nl_nch);
					error = cache_resolve(&nd->nl_nch, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
		}
	}
	if (fmode & O_TRUNC) {
		vn_unlock(vp);				/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		osize = vp->v_filesize;
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, cred);
		if (error)
			goto bad;
		error = VOP_GETATTR(vp, vap);
		if (error)
			goto bad;
		mp = vq_vptomp(vp);
		VFS_ACCOUNT(mp, vap->va_uid, vap->va_gid, -osize);
	}

	/*
	 * Set or clear VNSWAPCACHE on the vp based on nd->nl_nch.ncp->nc_flag.
	 * These particular bits are tracked all the way from the root.
	 *
	 * NOTE: Might not work properly on NFS servers due to the
	 *	 disconnected namecache.
	 */
	flags = nd->nl_nch.ncp->nc_flag;
	if ((flags & (NCF_UF_CACHE | NCF_UF_PCACHE)) &&
	    (flags & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) == 0) {
		vsetflags(vp, VSWAPCACHE);
	} else {
		vclrflags(vp, VSWAPCACHE);
	}

	/*
	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
	 * associated with the fp yet so we own it clean.
	 *
	 * f_nchandle inherits nl_nch.  This used to be necessary only for
	 * directories but now we do it unconditionally so f*() ops
	 * such as fchmod() can access the actual namespace that was
	 * used to open the file.
	 */
	if (fp) {
		if (nd->nl_flags & NLC_APPENDONLY)
			fmode |= FAPPENDONLY;
		fp->f_nchandle = nd->nl_nch;
		cache_zero(&nd->nl_nch);
		cache_unlock(&fp->f_nchandle);
	}

	/*
	 * Get rid of nl_nch.  vn_open does not return it (it returns the
	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
	 * on /dev/ttyd0
	 */
	if (nd->nl_nch.ncp)
		cache_put(&nd->nl_nch);

	error = VOP_OPEN(vp, fmode, cred, fp);
	if (error) {
		/*
		 * setting f_ops to &badfileops will prevent the descriptor
		 * code from trying to close and release the vnode, since
		 * the open failed we do not want to call close.
		 */
		if (fp) {
			fp->f_data = NULL;
			fp->f_ops = &badfileops;
		}
		goto bad;
	}

#if 0
	/*
	 * Assert that VREG files have been setup for vmio.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("vn_open: regular file was not VMIO enabled!"));
#endif

	/*
	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
	 * only returned in the fp == NULL case.
	 */
	if (fp == NULL) {
		nd->nl_open_vp = vp;
		nd->nl_vp_fmode = fmode;
		if ((nd->nl_flags & NLC_LOCKVP) == 0)
			vn_unlock(vp);
	} else {
		vput(vp);
	}
	return (0);
bad:
	if (vp)
		vput(vp);
	return (error);
}
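
/*
 * Example usage of vn_open() (illustrative sketch only, not part of the
 * original code): open a path from kernel context using the
 * nlookup_init()/nlookup_done() bracket described in the header comment
 * above.  The path and helper name are hypothetical.
 */
#if 0
static int
example_vn_open(struct vnode **vpp)
{
	struct nlookupdata nd;
	int error;

	error = nlookup_init(&nd, "/tmp/example", UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0) {
		nd.nl_flags |= NLC_LOCKVP;	/* request a locked vnode */
		error = vn_open(&nd, NULL, FREAD, 0);
		if (error == 0) {
			/*
			 * Take ownership of the referenced, locked vnode
			 * so nlookup_done() does not release it.  The
			 * caller must vput() it when finished.
			 */
			*vpp = nd.nl_open_vp;
			nd.nl_open_vp = NULL;
		}
	}
	nlookup_done(&nd);			/* always clean up nd */
	return (error);
}
#endif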

int
vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (strncmp(devname, "/dev/", 5) == 0)
		devname += 5;
	if ((vp = getsynthvnode(devname)) == NULL) {
		error = ENODEV;
	} else {
		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
		vn_unlock(vp);
		if (error) {
			vrele(vp);
			vp = NULL;
		}
	}
	*vpp = vp;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.  nch may be NULL.
 */
int
vn_writechk(struct vnode *vp, struct nchandle *nch)
{
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);

	/*
	 * If the vnode represents a regular file, check the mount
	 * point via the nch.  This may be a different mount point
	 * than the one embedded in the vnode (e.g. nullfs).
	 *
	 * We can still write to non-regular files (e.g. devices)
	 * via read-only mounts.
	 */
	if (nch && nch->ncp && vp->v_type == VREG)
		return (ncp_writechk(nch));
	return (0);
}

/*
 * Check whether the underlying mount is read-only.  The mount point
 * referenced by the namecache may be different from the mount point
 * used by the underlying vnode in the case of NULLFS, so a separate
 * check is needed.
 */
int
ncp_writechk(struct nchandle *nch)
{
	if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	return(0);
}

/*
 * Vnode close call
 *
 * MPSAFE
 */
int
vn_close(struct vnode *vp, int flags)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0) {
		error = VOP_CLOSE(vp, flags);
		vn_unlock(vp);
	}
	vrele(vp);
	return (error);
}

/*
 * Sequential heuristic.
 *
 * MPSAFE (f_seqcount and f_nextoff are allowed to race)
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		int tmpseq = fp->f_seqcount;

		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return(fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return(0);
}
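
/*
 * Worked example of the heuristic above (illustrative, assuming
 * BKVASIZE is 16KB and IO_SEQMAX/IO_SEQSHIFT have their usual BSD
 * values of 127 and 16): a 64KB read continuing at f_nextoff bumps
 * f_seqcount by (65536 + 16383) / 16384 = 4, so after enough
 * back-to-back sequential reads the count saturates at IO_SEQMAX
 * and the function returns 127 << 16, the strongest sequential
 * hint that can be folded into ioflag.
 */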

/*
 * get - lock and return the f_offset field.
 * set - set and unlock the f_offset field.
 *
 * These routines serve the dual purpose of serializing access to the
 * f_offset field (at least on i386) and guaranteeing operational integrity
 * when multiple read()ers and write()ers are present on the same fp.
 *
 * MPSAFE
 */
static __inline off_t
vn_get_fpf_offset(struct file *fp)
{
	u_int	flags;
	u_int	nflags;

	/*
	 * Shortcut critical path.
	 */
	flags = fp->f_flag & ~FOFFSETLOCK;
	if (atomic_cmpset_int(&fp->f_flag, flags, flags | FOFFSETLOCK))
		return(fp->f_offset);

	/*
	 * The hard way
	 */
	for (;;) {
		flags = fp->f_flag;
		if (flags & FOFFSETLOCK) {
			nflags = flags | FOFFSETWAKE;
			tsleep_interlock(&fp->f_flag, 0);
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				tsleep(&fp->f_flag, PINTERLOCKED, "fpoff", 0);
		} else {
			nflags = flags | FOFFSETLOCK;
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				break;
		}
	}
	return(fp->f_offset);
}

/*
 * MPSAFE
 */
static __inline void
vn_set_fpf_offset(struct file *fp, off_t offset)
{
	u_int	flags;
	u_int	nflags;

	/*
	 * We hold the lock so we can set the offset without interference.
	 */
	fp->f_offset = offset;

	/*
	 * Normal release is already a reasonably critical path.
	 */
	for (;;) {
		flags = fp->f_flag;
		nflags = flags & ~(FOFFSETLOCK | FOFFSETWAKE);
		if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) {
			if (flags & FOFFSETWAKE)
				wakeup(&fp->f_flag);
			break;
		}
	}
}

/*
 * MPSAFE
 */
static __inline off_t
vn_poll_fpf_offset(struct file *fp)
{
#if defined(__x86_64__)
	return(fp->f_offset);
#else
	off_t off = vn_get_fpf_offset(fp);
	vn_set_fpf_offset(fp, off);
	return(off);
#endif
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 *
 * MPSAFE
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
	off_t offset, enum uio_seg segflg, int ioflg,
	struct ucred *cred, int *aresid)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = curthread;
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0)
		vn_unlock(vp);
	return (error);
}
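
/*
 * Example usage of vn_rdwr() (illustrative sketch only): read the first
 * 512 bytes of an already-referenced vnode into a kernel buffer.  The
 * helper name and buffer are hypothetical.
 */
#if 0
static int
example_read_header(struct vnode *vp, struct ucred *cred, char *buf)
{
	int resid;
	int error;

	/* without IO_NODELOCKED, vn_rdwr() locks the vnode for us */
	error = vn_rdwr(UIO_READ, vp, buf, 512, 0, UIO_SYSSPACE,
			0, cred, &resid);
	if (error == 0 && resid != 0)
		error = EIO;		/* short read */
	return (error);
}
#endif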

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call lwkt_user_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 *
 * MPSAFE
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		lwkt_user_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}
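
/*
 * Worked example of the chunking above (illustrative, assuming MAXBSIZE
 * is 64KB): a 200KB transfer starting at offset 10000 is issued as a
 * first chunk of 65536 - 10000 = 55536 bytes, then 65536, 65536, and a
 * final 18192 bytes, so every chunk after the first begins on a
 * MAXBSIZE boundary.
 */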

/*
 * File pointers can no longer get ripped up by revoke so
 * we don't need to lock access to the vp.
 *
 * f_offset updates are not guaranteed against multiple readers
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);

	error = VOP_READ(vp, uio, ioflag, cred);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	int error;

	vp = (struct vnode *)fp->f_data;
	error = vn_stat(vp, sb, cred);
	return (error);
}

/*
 * MPSAFE
 */
int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
{
	struct vattr vattr;
	struct vattr *vap;
	int error;
	u_short mode;
	cdev_t dev;

	vap = &vattr;
	error = VOP_GETATTR(vp, vap);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare1 = 0;
	sb->st_qspare2 = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDATABASE:
		mode |= S_IFDB;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
		else
			sb->st_mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = dev2udev(vp->v_rdev);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem.  This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = dev->si_lastread;
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				/* a write updates the modification time */
				sb->st_mtimespec.tv_sec = dev->si_lastwrite;
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Default to PAGE_SIZE after much discussion.
	 */
	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		/*
		 * XXX this is broken.  If the device is not yet open (aka
		 * stat() call, aka v_rdev == NULL), how are we supposed
		 * to get a valid block size out of it?
		 */
		dev = vp->v_rdev;

		sb->st_blksize = dev->si_bsize_best;
		if (sb->st_blksize < dev->si_bsize_phys)
			sb->st_blksize = dev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;

	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred,
	 struct sysmsg *msg)
{
	struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vnode *ovp;
	struct vattr vattr;
	int error;
	off_t size;

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr);
			if (error)
				break;
			size = vattr.va_size;
			if ((vp->v_flag & VNOTSEEKABLE) == 0)
				size -= vn_poll_fpf_offset(fp);
			if (size > 0x7FFFFFFF)
				size = 0x7FFFFFFF;
			*(int *)data = size;
			error = 0;
			break;
		}
		if (com == FIOASYNC) {				/* XXX */
			error = 0;				/* XXX */
			break;
		}
		/* fall into ... */
	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK) {
				error = ENOTTY;
				break;
			}
			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
			error = 0;
			break;
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred, msg);
		if (error == 0 && com == TIOCSCTTY) {
			struct proc *p = curthread->td_proc;
			struct session *sess;

			if (p == NULL) {
				error = ENOTTY;
				break;
			}

			get_mplock();
			sess = p->p_session;
			/* Do nothing if reassigning same control tty */
			if (sess->s_ttyvp == vp) {
				error = 0;
				rel_mplock();
				break;
			}

			/* Get rid of reference to old control tty */
			ovp = sess->s_ttyvp;
			vref(vp);
			sess->s_ttyvp = vp;
			if (ovp)
				vrele(ovp);
			rel_mplock();
		}
		break;
	}
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags)
#else
debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
#endif
{
	int error;

	do {
#ifdef	DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
		error = debuglockmgr(&vp->v_lock, flags,
				     "vn_lock", filename, line);
#else
		error = lockmgr(&vp->v_lock, flags);
#endif
		if (error == 0)
			break;
	} while (flags & LK_RETRY);

	/*
	 * Because we (had better!) have a ref on the vnode, once it
	 * goes to VRECLAIMED state it will not be recycled until all
	 * refs go away.  So we can just check the flag.
	 */
	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
		lockmgr(&vp->v_lock, LK_RELEASE);
		error = ENOENT;
	}
	return (error);
}

#ifdef DEBUG_VN_UNLOCK

void
debug_vn_unlock(struct vnode *vp, const char *filename, int line)
{
	kprintf("vn_unlock from %s:%d\n", filename, line);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

#else

void
vn_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

#endif

/*
 * MPSAFE
 */
int
vn_islocked(struct vnode *vp)
{
	return (lockstatus(&vp->v_lock, curthread));
}

/*
 * Return the lock status of a vnode and unlock the vnode
 * if we owned the lock.  This is not a boolean; if the
 * caller cares what the lock status is it must check the
 * various possible return values.
 *
 * This only unlocks exclusive locks held by the caller,
 * it will NOT unlock shared locks (there is no way to
 * tell who the shared lock belongs to).
 *
 * MPSAFE
 */
int
vn_islocked_unlock(struct vnode *vp)
{
	int vpls;

	vpls = lockstatus(&vp->v_lock, curthread);
	if (vpls == LK_EXCLUSIVE)
		lockmgr(&vp->v_lock, LK_RELEASE);
	return(vpls);
}

/*
 * Restore a vnode lock that we previously released via
 * vn_islocked_unlock().  This is a NOP if we did not
 * own the original lock.
 *
 * MPSAFE
 */
void
vn_islocked_relock(struct vnode *vp, int vpls)
{
	int error;

	if (vpls == LK_EXCLUSIVE)
		error = lockmgr(&vp->v_lock, vpls);
}
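
/*
 * Example usage of the unlock/relock pair above (illustrative sketch
 * only): temporarily drop a possibly-held exclusive vnode lock around
 * a blocking operation and restore it afterwards.  The helper name and
 * blocking operation are hypothetical.
 */
#if 0
static void
example_drop_lock_around_blocking_op(struct vnode *vp)
{
	int vpls;

	vpls = vn_islocked_unlock(vp);	/* drops only our exclusive lock */
	/* ... some operation that must not hold the vnode lock ... */
	vn_islocked_relock(vp, vpls);	/* NOP if we held nothing */
}
#endif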

/*
 * MPSAFE
 */
static int
vn_closefile(struct file *fp)
{
	int error;

	fp->f_ops = &badfileops;
	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
	return (error);
}