kernel - lwkt_token revamp
[dragonfly.git] / sys / kern / vfs_subr.c
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.118 2008/09/17 21:44:18 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

int numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	&reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	&reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	&reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	&reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	&reassignbufmethod, 0, "");

int nfs_mount_type = -1;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	&desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				   const struct export_args *argp);

/*
 * Red black tree functions
 */
static int rb_buf_compare(struct buf *b1, struct buf *b2);
RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, off_t, b_loffset);
RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, off_t, b_loffset);

static int
rb_buf_compare(struct buf *b1, struct buf *b2)
{
	if (b1->b_loffset < b2->b_loffset)
		return(-1);
	if (b1->b_loffset > b2->b_loffset)
		return(1);
	return(0);
}

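/*
 * Illustrative note (added commentary, not in the original source):
 * the RB_SCAN() calls throughout this file take a *range* compare
 * callback distinct from rb_buf_compare() above.  A range callback
 * returns -1 if the buffer lies before the range of interest, 0 if it
 * is inside the range (the scan callback is then invoked on it), and
 * +1 if it lies beyond the range; vtruncbuf_bp_trunc_cmp() and
 * vfsync_data_only_cmp() below are concrete instances of this
 * contract.
 */
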
/*
 * Returns non-zero if the vnode is a candidate for lazy msyncing.
 */
static __inline int
vshouldmsync(struct vnode *vp)
{
	if (vp->v_auxrefs != 0 || vp->v_sysref.refcnt > 0)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);
	}
	return (1);
}

/*
 * Initialize the vnode management data structures.
 *
 * Called from vfsinit()
 */
void
vfs_subr_init(void)
{
	int factor1;
	int factor2;

	/*
	 * Desiredvnodes is kern.maxvnodes.  We want to scale it
	 * according to available system memory but we may also have
	 * to limit it based on available KVM, which is capped on 32 bit
	 * systems.
	 *
	 * WARNING!  For machines with 64-256M of ram we have to be sure
	 *	     that the default limit scales down well due to HAMMER
	 *	     taking up significantly more memory per-vnode vs UFS.
	 *	     We want around ~5800 on a 128M machine.
	 */
	factor1 = 20 * (sizeof(struct vm_object) + sizeof(struct vnode));
	factor2 = 22 * (sizeof(struct vm_object) + sizeof(struct vnode));
	desiredvnodes =
		imin((int64_t)vmstats.v_page_count * PAGE_SIZE / factor1,
		     KvaSize / factor2);
	desiredvnodes = imax(desiredvnodes, maxproc * 8);

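	/*
	 * Worked example (added commentary; the struct sizes are assumed,
	 * not taken from the source): if sizeof(struct vm_object) +
	 * sizeof(struct vnode) came to ~1150 bytes, factor1 would be
	 * about 23000, so a 128MB machine (v_page_count * PAGE_SIZE ==
	 * 134217728) would get 134217728 / 23000 ~= 5835 vnodes, in
	 * line with the ~5800 target in the WARNING above.
	 */
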
	lwkt_token_init(&spechash_token, 1);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	&timestamp_precision, 0, "");
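
/*
 * Usage note (added commentary): the knob above is exposed as the
 * vfs.timestamp_precision sysctl, so an administrator can select,
 * e.g., microsecond-truncated timestamps at runtime with:
 *
 *	sysctl vfs.timestamp_precision=2
 */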

/*
 * Get a current timestamp.
 *
 * MPSAFE
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rmajor = VNOVAL;
	vap->va_rminor = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	/* va_*_uuid fields are only valid if related flags are set */
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 *
 * vp must be locked.
 */
static int vinvalbuf_bp(struct buf *bp, void *data);

struct vinvalbuf_bp_info {
	struct vnode *vp;
	int slptimeo;
	int lkflags;
	int flags;
};

int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{
	struct vinvalbuf_bp_info info;
	vm_object_t object;
	int error;

	lwkt_gettoken(&vp->v_token);

	/*
	 * If we are being asked to save, call fsync to ensure that the inode
	 * is updated.
	 */
	if (flags & V_SAVE) {
		error = bio_track_wait(&vp->v_track_write, slpflag, slptimeo);
		if (error)
			goto done;
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)) != 0)
				goto done;

			/*
			 * Dirty bufs may be left or generated via races
			 * in circumstances where vinvalbuf() is called on
			 * a vnode not undergoing reclamation.  Only
			 * panic if we are trying to reclaim the vnode.
			 */
			if ((vp->v_flag & VRECLAIMED) &&
			    (bio_track_active(&vp->v_track_write) ||
			     !RB_EMPTY(&vp->v_rbdirty_tree))) {
				panic("vinvalbuf: dirty bufs");
			}
		}
	}
	info.slptimeo = slptimeo;
	info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
	if (slpflag & PCATCH)
		info.lkflags |= LK_PCATCH;
	info.flags = flags;
	info.vp = vp;

	/*
	 * Flush the buffer cache until nothing is left.
	 */
	while (!RB_EMPTY(&vp->v_rbclean_tree) ||
	       !RB_EMPTY(&vp->v_rbdirty_tree)) {
		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
				vinvalbuf_bp, &info);
		if (error == 0) {
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vinvalbuf_bp, &info);
		}
	}

	/*
	 * Wait for I/O completion.  We may block in the pip code so we have
	 * to re-check.
	 */
	do {
		bio_track_wait(&vp->v_track_write, 0, 0);
		if ((object = vp->v_object) != NULL) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (bio_track_active(&vp->v_track_write));

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if ((object = vp->v_object) != NULL) {
		vm_object_page_remove(object, 0, 0,
				      (flags & V_SAVE) ? TRUE : FALSE);
	}

	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
		panic("vinvalbuf: flush failed");
	if (!RB_EMPTY(&vp->v_rbhash_tree))
		panic("vinvalbuf: flush failed, buffers still present");
	error = 0;
done:
	lwkt_reltoken(&vp->v_token);
	return (error);
}
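
/*
 * Usage note (added commentary): callers invalidate a vnode's buffers
 * while holding the vnode lock, e.g. vclean_vxlocked() below uses:
 *
 *	vinvalbuf(vp, V_SAVE, 0, 0);
 *
 * where V_SAVE forces dirty buffers out to disk first; passing 0 for
 * flags discards them instead.
 */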

static int
vinvalbuf_bp(struct buf *bp, void *data)
{
	struct vinvalbuf_bp_info *info = data;
	int error;

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		error = BUF_TIMELOCK(bp, info->lkflags,
				     "vinvalbuf", info->slptimeo);
		if (error == 0) {
			BUF_UNLOCK(bp);
			error = ENOLCK;
		}
		if (error == ENOLCK)
			return(0);
		return (-error);
	}

	KKASSERT(bp->b_vp == info->vp);

	/*
	 * XXX Since there are no node locks for NFS, I
	 * believe there is a slight chance that a delayed
	 * write will occur while sleeping just above, so
	 * check for it.  Note that vfs_bio_awrite expects
	 * buffers to reside on a queue, while bwrite() and
	 * brelse() do not.
	 *
	 * NOTE:  NO B_LOCKED CHECK.  Also no buf_checkwrite()
	 * check.  This code will write out the buffer, period.
	 */
	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
	    (info->flags & V_SAVE)) {
		if (bp->b_vp == info->vp) {
			if (bp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(bp);
			} else {
				bremfree(bp);
				bawrite(bp);
			}
		} else {
			bremfree(bp);
			bwrite(bp);
		}
	} else if (info->flags & V_SAVE) {
		/*
		 * Cannot set B_NOCACHE on a clean buffer as this will
		 * destroy the VM backing store which might actually
		 * be dirty (and unsynchronized).
		 */
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		brelse(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		brelse(bp);
	}
	return(0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked.
 */
static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
	off_t truncloffset;
	const char *filename;
	int count;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 */
	if ((count = (int)(length % blksize)) != 0)
		truncloffset = length + (blksize - count);
	else
		truncloffset = length;
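
	/*
	 * Worked example (added commentary): with length == 1536 and
	 * blksize == 1024, count == 512 and truncloffset == 1536 +
	 * (1024 - 512) == 2048, i.e. destruction starts at the first
	 * block boundary at or beyond the new end of file.
	 */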

	lwkt_gettoken(&vp->v_token);
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
	} while(count);

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					vtruncbuf_bp_metasync_cmp,
					vtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * Clean out any left over VM backing store.
	 *
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	vnode_pager_setsize(vp, length);
	bio_track_wait(&vp->v_track_write, 0, 0);

	/*
	 * Debugging only
	 */
	spin_lock_wr(&vp->v_spinlock);
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
	spin_unlock_wr(&vp->v_spinlock);

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		if (count) {
			kprintf("Warning: vtruncbuf(): Had to re-clean %d "
				"left over buffers in %s\n", count, filename);
		}
	} while(count);

	lwkt_reltoken(&vp->v_token);

	return (0);
}

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static
int
vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset >= *(off_t *)data)
		return(0);
	return(-1);
}

static
int
vtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
vtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
			if (bp->b_vp == vp)
				bawrite(bp);
			else
				bwrite(bp);
		}
		return(1);
	} else {
		return(0);
	}
}

/*
 * vfsync - implements a multipass fsync on a file which understands
 * dependencies and meta-data.  The passed vnode must be locked.  The
 * waitfor argument may be MNT_WAIT or MNT_NOWAIT, or MNT_LAZY.
 *
 * When fsyncing data asynchronously just do one consolidated pass starting
 * with the most negative block number.  This may not get all the data due
 * to dependencies.
 *
 * When fsyncing data synchronously do a data pass, then a metadata pass,
 * then do additional data+metadata passes to try to get all the data out.
 */
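
/*
 * Usage sketch (added commentary; the helper names are hypothetical):
 * a filesystem's fsync VOP might drive the synchronous path with
 *
 *	error = vfsync(vp, MNT_WAIT, 3, xxx_checkdef, xxx_waitoutput);
 *
 * where xxx_checkdef() returns non-zero while a buffer still carries
 * unresolved dependencies and xxx_waitoutput() waits for the vnode's
 * write I/O to drain.
 */
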
static int vfsync_wait_output(struct vnode *vp,
		    int (*waitoutput)(struct vnode *, struct thread *));
static int vfsync_data_only_cmp(struct buf *bp, void *data);
static int vfsync_meta_only_cmp(struct buf *bp, void *data);
static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
static int vfsync_bp(struct buf *bp, void *data);

struct vfsync_info {
	struct vnode *vp;
	int synchronous;
	int syncdeps;
	int lazycount;
	int lazylimit;
	int skippedbufs;
	int (*checkdef)(struct buf *);
};

int
vfsync(struct vnode *vp, int waitfor, int passes,
       int (*checkdef)(struct buf *),
       int (*waitoutput)(struct vnode *, struct thread *))
{
	struct vfsync_info info;
	int error;

	bzero(&info, sizeof(info));
	info.vp = vp;
	if ((info.checkdef = checkdef) == NULL)
		info.syncdeps = 1;

	lwkt_gettoken(&vp->v_token);

	switch(waitfor) {
	case MNT_LAZY:
		/*
		 * Lazy (typical of the filesystem syncer).  Asynchronous,
		 * plus limit the number of data (not meta) pages we try to
		 * flush to 1MB.  A non-zero return means that the lazy limit
		 * was reached.
		 */
		info.lazylimit = 1024 * 1024;
		info.syncdeps = 1;
		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vfsync_lazy_range_cmp, vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
			vfsync_meta_only_cmp, vfsync_bp, &info);
		if (error == 0)
			vp->v_lazyw = 0;
		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
			vn_syncer_add_to_worklist(vp, 1);
		error = 0;
		break;
	case MNT_NOWAIT:
		/*
		 * Asynchronous.  Do a data-only pass and a meta-only pass.
		 */
		info.syncdeps = 1;
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
			vfsync_bp, &info);
		error = 0;
		break;
	default:
		/*
		 * Synchronous.  Do a data-only pass, then a meta-data+data
		 * pass, then additional integrated passes to try to get
		 * all the dependencies flushed.
		 */
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		error = vfsync_wait_output(vp, waitoutput);
		if (error == 0) {
			info.skippedbufs = 0;
			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
				vfsync_bp, &info);
			error = vfsync_wait_output(vp, waitoutput);
			if (info.skippedbufs)
				kprintf("Warning: vfsync skipped %d dirty "
					"bufs in pass2!\n", info.skippedbufs);
		}
		while (error == 0 && passes > 0 &&
		       !RB_EMPTY(&vp->v_rbdirty_tree)
		) {
			if (--passes == 0) {
				info.synchronous = 1;
				info.syncdeps = 1;
			}
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vfsync_bp, &info);
			if (error < 0)
				error = -error;
			info.syncdeps = 1;
			if (error == 0)
				error = vfsync_wait_output(vp, waitoutput);
		}
		break;
	}
	lwkt_reltoken(&vp->v_token);
	return(error);
}

static int
vfsync_wait_output(struct vnode *vp,
		   int (*waitoutput)(struct vnode *, struct thread *))
{
	int error;

	error = bio_track_wait(&vp->v_track_write, 0, 0);
	if (waitoutput)
		error = waitoutput(vp, curthread);
	return(error);
}

static int
vfsync_data_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(-1);
	return(0);
}

static int
vfsync_meta_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
vfsync_lazy_range_cmp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	if (bp->b_loffset < info->vp->v_lazyw)
		return(-1);
	return(0);
}

static int
vfsync_bp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	struct vnode *vp = info->vp;
	int error;

	/*
	 * if syncdeps is not set we do not try to write buffers which have
	 * dependencies.
	 */
	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
		return(0);

	/*
	 * Ignore buffers that we cannot immediately lock.  XXX
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		kprintf("Warning: vfsync_bp skipping dirty buffer %p\n", bp);
		++info->skippedbufs;
		return(0);
	}
	if ((bp->b_flags & B_DELWRI) == 0)
		panic("vfsync_bp: buffer not dirty");
	if (vp != bp->b_vp)
		panic("vfsync_bp: buffer vp mismatch");

	/*
	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
	 * has been written but an additional handshake with the device
	 * is required before we can dispose of the buffer.  We have no idea
	 * how to do this so we have to skip these buffers.
	 */
	if (bp->b_flags & B_NEEDCOMMIT) {
		BUF_UNLOCK(bp);
		return(0);
	}

	/*
	 * Ask bioops if it is ok to sync
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
		bremfree(bp);
		brelse(bp);
		return(0);
	}

	if (info->synchronous) {
		/*
		 * Synchronous flushing.  An error may be returned.
		 */
		bremfree(bp);
		error = bwrite(bp);
	} else {
		/*
		 * Asynchronous flushing.  A negative return value simply
		 * stops the scan and is not considered an error.  We use
		 * this to support limited MNT_LAZY flushes.
		 */
		vp->v_lazyw = bp->b_loffset;
		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
			info->lazycount += vfs_bio_awrite(bp);
		} else {
			info->lazycount += bp->b_bufsize;
			bremfree(bp);
			bawrite(bp);
		}
		if (info->lazylimit && info->lazycount >= info->lazylimit)
			error = 1;
		else
			error = 0;
	}
	return(-error);
}

/*
 * Associate a buffer with a vnode.
 *
 * MPSAFE
 */
int
bgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0);

	/*
	 * Insert onto list for new vnode.
	 */
	lwkt_gettoken(&vp->v_token);
	if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) {
		lwkt_reltoken(&vp->v_token);
		return (EEXIST);
	}
	bp->b_vp = vp;
	bp->b_flags |= B_HASHED;
	bp->b_flags |= B_VNCLEAN;
	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
		panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
	vhold(vp);
	lwkt_reltoken(&vp->v_token);
	return(0);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	lwkt_gettoken(&vp->v_token);
	if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN)) {
		if (bp->b_flags & B_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
		bp->b_flags &= ~(B_VNDIRTY | B_VNCLEAN);
	}
	if (bp->b_flags & B_HASHED) {
		buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
		bp->b_flags &= ~B_HASHED;
	}
	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
		vclrflags(vp, VONWORKLST);
		LIST_REMOVE(vp, v_synclist);
	}
	bp->b_vp = NULL;
	lwkt_reltoken(&vp->v_token);

	vdrop(vp);
}

/*
 * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
 * This routine is called when the state of the B_DELWRI bit is changed.
 *
 * MPSAFE
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;
	int delay;

	KKASSERT(vp != NULL);
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	lwkt_gettoken(&vp->v_token);
	if (bp->b_flags & B_DELWRI) {
		/*
		 * Move to the dirty list, add the vnode to the worklist
		 */
		if (bp->b_flags & B_VNCLEAN) {
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
			bp->b_flags &= ~B_VNCLEAN;
		}
		if ((bp->b_flags & B_VNDIRTY) == 0) {
			if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
				panic("reassignbuf: dup lblk vp %p bp %p",
				      vp, bp);
			}
			bp->b_flags |= B_VNDIRTY;
		}
		if ((vp->v_flag & VONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (vp->v_rdev &&
				    vp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(vp, delay);
		}
	} else {
		/*
		 * Move to the clean list, remove the vnode from the worklist
		 * if no dirty blocks remain.
		 */
		if (bp->b_flags & B_VNDIRTY) {
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
			bp->b_flags &= ~B_VNDIRTY;
		}
		if ((bp->b_flags & B_VNCLEAN) == 0) {
			if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
				panic("reassignbuf: dup lblk vp %p bp %p",
				      vp, bp);
			}
			bp->b_flags |= B_VNCLEAN;
		}
		if ((vp->v_flag & VONWORKLST) &&
		    RB_EMPTY(&vp->v_rbdirty_tree)) {
			vclrflags(vp, VONWORKLST);
			LIST_REMOVE(vp, v_synclist);
		}
	}
	lwkt_reltoken(&vp->v_token);
}
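
/*
 * Note (added commentary): reassignbuf() is not normally called by
 * filesystems directly; the buffer cache invokes it when B_DELWRI
 * changes state, so marking a buffer delayed-write moves it to
 * v_rbdirty_tree (and queues the vnode for the syncer) while cleaning
 * it moves it back to v_rbclean_tree (see the callers in
 * kern/vfs_bio.c).
 */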

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
extern struct vop_ops *devfs_vnode_dev_vops_p;
int
bdevvp(cdev_t dev, struct vnode **vpp)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NULL) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getspecialvnode(VT_NON, NULL, &devfs_vnode_dev_vops_p,
				&nvp, 0, 0);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
#if 0
	vp->v_rdev = dev;
#endif
	v_associate_rdev(vp, dev);
	vp->v_umajor = dev->si_umajor;
	vp->v_uminor = dev->si_uminor;
	vx_unlock(vp);
	*vpp = vp;
	return (0);
}

int
v_associate_rdev(struct vnode *vp, cdev_t dev)
{
	if (dev == NULL)
		return(ENXIO);
	if (dev_is_good(dev) == 0)
		return(ENXIO);
	KKASSERT(vp->v_rdev == NULL);
	vp->v_rdev = reference_dev(dev);
	lwkt_gettoken(&spechash_token);
	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_cdevnext);
	lwkt_reltoken(&spechash_token);
	return(0);
}

void
v_release_rdev(struct vnode *vp)
{
	cdev_t dev;

	if ((dev = vp->v_rdev) != NULL) {
		lwkt_gettoken(&spechash_token);
		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_cdevnext);
		vp->v_rdev = NULL;
		release_dev(dev);
		lwkt_reltoken(&spechash_token);
	}
}

/*
 * Add a vnode to the alias list hung off the cdev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, int x, int y)
{
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	nvp->v_umajor = x;
	nvp->v_uminor = y;
}

/*
 * Simple call that a filesystem can make to try to get rid of a
 * vnode.  It will fail if anyone is referencing the vnode (including
 * the caller).
 *
 * The filesystem can check whether its in-memory inode structure still
 * references the vp on return.
 */
void
vclean_unlocked(struct vnode *vp)
{
	vx_get(vp);
	if (sysref_isactive(&vp->v_sysref) == 0)
		vgone_vxlocked(vp);
	vx_put(vp);
}

/*
 * Disassociate a vnode from its underlying filesystem.
 *
 * The vnode must be VX locked and referenced.  In all normal situations
 * there are no active references.  If vclean_vxlocked() is called while
 * there are active references, the vnode is being ripped out and we have
 * to call VOP_CLOSE() as appropriate before we can reclaim it.
 */
void
vclean_vxlocked(struct vnode *vp, int flags)
{
	int active;
	int n;
	vm_object_t object;

	/*
	 * If the vnode has already been reclaimed we have nothing to do.
	 */
	if (vp->v_flag & VRECLAIMED)
		return;
	vsetflags(vp, VRECLAIMED);

	/*
	 * Scrap the vfs cache
	 */
	while (cache_inval_vp(vp, 0) != 0) {
		kprintf("Warning: vnode %p clean/cache_resolution "
			"race detected\n", vp);
		tsleep(vp, 0, "vclninv", 2);
	}

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	active = sysref_isactive(&vp->v_sysref);

	/*
	 * Clean out any buffers associated with the vnode and destroy its
	 * object, if it has one.
	 */
	vinvalbuf(vp, V_SAVE, 0, 0);

	/*
	 * If purging an active vnode (typically during a forced unmount
	 * or reboot), it must be closed and deactivated before being
	 * reclaimed.  This isn't really all that safe, but what can
	 * we do? XXX.
	 *
	 * Note that neither of these routines unlocks the vnode.
	 */
	if (active && (flags & DOCLOSE)) {
		while ((n = vp->v_opencount) != 0) {
			if (vp->v_writecount)
				VOP_CLOSE(vp, FWRITE|FNONBLOCK);
			else
				VOP_CLOSE(vp, FNONBLOCK);
			if (vp->v_opencount == n) {
				kprintf("Warning: unable to force-close"
					" vnode %p\n", vp);
				break;
			}
		}
	}

	/*
	 * If the vnode has not been deactivated, deactivate it.  Deactivation
	 * can create new buffers and VM pages so we have to call vinvalbuf()
	 * again to make sure they all get flushed.
	 *
	 * This can occur if a file with a link count of 0 needs to be
	 * truncated.
	 *
	 * If the vnode is already dead don't try to deactivate it.
	 */
	if ((vp->v_flag & VINACTIVE) == 0) {
		vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * If the vnode has an object, destroy it.
	 */
	if ((object = vp->v_object) != NULL) {
		if (object->ref_count == 0) {
			if ((object->flags & OBJ_DEAD) == 0)
				vm_object_terminate(object);
		} else {
			vm_pager_deallocate(object);
		}
		vclrflags(vp, VOBJBUF);
	}
	KKASSERT((vp->v_flag & VOBJBUF) == 0);

	/*
	 * Reclaim the vnode if not already dead.
	 */
	if (vp->v_mount && VOP_RECLAIM(vp))
		panic("vclean: cannot reclaim");

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_ops = &dead_vnode_vops_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;

	/*
	 * If we are destroying an active vnode, reactivate it now that
	 * we have reassociated it with deadfs.  This prevents the system
	 * from crashing on the vnode due to it being unexpectedly marked
	 * as inactive or reclaimed.
	 */
	if (active && (flags & DOCLOSE)) {
		vclrflags(vp, VINACTIVE | VRECLAIMED);
	}
}
/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * The vnode must be referenced but should not be locked.
 */
int
vrevoke(struct vnode *vp, struct ucred *cred)
{
	struct vnode *vq;
	struct vnode *vqn;
	cdev_t dev;
	int error;

	/*
	 * If the vnode has a device association, scrap all vnodes associated
	 * with the device.  Don't let the device disappear on us while we
	 * are scrapping the vnodes.
	 *
	 * The passed vp will probably show up in the list, do not VX lock
	 * it twice!
	 *
	 * Releasing the vnode's rdev here can mess up specfs's call to
	 * device close, so don't do it.  The vnode has been disassociated
	 * and the device will be closed after the last ref on the related
	 * fp goes away (if not still open by e.g. the kernel).
	 */
	if (vp->v_type != VCHR) {
		error = fdrevoke(vp, DTYPE_VNODE, cred);
		return (error);
	}
	if ((dev = vp->v_rdev) == NULL) {
		return(0);
	}
	reference_dev(dev);
	lwkt_gettoken(&spechash_token);

	vqn = SLIST_FIRST(&dev->si_hlist);
	if (vqn)
		vref(vqn);
	while ((vq = vqn) != NULL) {
		vqn = SLIST_NEXT(vqn, v_cdevnext);
		if (vqn)
			vref(vqn);
		fdrevoke(vq, DTYPE_VNODE, cred);
		/*v_release_rdev(vq);*/
		vrele(vq);
	}
	lwkt_reltoken(&spechash_token);
	dev_drevoke(dev);
	release_dev(dev);
	return (0);
}

/*
 * This is called when the object underlying a vnode is being destroyed,
 * such as in a remove().  Try to recycle the vnode immediately if the
 * only active reference is our reference.
 *
 * Directory vnodes in the namecache with children cannot be immediately
 * recycled because numerous VOP_N*() ops require them to be stable.
 *
 * To avoid recursive recycling from VOP_INACTIVE implementations this
 * function is a NOP if VRECLAIMED is already set.
 */
int
vrecycle(struct vnode *vp)
{
	if (vp->v_sysref.refcnt <= 1 && (vp->v_flag & VRECLAIMED) == 0) {
		if (cache_inval_vp_nonblock(vp))
			return(0);
		vgone_vxlocked(vp);
		return (1);
	}
	return (0);
}

/*
 * Return the maximum I/O size allowed for strategy calls on VP.
 *
 * If vp is VCHR or VBLK we dive the device, otherwise we use
 * the vp's mount info.
 */
int
vmaxiosize(struct vnode *vp)
{
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		return(vp->v_rdev->si_iosize_max);
	} else {
		return(vp->v_mount->mnt_iosize_max);
	}
}

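/*
 * Usage sketch (added commentary, hypothetical caller): I/O paths can
 * use this to clamp the size of an individual strategy call, e.g.:
 *
 *	maxiosize = vmaxiosize(vp);
 *	chunk = (resid > maxiosize) ? maxiosize : resid;
 */
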
/*
 * Eliminate all activity associated with a vnode in preparation for reuse.
 *
 * The vnode must be VX locked and refd and will remain VX locked and refd
 * on return.  This routine may be called with the vnode in any state, as
 * long as it is VX locked.  The vnode will be cleaned out and marked
 * VRECLAIMED but will not actually be reused until all existing refs and
 * holds go away.
 *
 * NOTE: This routine may be called on a vnode which has not yet been
 * deactivated (VOP_INACTIVE), or on a vnode which has already been
 * reclaimed.
 *
 * This routine is not responsible for placing us back on the freelist.
 * Instead, it happens automatically when the caller releases the VX lock
 * (assuming there aren't any other references).
 */
void
vgone_vxlocked(struct vnode *vp)
{
	/*
	 * assert that the VX lock is held.  This is an absolute requirement
	 * now for vgone_vxlocked() to be called.
	 */
	KKASSERT(vp->v_lock.lk_exclusivecount == 1);

	get_mplock();

	/*
	 * Clean out the filesystem specific data and set the VRECLAIMED
	 * bit.  Also deactivate the vnode if necessary.
	 */
	vclean_vxlocked(vp, DOCLOSE);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KKASSERT(vp->v_data == NULL);
		insmntque(vp, NULL);
	}

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.  This should normally only occur if a vnode is
	 * being revoked as the device should otherwise have been released
	 * naturally.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		v_release_rdev(vp);
	}

	/*
	 * Set us to VBAD
	 */
	vp->v_type = VBAD;
	rel_mplock();
}

/*
 * Lookup a vnode by device number.
 *
 * Returns non-zero and *vpp set to a vref'd vnode on success.
 * Returns zero on failure.
 */
int
vfinddev(cdev_t dev, enum vtype type, struct vnode **vpp)
{
	struct vnode *vp;

	lwkt_gettoken(&spechash_token);
	SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			vref(vp);
			lwkt_reltoken(&spechash_token);
			return (1);
		}
	}
	lwkt_reltoken(&spechash_token);
	return (0);
}

/*
 * Calculate the total number of references to a special device.  This
 * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 * an overloaded field.  Since udev2dev can now return NULL, we have
 * to check for a NULL v_rdev.
 */
int
count_dev(cdev_t dev)
{
	struct vnode *vp;
	int count = 0;

	if (SLIST_FIRST(&dev->si_hlist)) {
		lwkt_gettoken(&spechash_token);
		SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) {
			count += vp->v_opencount;
		}
		lwkt_reltoken(&spechash_token);
	}
	return(count);
}

int
vcount(struct vnode *vp)
{
	if (vp->v_rdev == NULL)
		return(0);
	return(count_dev(vp->v_rdev));
}

/*
 * Initialize VMIO for a vnode.  This routine MUST be called before a
 * VFS can issue buffer cache ops on a vnode.  It is typically called
 * when a vnode is initialized from its inode.
 */
int
vinitvmio(struct vnode *vp, off_t filesize, int blksize, int boff)
{
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		object = vnode_pager_alloc(vp, filesize, 0, 0, blksize, boff);
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vrele(vp);
	} else {
		if (object->flags & OBJ_DEAD) {
			vn_unlock(vp);
			vm_object_dead_sleep(object, "vodead");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			goto retry;
		}
	}
	KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object"));
	vsetflags(vp, VOBJBUF);
	return (error);
}
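
/*
 * Usage sketch (added commentary; the field and parameter values are
 * hypothetical): a filesystem typically enables VMIO right after
 * constructing a vnode from its on-disk inode, e.g.:
 *
 *	if (vp->v_type == VREG || vp->v_type == VDIR)
 *		vinitvmio(vp, ip->i_size, blksize, -1);
 */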

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(char *label, struct vnode *vp)
{
	char buf[96];

	if (label != NULL)
		kprintf("%s: %p: ", label, (void *)vp);
	else
		kprintf("%p: ", (void *)vp);
	kprintf("type %s, sysrefs %d, writecount %d, holdcnt %d,",
		typename[vp->v_type],
		vp->v_sysref.refcnt, vp->v_writecount, vp->v_auxrefs);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VPFSROOT)
		strcat(buf, "|VPFSROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		kprintf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		kprintf("\n");
	} else {
		kprintf("\n\t");
		VOP_PRINT(vp);
	}
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list
 */
int
vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
	mode_t acc_mode, struct ucred *cred)
{
	mode_t mask;
	int ismember;

	/*
	 * Super-user always gets read/write access, but execute access depends
	 * on at least one execute bit being set.
	 */
	if (priv_check_cred(cred, PRIV_ROOT, 0) == 0) {
		if ((acc_mode & VEXEC) && type != VDIR &&
		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
			return (EACCES);
		return (0);
	}

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check the groups. */
	ismember = groupmember(gid, cred);
	if (cred->cr_svgid == gid || ismember) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return ((file_mode & mask) == mask ? 0 : EACCES);
}
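
/*
 * Worked example (added commentary): for a file with file_mode 0640
 * owned by the caller (cred->cr_uid == uid), a VREAD|VWRITE request
 * builds mask = S_IRUSR|S_IWUSR = 0600, and (0640 & 0600) == 0600, so
 * access is granted.  The same request from an unrelated user builds
 * mask = 0006 and fails with EACCES because (0640 & 0006) != 0006.
 */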

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	kprintf("Locked vnodes\n");
	mountlist_scan(db_show_locked_vnodes, NULL,
		       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
	struct vnode *vp;

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		if (vn_islocked(vp))
			vprint(NULL, vp);
	}
	return(0);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;
	int maxtypenum;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		vfsp = vfsconf_find_by_typenum(name[0]);
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		maxtypenum = vfsconf_get_maxtypenum();
		return (SYSCTL_OUT(req, &maxtypenum, sizeof(maxtypenum)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		vfsp = vfsconf_find_by_typenum(name[2]);
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf_iter(struct vfsconf *vfsp, void *data)
{
	int error;
	struct ovfsconf ovfs;
	struct sysctl_req *req = (struct sysctl_req*) data;

	bzero(&ovfs, sizeof(ovfs));
	ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	ovfs.vfc_index = vfsp->vfc_typenum;
	ovfs.vfc_refcount = vfsp->vfc_refcount;
	ovfs.vfc_flags = vfsp->vfc_flags;
	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	if (error)
		return error;	/* abort iteration with error code */
	else
		return 0;	/* continue iterating with next element */
}

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	return vfsconf_each(sysctl_ovfs_conf_iter, (void*)req);
}

#endif /* 1 || COMPAT_PRELITE2 */

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	cdev_t dev;

	if ((dev = vp->v_rdev) == NULL) {
/*		if (vp->v_type != VBLK)
			dev = get_dev(vp->v_uminor, vp->v_umajor); */
	}
	if (dev != NULL && dev->si_mountpoint)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */

static int vfs_umountall_callback(struct mount *mp, void *data);

void
vfs_unmountall(void)
{
	int count;

	do {
		count = mountlist_scan(vfs_umountall_callback,
				       NULL, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
	} while (count);
}

static
int
vfs_umountall_callback(struct mount *mp, void *data)
{
	int error;

	error = dounmount(mp, MNT_FORCE);
	if (error) {
		mountlist_remove(mp);
		kprintf("unmount of filesystem mounted from %s failed (",
			mp->mnt_stat.f_mntfromname);
		if (error == EBUSY)
			kprintf("BUSY)\n");
		else
			kprintf("%d)\n", error);
	}
	return(1);
}

/*
 * Checks the mount flags for parameter mp and puts the names comma-separated
 * into a string buffer buf with a size limit specified by len.
 *
 * It returns the number of bytes written into buf, and (*errorp) will be
 * set to 0, EINVAL (if passed length is 0), or ENOSPC (supplied buffer was
 * not large enough).  The buffer will be 0-terminated if len was not 0.
 */
size_t
vfs_flagstostr(int flags, const struct mountctl_opt *optp,
	       char *buf, size_t len, int *errorp)
{
	static const struct mountctl_opt optnames[] = {
		{ MNT_ASYNC,		"asynchronous" },
		{ MNT_EXPORTED,		"NFS exported" },
		{ MNT_LOCAL,		"local" },
		{ MNT_NOATIME,		"noatime" },
		{ MNT_NODEV,		"nodev" },
		{ MNT_NOEXEC,		"noexec" },
		{ MNT_NOSUID,		"nosuid" },
		{ MNT_NOSYMFOLLOW,	"nosymfollow" },
		{ MNT_QUOTA,		"with-quotas" },
		{ MNT_RDONLY,		"read-only" },
		{ MNT_SYNCHRONOUS,	"synchronous" },
		{ MNT_UNION,		"union" },
		{ MNT_NOCLUSTERR,	"noclusterr" },
		{ MNT_NOCLUSTERW,	"noclusterw" },
		{ MNT_SUIDDIR,		"suiddir" },
		{ MNT_SOFTDEP,		"soft-updates" },
		{ MNT_IGNORE,		"ignore" },
		{ 0,			NULL}
	};
	int bwritten;
	int bleft;
	int optlen;
	int actsize;

	*errorp = 0;
	bwritten = 0;
	bleft = len - 1;	/* leave room for trailing \0 */

	/*
	 * Checks the size of the string.  If it contains
	 * any data, then we will append the new flags to
	 * it.
	 */
	actsize = strlen(buf);
	if (actsize > 0)
		buf += actsize;

	/* Default flags if no flags passed */
	if (optp == NULL)
		optp = optnames;

	if (bleft < 0) {	/* degenerate case, 0-length buffer */
		*errorp = EINVAL;
		return(0);
	}

	for (; flags && optp->o_opt; ++optp) {
		if ((flags & optp->o_opt) == 0)
			continue;
		optlen = strlen(optp->o_name);
		if (bwritten || actsize > 0) {
			if (bleft < 2) {
				*errorp = ENOSPC;
				break;
			}
			buf[bwritten++] = ',';
			buf[bwritten++] = ' ';
			bleft -= 2;
		}
		if (bleft < optlen) {
			*errorp = ENOSPC;
			break;
		}
		bcopy(optp->o_name, buf + bwritten, optlen);
		bwritten += optlen;
		bleft -= optlen;
		flags &= ~optp->o_opt;
	}

	/*
	 * Space already reserved for trailing \0
	 */
	buf[bwritten] = 0;
	return (bwritten);
}
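
/*
 * Usage sketch (added commentary): render a mount's flags with the
 * default option table; note the buffer must start 0-terminated since
 * the routine appends to any existing contents:
 *
 *	char buf[128];
 *	int error;
 *
 *	buf[0] = '\0';
 *	vfs_flagstostr(mp->mnt_flag, NULL, buf, sizeof(buf), &error);
 *
 * after which buf holds, e.g., "local, read-only, noatime".
 */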
1774
177403a9 1775/*
984263bc
MD
1776 * Build hash lists of net addresses and hang them off the mount point.
1777 * Called by ufs_mount() to set up the lists of export addresses.
1778 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
		  const struct export_args *argp)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
		return (EINVAL);
	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) kmalloc(i, M_NETADDR, M_WAITOK | M_ZERO);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used; do so on demand here.
		 */
		SLIST_FOREACH(dom, &domains, dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	kfree(np, M_NETADDR);
	return (error);
}
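
/*
 * Layout sketch (annotation, not part of the original source) of the
 * single allocation made by vfs_hang_addrlist() above.  The address and
 * mask are packed directly behind the netcred, so the one kfree() in
 * the error path releases everything:
 *
 *	np    -> +---------------------------+
 *	         | struct netcred            |
 *	saddr -> +---------------------------+  (struct sockaddr *)(np + 1)
 *	         | ex_addrlen address bytes  |
 *	smask -> +---------------------------+  (caddr_t)saddr + ex_addrlen
 *	         | ex_masklen mask bytes     |
 *	         +---------------------------+
 */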

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	kfree((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			kfree((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(struct mount *mp, struct netexport *nep,
	   const struct export_args *argp)
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
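
/*
 * Illustrative sketch (not part of the original source) of how a
 * filesystem's mount path might drive vfs_export().  "nep" is assumed
 * to be the per-filesystem netexport structure; passing ex_addrlen == 0
 * takes the default-export path in vfs_hang_addrlist() above:
 *
 *	struct export_args ea;
 *	int error;
 *
 *	bzero(&ea, sizeof(ea));
 *	ea.ex_flags = MNT_EXPORTED;
 *	error = vfs_export(mp, nep, &ea);
 */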

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
		const struct export_args *argp)
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		int namelen;

		error = vn_get_namelen(rvp, &namelen);
		if (error)
			return (error);
		MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    namelen, NULL);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep,
		  struct sockaddr *nam)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((char *)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}
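
/*
 * Illustrative sketch (not part of the original source): an NFS-style
 * server would validate a client before honoring a request, using the
 * per-export anonymous credentials on a match.  "nep" is the
 * filesystem's netexport and "nam" the client's socket address:
 *
 *	struct netcred *np;
 *
 *	np = vfs_export_lookup(mp, nep, nam);
 *	if (np == NULL)
 *		return (EACCES);
 *	cred = &np->netc_anon;
 */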

/*
 * Perform msync on all vnodes under a mount point.  The mount point
 * must be locked.  This code is also responsible for lazy-freeing
 * unreferenced vnodes whose VM objects no longer contain pages.
 *
 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 *
 * NOTE: XXX VOP_PUTPAGES and friends require that the vnode be locked,
 * but vnode_pager_putpages() doesn't lock the vnode.  We have to do it
 * way up in this high level function.
 */
static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

void
vfs_msync(struct mount *mp, int flags)
{
	int vmsc_flags;

	/*
	 * tmpfs sets this flag to prevent msync(), sync, and the
	 * filesystem periodic syncer from trying to flush VM pages
	 * to swap.  Only pure memory pressure flushes tmpfs VM pages
	 * to swap.
	 */
	if (mp->mnt_kern_flag & MNTK_NOMSYNC)
		return;

	/*
	 * Ok, scan the vnodes for work.
	 */
	vmsc_flags = VMSC_GETVP;
	if (flags != MNT_WAIT)
		vmsc_flags |= VMSC_NOWAIT;
	vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2,
	    (void *)(intptr_t)flags);
}

/*
 * scan1 is a fast pre-check.  There could be hundreds of thousands of
 * vnodes; we cannot afford to do anything heavy-weight until we have a
 * fairly good indication that there is work to do.
 */
static
int
vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	int flags = (int)(intptr_t)data;

	if ((vp->v_flag & VRECLAIMED) == 0) {
		if (vshouldmsync(vp))
			return(0);	/* call scan2 */
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    (vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || vn_islocked(vp) == 0)) {
			return(0);	/* call scan2 */
		}
	}

	/*
	 * do not call scan2, continue the loop
	 */
	return(-1);
}

/*
 * This callback is handed a locked vnode.
 */
static
int
vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	vm_object_t obj;
	int flags = (int)(intptr_t)data;

	if (vp->v_flag & VRECLAIMED)
		return(0);

	if ((mp->mnt_flag & MNT_RDONLY) == 0 && (vp->v_flag & VOBJDIRTY)) {
		if ((obj = vp->v_object) != NULL) {
			vm_object_page_clean(obj, 0, 0,
			    flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
		}
	}
	return(0);
}
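
/*
 * Illustrative sketch (not part of the original source): a periodic
 * syncer can flush opportunistically without stalling on locked vnodes,
 * while unmount wants everything written and waits:
 *
 *	vfs_msync(mp, MNT_NOWAIT);	- asynchronous, skips busy vnodes
 *	vfs_msync(mp, MNT_WAIT);	- synchronous, e.g. prior to unmount
 */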

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, int events)
{
	KKASSERT(curthread->td_proc != NULL);

	lwkt_gettoken(&vp->v_token);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		lwkt_reltoken(&vp->v_token);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(curthread, &vp->v_pollinfo.vpi_selinfo);
	lwkt_reltoken(&vp->v_token);
	return 0;
}
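
/*
 * Illustrative sketch (not part of the original source; field names per
 * the usual vop_poll_args layout): a filesystem's poll VOP with no
 * special events of its own would simply defer to vn_pollrecord():
 *
 *	static int
 *	myfs_poll(struct vop_poll_args *ap)
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_events));
 *	}
 */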

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(struct vnode *vp, int events)
{
	lwkt_gettoken(&vp->v_token);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&vp->v_token);
}
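
/*
 * Illustrative sketch (not part of the original source): code that
 * makes data available on a vnode notifies pollers like this.  The
 * VN_POLLEVENT() macro mentioned above makes the same call but only
 * after an unlocked pre-test of vpi_events, which is the race the
 * comment refers to:
 *
 *	vn_pollevent(vp, POLLIN | POLLRDNORM);
 */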

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(struct vnode *vp)
{
	lwkt_gettoken(&vp->v_token);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&vp->v_token);
}

/*
 * Extract the cdev_t from a VBLK or VCHR.  The vnode must have been
 * opened (or v_rdev might be NULL).
 */
cdev_t
vn_todev(struct vnode *vp)
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NULL);
	KKASSERT(vp->v_rdev != NULL);
	return (vp->v_rdev);
}

/*
 * Check if vnode represents a disk device.  The vnode does not need to
 * be opened.
 *
 * MPALMOSTSAFE
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
	cdev_t dev;

	if (vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}

	dev = vp->v_rdev;

	if (dev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (dev_is_good(dev) == 0) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if ((dev_dflags(dev) & D_DISK) == 0) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}
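
/*
 * Illustrative usage sketch (not part of the original source): mount
 * code typically verifies that a device vnode really is a disk before
 * using it, propagating the error vn_isdisk() selected:
 *
 *	int error;
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 */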

int
vn_get_namelen(struct vnode *vp, int *namelen)
{
	int error;
	register_t retval[2];

	error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
	if (error)
		return (error);
	*namelen = (int)retval[0];
	return (0);
}
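
/*
 * Illustrative usage sketch (not part of the original source); this
 * mirrors how vfs_setpublicfs() above sizes its index-file buffer:
 *
 *	int error, namelen;
 *	char *buf;
 *
 *	error = vn_get_namelen(vp, &namelen);
 *	if (error == 0)
 *		buf = kmalloc(namelen, M_TEMP, M_WAITOK);
 */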

int
vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
		 uint16_t d_namlen, const char *d_name)
{
	struct dirent *dp;
	size_t len;

	len = _DIRENT_RECLEN(d_namlen);
	if (len > uio->uio_resid)
		return(1);

	dp = kmalloc(len, M_TEMP, M_WAITOK | M_ZERO);

	dp->d_ino = d_ino;
	dp->d_namlen = d_namlen;
	dp->d_type = d_type;
	bcopy(d_name, dp->d_name, d_namlen);

	*error = uiomove((caddr_t)dp, len, uio);

	kfree(dp, M_TEMP);

	return(0);
}
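
/*
 * Illustrative sketch (not part of the original source): a filesystem
 * readdir loop emits entries until vop_write_dirent() reports that the
 * uio is full (non-zero return) or the uiomove itself fails (*error set):
 *
 *	while (have_more_entries) {
 *		if (vop_write_dirent(&error, ap->a_uio, ino, type,
 *				     namlen, name))
 *			break;
 *		if (error)
 *			break;
 *	}
 */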

void
vn_mark_atime(struct vnode *vp, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct ucred *cred = p ? p->p_ucred : proc0.p_ucred;

	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
		VOP_MARKATIME(vp, cred);
	}
}