gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	39	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	40	* $DragonFly: src/sys/kern/vfs_subr.c,v 1.50 2004/12/17 00:18:07 dillon Exp $
	41	*/
	42
	43	/*
	44	* External virtual filesystem routines
	45	*/
	46	#include "opt_ddb.h"
	47
	48	#include <sys/param.h>
	49	#include <sys/systm.h>
	50	#include <sys/buf.h>
	51	#include <sys/conf.h>
	52	#include <sys/dirent.h>
	53	#include <sys/domain.h>
	54	#include <sys/eventhandler.h>
	55	#include <sys/fcntl.h>
	56	#include <sys/kernel.h>
	57	#include <sys/kthread.h>
	58	#include <sys/malloc.h>
	59	#include <sys/mbuf.h>
	60	#include <sys/mount.h>
	61	#include <sys/proc.h>
	62	#include <sys/reboot.h>
	63	#include <sys/socket.h>
	64	#include <sys/stat.h>
	65	#include <sys/sysctl.h>
	66	#include <sys/syslog.h>
	67	#include <sys/vmmeter.h>
	68	#include <sys/vnode.h>
	69
	70	#include <machine/limits.h>
	71
	72	#include <vm/vm.h>
	73	#include <vm/vm_object.h>
	74	#include <vm/vm_extern.h>
	75	#include <vm/vm_kern.h>
	76	#include <vm/pmap.h>
	77	#include <vm/vm_map.h>
	78	#include <vm/vm_page.h>
	79	#include <vm/vm_pager.h>
	80	#include <vm/vnode_pager.h>
	81	#include <vm/vm_zone.h>
	82
	83	#include <sys/buf2.h>
	84	#include <sys/thread2.h>
	85
	86	static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
	87
	88	int numvnodes;
	89	SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
	90	int vfs_fastdev = 1;
	91	SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");
	92
	93	enum vtype iftovt_tab[16] = {
	94	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	95	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
	96	};
	97	int vttoif_tab[9] = {
	98	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	99	S_IFSOCK, S_IFIFO, S_IFMT,
	100	};
	101
	102	static int reassignbufcalls;
	103	SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	104	&reassignbufcalls, 0, "");
	105	static int reassignbufloops;
	106	SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	107	&reassignbufloops, 0, "");
	108	static int reassignbufsortgood;
	109	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	110	&reassignbufsortgood, 0, "");
	111	static int reassignbufsortbad;
	112	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	113	&reassignbufsortbad, 0, "");
	114	static int reassignbufmethod = 1;
	115	SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	116	&reassignbufmethod, 0, "");
	117
	118	int nfs_mount_type = -1;
	119	static struct lwkt_token spechash_token;
	120	struct nfs_public nfs_pub; /* publicly exported FS */
	121
	122	int desiredvnodes;
	123	SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	124	&desiredvnodes, 0, "Maximum number of vnodes");
	125
	126	static void vfs_free_addrlist (struct netexport *nep);
	127	static int vfs_free_netcred (struct radix_node rn, void w);
	128	static int vfs_hang_addrlist (struct mount mp, struct netexport nep,
	129	struct export_args *argp);
	130
	131	extern int dev_ref_debug;
	132	extern struct vnodeopv_entry_desc spec_vnodeop_entries[];
	133
	134	/*
	135	* Return 0 if the vnode is already on the free list or cannot be placed
	136	* on the free list. Return 1 if the vnode can be placed on the free list.
	137	*/
	138	static __inline int
	139	vshouldfree(struct vnode *vp, int usecount)
	140	{
	141	if (vp->v_flag & VFREE)
	142	return (0); /* already free */
	143	if (vp->v_holdcnt != 0 \|\| vp->v_usecount != usecount)
	144	return (0); /* other holderse */
	145	if (vp->v_object &&
	146	(vp->v_object->ref_count \|\| vp->v_object->resident_page_count)) {
	147	return (0);
	148	}
	149	return (1);
	150	}
	151
	152	/*
	153	* Initialize the vnode management data structures.
	154	*
	155	* Called from vfsinit()
	156	*/
	157	void
	158	vfs_subr_init(void)
	159	{
	160	/*
	161	* Desired vnodes is a result of the physical page count
	162	* and the size of kernel's heap. It scales in proportion
	163	* to the amount of available physical memory. This can
	164	* cause trouble on 64-bit and large memory platforms.
	165	*/
	166	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	167	desiredvnodes =
	168	min(maxproc + vmstats.v_page_count /4,
	169	2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
	170	(5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	171
	172	lwkt_token_init(&spechash_token);
	173	}
	174
	175	/*
	176	* Knob to control the precision of file timestamps:
	177	*
	178	* 0 = seconds only; nanoseconds zeroed.
	179	* 1 = seconds and nanoseconds, accurate within 1/HZ.
	180	* 2 = seconds and nanoseconds, truncated to microseconds.
	181	* >=3 = seconds and nanoseconds, maximum precision.
	182	*/
	183	enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
	184
	185	static int timestamp_precision = TSP_SEC;
	186	SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	187	&timestamp_precision, 0, "");
	188
	189	/*
	190	* Get a current timestamp.
	191	*/
	192	void
	193	vfs_timestamp(struct timespec *tsp)
	194	{
	195	struct timeval tv;
	196
	197	switch (timestamp_precision) {
	198	case TSP_SEC:
	199	tsp->tv_sec = time_second;
	200	tsp->tv_nsec = 0;
	201	break;
	202	case TSP_HZ:
	203	getnanotime(tsp);
	204	break;
	205	case TSP_USEC:
	206	microtime(&tv);
	207	TIMEVAL_TO_TIMESPEC(&tv, tsp);
	208	break;
	209	case TSP_NSEC:
	210	default:
	211	nanotime(tsp);
	212	break;
	213	}
	214	}
	215
	216	/*
	217	* Set vnode attributes to VNOVAL
	218	*/
	219	void
	220	vattr_null(struct vattr *vap)
	221	{
	222	vap->va_type = VNON;
	223	vap->va_size = VNOVAL;
	224	vap->va_bytes = VNOVAL;
	225	vap->va_mode = VNOVAL;
	226	vap->va_nlink = VNOVAL;
	227	vap->va_uid = VNOVAL;
	228	vap->va_gid = VNOVAL;
	229	vap->va_fsid = VNOVAL;
	230	vap->va_fileid = VNOVAL;
	231	vap->va_blocksize = VNOVAL;
	232	vap->va_rdev = VNOVAL;
	233	vap->va_atime.tv_sec = VNOVAL;
	234	vap->va_atime.tv_nsec = VNOVAL;
	235	vap->va_mtime.tv_sec = VNOVAL;
	236	vap->va_mtime.tv_nsec = VNOVAL;
	237	vap->va_ctime.tv_sec = VNOVAL;
	238	vap->va_ctime.tv_nsec = VNOVAL;
	239	vap->va_flags = VNOVAL;
	240	vap->va_gen = VNOVAL;
	241	vap->va_vaflags = 0;
	242	}
	243
	244	/*
	245	* Update outstanding I/O count and do wakeup if requested.
	246	*/
	247	void
	248	vwakeup(struct buf *bp)
	249	{
	250	struct vnode *vp;
	251
	252	bp->b_flags &= ~B_WRITEINPROG;
	253	if ((vp = bp->b_vp)) {
	254	vp->v_numoutput--;
	255	if (vp->v_numoutput < 0)
	256	panic("vwakeup: neg numoutput");
	257	if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
	258	vp->v_flag &= ~VBWAIT;
	259	wakeup((caddr_t) &vp->v_numoutput);
	260	}
	261	}
	262	}
	263
	264	/*
	265	* Flush out and invalidate all buffers associated with a vnode.
	266	*
	267	* vp must be locked.
	268	*/
	269	int
	270	vinvalbuf(struct vnode vp, int flags, struct thread td,
	271	int slpflag, int slptimeo)
	272	{
	273	struct buf *bp;
	274	struct buf nbp, blist;
	275	int s, error;
	276	vm_object_t object;
	277
	278	if (flags & V_SAVE) {
	279	s = splbio();
	280	while (vp->v_numoutput) {
	281	vp->v_flag \|= VBWAIT;
	282	error = tsleep((caddr_t)&vp->v_numoutput,
	283	slpflag, "vinvlbuf", slptimeo);
	284	if (error) {
	285	splx(s);
	286	return (error);
	287	}
	288	}
	289	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	290	splx(s);
	291	if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
	292	return (error);
	293	s = splbio();
	294	if (vp->v_numoutput > 0 \|\|
	295	!TAILQ_EMPTY(&vp->v_dirtyblkhd))
	296	panic("vinvalbuf: dirty bufs");
	297	}
	298	splx(s);
	299	}
	300	s = splbio();
	301	for (;;) {
	302	blist = TAILQ_FIRST(&vp->v_cleanblkhd);
	303	if (!blist)
	304	blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
	305	if (!blist)
	306	break;
	307
	308	for (bp = blist; bp; bp = nbp) {
	309	nbp = TAILQ_NEXT(bp, b_vnbufs);
	310	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	311	error = BUF_TIMELOCK(bp,
	312	LK_EXCLUSIVE \| LK_SLEEPFAIL,
	313	"vinvalbuf", slpflag, slptimeo);
	314	if (error == ENOLCK)
	315	break;
	316	splx(s);
	317	return (error);
	318	}
	319	/*
	320	* XXX Since there are no node locks for NFS, I
	321	* believe there is a slight chance that a delayed
	322	* write will occur while sleeping just above, so
	323	* check for it. Note that vfs_bio_awrite expects
	324	* buffers to reside on a queue, while VOP_BWRITE and
	325	* brelse do not.
	326	*/
	327	if (((bp->b_flags & (B_DELWRI \| B_INVAL)) == B_DELWRI) &&
	328	(flags & V_SAVE)) {
	329
	330	if (bp->b_vp == vp) {
	331	if (bp->b_flags & B_CLUSTEROK) {
	332	BUF_UNLOCK(bp);
	333	vfs_bio_awrite(bp);
	334	} else {
	335	bremfree(bp);
	336	bp->b_flags \|= B_ASYNC;
	337	VOP_BWRITE(bp->b_vp, bp);
	338	}
	339	} else {
	340	bremfree(bp);
	341	(void) VOP_BWRITE(bp->b_vp, bp);
	342	}
	343	break;
	344	}
	345	bremfree(bp);
	346	bp->b_flags \|= (B_INVAL \| B_NOCACHE \| B_RELBUF);
	347	bp->b_flags &= ~B_ASYNC;
	348	brelse(bp);
	349	}
	350	}
	351
	352	/*
	353	* Wait for I/O to complete. XXX needs cleaning up. The vnode can
	354	* have write I/O in-progress but if there is a VM object then the
	355	* VM object can also have read-I/O in-progress.
	356	*/
	357	do {
	358	while (vp->v_numoutput > 0) {
	359	vp->v_flag \|= VBWAIT;
	360	tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
	361	}
	362	if (VOP_GETVOBJECT(vp, &object) == 0) {
	363	while (object->paging_in_progress)
	364	vm_object_pip_sleep(object, "vnvlbx");
	365	}
	366	} while (vp->v_numoutput > 0);
	367
	368	splx(s);
	369
	370	/*
	371	* Destroy the copy in the VM cache, too.
	372	*/
	373	if (VOP_GETVOBJECT(vp, &object) == 0) {
	374	vm_object_page_remove(object, 0, 0,
	375	(flags & V_SAVE) ? TRUE : FALSE);
	376	}
	377
	378	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) \|\| !TAILQ_EMPTY(&vp->v_cleanblkhd))
	379	panic("vinvalbuf: flush failed");
	380	return (0);
	381	}
	382
	383	/*
	384	* Truncate a file's buffer and pages to a specified length. This
	385	* is in lieu of the old vinvalbuf mechanism, which performed unneeded
	386	* sync activity.
	387	*
	388	* The vnode must be locked.
	389	*/
	390	int
	391	vtruncbuf(struct vnode vp, struct thread td, off_t length, int blksize)
	392	{
	393	struct buf *bp;
	394	struct buf *nbp;
	395	int s, anyfreed;
	396	int trunclbn;
	397
	398	/*
	399	* Round up to the next lbn.
	400	*/
	401	trunclbn = (length + blksize - 1) / blksize;
	402
	403	s = splbio();
	404	restart:
	405	anyfreed = 1;
	406	for (;anyfreed;) {
	407	anyfreed = 0;
	408	for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
	409	nbp = TAILQ_NEXT(bp, b_vnbufs);
	410	if (bp->b_lblkno >= trunclbn) {
	411	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	412	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	413	goto restart;
	414	} else {
	415	bremfree(bp);
	416	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	417	bp->b_flags &= ~B_ASYNC;
	418	brelse(bp);
	419	anyfreed = 1;
	420	}
	421	if (nbp &&
	422	(((nbp->b_xflags & BX_VNCLEAN) == 0) \|\|
	423	(nbp->b_vp != vp) \|\|
	424	(nbp->b_flags & B_DELWRI))) {
	425	goto restart;
	426	}
	427	}
	428	}
	429
	430	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	431	nbp = TAILQ_NEXT(bp, b_vnbufs);
	432	if (bp->b_lblkno >= trunclbn) {
	433	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	434	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	435	goto restart;
	436	} else {
	437	bremfree(bp);
	438	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	439	bp->b_flags &= ~B_ASYNC;
	440	brelse(bp);
	441	anyfreed = 1;
	442	}
	443	if (nbp &&
	444	(((nbp->b_xflags & BX_VNDIRTY) == 0) \|\|
	445	(nbp->b_vp != vp) \|\|
	446	(nbp->b_flags & B_DELWRI) == 0)) {
	447	goto restart;
	448	}
	449	}
	450	}
	451	}
	452
	453	if (length > 0) {
	454	restartsync:
	455	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	456	nbp = TAILQ_NEXT(bp, b_vnbufs);
	457	if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
	458	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	459	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	460	goto restart;
	461	} else {
	462	bremfree(bp);
	463	if (bp->b_vp == vp) {
	464	bp->b_flags \|= B_ASYNC;
	465	} else {
	466	bp->b_flags &= ~B_ASYNC;
	467	}
	468	VOP_BWRITE(bp->b_vp, bp);
	469	}
	470	goto restartsync;
	471	}
	472
	473	}
	474	}
	475
	476	while (vp->v_numoutput > 0) {
	477	vp->v_flag \|= VBWAIT;
	478	tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	479	}
	480
	481	splx(s);
	482
	483	vnode_pager_setsize(vp, length);
	484
	485	return (0);
	486	}
	487
	488	/*
	489	* Associate a buffer with a vnode.
	490	*/
	491	void
	492	bgetvp(struct vnode vp, struct buf bp)
	493	{
	494	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	495
	496	vhold(vp);
	497	bp->b_vp = vp;
	498	bp->b_dev = vn_todev(vp);
	499	/*
	500	* Insert onto list for new vnode.
	501	*/
	502	crit_enter();
	503	bp->b_xflags \|= BX_VNCLEAN;
	504	bp->b_xflags &= ~BX_VNDIRTY;
	505	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	506	crit_exit();
	507	}
	508
	509	/*
	510	* Disassociate a buffer from a vnode.
	511	*/
	512	void
	513	brelvp(struct buf *bp)
	514	{
	515	struct vnode *vp;
	516	struct buflists *listheadp;
	517
	518	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
	519
	520	/*
	521	* Delete from old vnode list, if on one.
	522	*/
	523	vp = bp->b_vp;
	524	crit_enter();
	525	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	526	if (bp->b_xflags & BX_VNDIRTY)
	527	listheadp = &vp->v_dirtyblkhd;
	528	else
	529	listheadp = &vp->v_cleanblkhd;
	530	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	531	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	532	}
	533	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	534	vp->v_flag &= ~VONWORKLST;
	535	LIST_REMOVE(vp, v_synclist);
	536	}
	537	crit_exit();
	538	bp->b_vp = NULL;
	539	vdrop(vp);
	540	}
	541
	542	/*
	543	* Associate a p-buffer with a vnode.
	544	*
	545	* Also sets B_PAGING flag to indicate that vnode is not fully associated
	546	* with the buffer. i.e. the bp has not been linked into the vnode or
	547	* ref-counted.
	548	*/
	549	void
	550	pbgetvp(struct vnode vp, struct buf bp)
	551	{
	552	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
	553
	554	bp->b_vp = vp;
	555	bp->b_flags \|= B_PAGING;
	556	bp->b_dev = vn_todev(vp);
	557	}
	558
	559	/*
	560	* Disassociate a p-buffer from a vnode.
	561	*/
	562	void
	563	pbrelvp(struct buf *bp)
	564	{
	565	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
	566
	567	/* XXX REMOVE ME */
	568	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
	569	panic(
	570	"relpbuf(): b_vp was probably reassignbuf()d %p %x",
	571	bp,
	572	(int)bp->b_flags
	573	);
	574	}
	575	bp->b_vp = (struct vnode *) 0;
	576	bp->b_flags &= ~B_PAGING;
	577	}
	578
	579	void
	580	pbreassignbuf(struct buf bp, struct vnode newvp)
	581	{
	582	if ((bp->b_flags & B_PAGING) == 0) {
	583	panic(
	584	"pbreassignbuf() on non phys bp %p",
	585	bp
	586	);
	587	}
	588	bp->b_vp = newvp;
	589	}
	590
	591	/*
	592	* Reassign a buffer from one vnode to another.
	593	* Used to assign file specific control information
	594	* (indirect blocks) to the vnode to which they belong.
	595	*/
	596	void
	597	reassignbuf(struct buf bp, struct vnode newvp)
	598	{
	599	struct buflists *listheadp;
	600	int delay;
	601
	602	if (newvp == NULL) {
	603	printf("reassignbuf: NULL");
	604	return;
	605	}
	606	++reassignbufcalls;
	607
	608	/*
	609	* B_PAGING flagged buffers cannot be reassigned because their vp
	610	* is not fully linked in.
	611	*/
	612	if (bp->b_flags & B_PAGING)
	613	panic("cannot reassign paging buffer");
	614
	615	crit_enter();
	616	/*
	617	* Delete from old vnode list, if on one.
	618	*/
	619	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	620	if (bp->b_xflags & BX_VNDIRTY)
	621	listheadp = &bp->b_vp->v_dirtyblkhd;
	622	else
	623	listheadp = &bp->b_vp->v_cleanblkhd;
	624	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	625	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	626	if (bp->b_vp != newvp) {
	627	vdrop(bp->b_vp);
	628	bp->b_vp = NULL; /* for clarification */
	629	}
	630	}
	631	/*
	632	* If dirty, put on list of dirty buffers; otherwise insert onto list
	633	* of clean buffers.
	634	*/
	635	if (bp->b_flags & B_DELWRI) {
	636	struct buf *tbp;
	637
	638	listheadp = &newvp->v_dirtyblkhd;
	639	if ((newvp->v_flag & VONWORKLST) == 0) {
	640	switch (newvp->v_type) {
	641	case VDIR:
	642	delay = dirdelay;
	643	break;
	644	case VCHR:
	645	case VBLK:
	646	if (newvp->v_rdev &&
	647	newvp->v_rdev->si_mountpoint != NULL) {
	648	delay = metadelay;
	649	break;
	650	}
	651	/* fall through */
	652	default:
	653	delay = filedelay;
	654	}
	655	vn_syncer_add_to_worklist(newvp, delay);
	656	}
	657	bp->b_xflags \|= BX_VNDIRTY;
	658	tbp = TAILQ_FIRST(listheadp);
	659	if (tbp == NULL \|\|
	660	bp->b_lblkno == 0 \|\|
	661	(bp->b_lblkno > 0 && tbp->b_lblkno < 0) \|\|
	662	(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
	663	TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
	664	++reassignbufsortgood;
	665	} else if (bp->b_lblkno < 0) {
	666	TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
	667	++reassignbufsortgood;
	668	} else if (reassignbufmethod == 1) {
	669	/*
	670	* New sorting algorithm, only handle sequential case,
	671	* otherwise append to end (but before metadata)
	672	*/
	673	if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
	674	(tbp->b_xflags & BX_VNDIRTY)) {
	675	/*
	676	* Found the best place to insert the buffer
	677	*/
	678	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	679	++reassignbufsortgood;
	680	} else {
	681	/*
	682	* Missed, append to end, but before meta-data.
	683	* We know that the head buffer in the list is
	684	* not meta-data due to prior conditionals.
	685	*
	686	* Indirect effects: NFS second stage write
	687	* tends to wind up here, giving maximum
	688	* distance between the unstable write and the
	689	* commit rpc.
	690	*/
	691	tbp = TAILQ_LAST(listheadp, buflists);
	692	while (tbp && tbp->b_lblkno < 0)
	693	tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
	694	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	695	++reassignbufsortbad;
	696	}
	697	} else {
	698	/*
	699	* Old sorting algorithm, scan queue and insert
	700	*/
	701	struct buf *ttbp;
	702	while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
	703	(ttbp->b_lblkno < bp->b_lblkno)) {
	704	++reassignbufloops;
	705	tbp = ttbp;
	706	}
	707	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	708	}
	709	} else {
	710	bp->b_xflags \|= BX_VNCLEAN;
	711	TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
	712	if ((newvp->v_flag & VONWORKLST) &&
	713	TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
	714	newvp->v_flag &= ~VONWORKLST;
	715	LIST_REMOVE(newvp, v_synclist);
	716	}
	717	}
	718	if (bp->b_vp != newvp) {
	719	bp->b_vp = newvp;
	720	vhold(bp->b_vp);
	721	}
	722	crit_exit();
	723	}
	724
	725	/*
	726	* Create a vnode for a block device.
	727	* Used for mounting the root file system.
	728	*/
	729	int
	730	bdevvp(dev_t dev, struct vnode **vpp)
	731	{
	732	struct vnode *vp;
	733	struct vnode *nvp;
	734	int error;
	735
	736	if (dev == NODEV) {
	737	*vpp = NULLVP;
	738	return (ENXIO);
	739	}
	740	error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops, &nvp, 0, 0);
	741	if (error) {
	742	*vpp = NULLVP;
	743	return (error);
	744	}
	745	vp = nvp;
	746	vp->v_type = VCHR;
	747	vp->v_udev = dev->si_udev;
	748	vx_unlock(vp);
	749	*vpp = vp;
	750	return (0);
	751	}
	752
	753	int
	754	v_associate_rdev(struct vnode *vp, dev_t dev)
	755	{
	756	lwkt_tokref ilock;
	757
	758	if (dev == NULL \|\| dev == NODEV)
	759	return(ENXIO);
	760	if (dev_is_good(dev) == 0)
	761	return(ENXIO);
	762	KKASSERT(vp->v_rdev == NULL);
	763	if (dev_ref_debug)
	764	printf("Z1");
	765	vp->v_rdev = reference_dev(dev);
	766	lwkt_gettoken(&ilock, &spechash_token);
	767	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
	768	lwkt_reltoken(&ilock);
	769	return(0);
	770	}
	771
	772	void
	773	v_release_rdev(struct vnode *vp)
	774	{
	775	lwkt_tokref ilock;
	776	dev_t dev;
	777
	778	if ((dev = vp->v_rdev) != NULL) {
	779	lwkt_gettoken(&ilock, &spechash_token);
	780	SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
	781	if (dev_ref_debug && vp->v_opencount != 0) {
	782	printf("releasing rdev with non-0 "
	783	"v_opencount(%d) (revoked?)\n",
	784	vp->v_opencount);
	785	}
	786	vp->v_rdev = NULL;
	787	vp->v_opencount = 0;
	788	release_dev(dev);
	789	lwkt_reltoken(&ilock);
	790	}
	791	}
	792
	793	/*
	794	* Add a vnode to the alias list hung off the dev_t. We only associate
	795	* the device number with the vnode. The actual device is not associated
	796	* until the vnode is opened (usually in spec_open()), and will be
	797	* disassociated on last close.
	798	*/
	799	void
	800	addaliasu(struct vnode *nvp, udev_t nvp_udev)
	801	{
	802	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	803	panic("addaliasu on non-special vnode");
	804	nvp->v_udev = nvp_udev;
	805	}
	806
	807	/*
	808	* Disassociate a vnode from its underlying filesystem.
	809	*
	810	* The vnode must be VX locked and refd
	811	*
	812	* If there are v_usecount references to the vnode other then ours we have
	813	* to VOP_CLOSE the vnode before we can deactivate and reclaim it.
	814	*/
	815	void
	816	vclean(struct vnode vp, int flags, struct thread td)
	817	{
	818	int active;
	819
	820	/*
	821	* If the vnode has already been reclaimed we have nothing to do.
	822	*/
	823	if (vp->v_flag & VRECLAIMED)
	824	return;
	825	vp->v_flag \|= VRECLAIMED;
	826
	827	/*
	828	* Scrap the vfs cache
	829	*/
	830	cache_inval_vp(vp, 0);
	831
	832	/*
	833	* Check to see if the vnode is in use. If so we have to reference it
	834	* before we clean it out so that its count cannot fall to zero and
	835	* generate a race against ourselves to recycle it.
	836	*/
	837	active = (vp->v_usecount > 1);
	838
	839	/*
	840	* Clean out any buffers associated with the vnode and destroy its
	841	* object, if it has one.
	842	*/
	843	vinvalbuf(vp, V_SAVE, td, 0, 0);
	844	VOP_DESTROYVOBJECT(vp);
	845
	846	/*
	847	* If purging an active vnode, it must be closed and
	848	* deactivated before being reclaimed. XXX
	849	*
	850	* Note that neither of these routines unlocks the vnode.
	851	*/
	852	if (active) {
	853	if (flags & DOCLOSE)
	854	VOP_CLOSE(vp, FNONBLOCK, td);
	855	}
	856
	857	/*
	858	* If the vnode has not be deactivated, deactivated it.
	859	*/
	860	if ((vp->v_flag & VINACTIVE) == 0) {
	861	vp->v_flag \|= VINACTIVE;
	862	VOP_INACTIVE(vp, td);
	863	}
	864
	865	/*
	866	* Reclaim the vnode.
	867	*/
	868	if (VOP_RECLAIM(vp, td))
	869	panic("vclean: cannot reclaim");
	870
	871	/*
	872	* Done with purge, notify sleepers of the grim news.
	873	*/
	874	vp->v_ops = &dead_vnode_vops;
	875	vn_pollgone(vp);
	876	vp->v_tag = VT_NON;
	877	}
	878
	879	/*
	880	* Eliminate all activity associated with the requested vnode
	881	* and with all vnodes aliased to the requested vnode.
	882	*
	883	* The vnode must be referenced and vx_lock()'d
	884	*
	885	* revoke { struct vnode *a_vp, int a_flags }
	886	*/
	887	int
	888	vop_stdrevoke(struct vop_revoke_args *ap)
	889	{
	890	struct vnode vp, vq;
	891	lwkt_tokref ilock;
	892	dev_t dev;
	893
	894	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
	895
	896	vp = ap->a_vp;
	897
	898	/*
	899	* If the vnode is already dead don't try to revoke it
	900	*/
	901	if (vp->v_flag & VRECLAIMED)
	902	return (0);
	903
	904	/*
	905	* If the vnode has a device association, scrap all vnodes associated
	906	* with the device. Don't let the device disappear on us while we
	907	* are scrapping the vnodes.
	908	*
	909	* The passed vp will probably show up in the list, do not VX lock
	910	* it twice!
	911	*/
	912	if (vp->v_type != VCHR && vp->v_type != VBLK)
	913	return(0);
	914	if ((dev = vp->v_rdev) == NULL) {
	915	if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
	916	return(0);
	917	}
	918	reference_dev(dev);
	919	lwkt_gettoken(&ilock, &spechash_token);
	920	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
	921	if (vp == vq \|\| vx_get(vq) == 0) {
	922	if (vq == SLIST_FIRST(&dev->si_hlist))
	923	vgone(vq);
	924	if (vp != vq)
	925	vx_put(vq);
	926	}
	927	}
	928	lwkt_reltoken(&ilock);
	929	release_dev(dev);
	930	return (0);
	931	}
	932
	933	/*
	934	* Recycle an unused vnode to the front of the free list.
	935	*
	936	* Returns 1 if we were successfully able to recycle the vnode,
	937	* 0 otherwise.
	938	*/
	939	int
	940	vrecycle(struct vnode vp, struct thread td)
	941	{
	942	if (vp->v_usecount == 1) {
	943	vgone(vp);
	944	return (1);
	945	}
	946	return (0);
	947	}
	948
	949	/*
	950	* Eliminate all activity associated with a vnode in preparation for reuse.
	951	*
	952	* The vnode must be VX locked and refd and will remain VX locked and refd
	953	* on return. This routine may be called with the vnode in any state, as
	954	* long as it is VX locked. The vnode will be cleaned out and marked
	955	* VRECLAIMED but will not actually be reused until all existing refs and
	956	* holds go away.
	957	*
	958	* NOTE: This routine may be called on a vnode which has not yet been
	959	* already been deactivated (VOP_INACTIVE), or on a vnode which has
	960	* already been reclaimed.
	961	*
	962	* This routine is not responsible for placing us back on the freelist.
	963	* Instead, it happens automatically when the caller releases the VX lock
	964	* (assuming there aren't any other references).
	965	*/
	966	void
	967	vgone(struct vnode *vp)
	968	{
	969	/*
	970	* assert that the VX lock is held. This is an absolute requirement
	971	* now for vgone() to be called.
	972	*/
	973	KKASSERT(vp->v_lock.lk_exclusivecount == 1);
	974
	975	/*
	976	* Clean out the filesystem specific data and set the VRECLAIMED
	977	* bit. Also deactivate the vnode if necessary.
	978	*/
	979	vclean(vp, DOCLOSE, curthread);
	980
	981	/*
	982	* Delete from old mount point vnode list, if on one.
	983	*/
	984	if (vp->v_mount != NULL)
	985	insmntque(vp, NULL);
	986
	987	/*
	988	* If special device, remove it from special device alias list
	989	* if it is on one. This should normally only occur if a vnode is
	990	* being revoked as the device should otherwise have been released
	991	* naturally.
	992	*/
	993	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_rdev != NULL) {
	994	v_release_rdev(vp);
	995	}
	996
	997	/*
	998	* Set us to VBAD
	999	*/
	1000	vp->v_type = VBAD;
	1001	}
	1002
	1003	/*
	1004	* Lookup a vnode by device number.
	1005	*/
	1006	int
	1007	vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
	1008	{
	1009	lwkt_tokref ilock;
	1010	struct vnode *vp;
	1011
	1012	lwkt_gettoken(&ilock, &spechash_token);
	1013	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	1014	if (type == vp->v_type) {
	1015	*vpp = vp;
	1016	lwkt_reltoken(&ilock);
	1017	return (1);
	1018	}
	1019	}
	1020	lwkt_reltoken(&ilock);
	1021	return (0);
	1022	}
	1023
	1024	/*
	1025	* Calculate the total number of references to a special device. This
	1026	* routine may only be called for VBLK and VCHR vnodes since v_rdev is
	1027	* an overloaded field. Since udev2dev can now return NODEV, we have
	1028	* to check for a NULL v_rdev.
	1029	*/
	1030	int
	1031	count_dev(dev_t dev)
	1032	{
	1033	lwkt_tokref ilock;
	1034	struct vnode *vp;
	1035	int count = 0;
	1036
	1037	if (SLIST_FIRST(&dev->si_hlist)) {
	1038	lwkt_gettoken(&ilock, &spechash_token);
	1039	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	1040	count += vp->v_usecount;
	1041	}
	1042	lwkt_reltoken(&ilock);
	1043	}
	1044	return(count);
	1045	}
	1046
	1047	int
	1048	count_udev(udev_t udev)
	1049	{
	1050	dev_t dev;
	1051
	1052	if ((dev = udev2dev(udev, 0)) == NODEV)
	1053	return(0);
	1054	return(count_dev(dev));
	1055	}
	1056
	1057	int
	1058	vcount(struct vnode *vp)
	1059	{
	1060	if (vp->v_rdev == NULL)
	1061	return(0);
	1062	return(count_dev(vp->v_rdev));
	1063	}
	1064
	1065	/*
	1066	* Print out a description of a vnode.
	1067	*/
	1068	static char *typename[] =
	1069	{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
	1070
	1071	void
	1072	vprint(char label, struct vnode vp)
	1073	{
	1074	char buf[96];
	1075
	1076	if (label != NULL)
	1077	printf("%s: %p: ", label, (void *)vp);
	1078	else
	1079	printf("%p: ", (void *)vp);
	1080	printf("type %s, usecount %d, writecount %d, refcount %d,",
	1081	typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	1082	vp->v_holdcnt);
	1083	buf[0] = '\0';
	1084	if (vp->v_flag & VROOT)
	1085	strcat(buf, "\|VROOT");
	1086	if (vp->v_flag & VTEXT)
	1087	strcat(buf, "\|VTEXT");
	1088	if (vp->v_flag & VSYSTEM)
	1089	strcat(buf, "\|VSYSTEM");
	1090	if (vp->v_flag & VBWAIT)
	1091	strcat(buf, "\|VBWAIT");
	1092	if (vp->v_flag & VFREE)
	1093	strcat(buf, "\|VFREE");
	1094	if (vp->v_flag & VOBJBUF)
	1095	strcat(buf, "\|VOBJBUF");
	1096	if (buf[0] != '\0')
	1097	printf(" flags (%s)", &buf[1]);
	1098	if (vp->v_data == NULL) {
	1099	printf("\n");
	1100	} else {
	1101	printf("\n\t");
	1102	VOP_PRINT(vp);
	1103	}
	1104	}
	1105
	#ifdef DDB
	#include <ddb/ddb.h>
	/*
	 * List all of the locked vnodes in the system.
	 * Called when debugging the kernel.
	 *
	 * Walks the mount list under mountlist_token.  Mounts that cannot be
	 * busied without waiting are skipped; on the others every vnode for
	 * which VOP_ISLOCKED() is true is printed with vprint().
	 */
	DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
	{
		struct thread *td = curthread;	/* XXX */
		lwkt_tokref ilock;
		struct mount *mp, *nmp;
		struct vnode *vp;

		printf("Locked vnodes\n");
		lwkt_gettoken(&ilock, &mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
				if (VOP_ISLOCKED(vp, NULL))
					vprint((char *)0, vp);
			}
			/* Fetch the next mount before this one is unbusied. */
			lwkt_gettokref(&ilock);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		lwkt_reltoken(&ilock);
	}
	#endif
	1137
	1138	/*
	1139	* Top level filesystem related information gathering.
	1140	*/
	1141	static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
	1142
	1143	static int
	1144	vfs_sysctl(SYSCTL_HANDLER_ARGS)
	1145	{
	1146	int name = (int )arg1 - 1; /* XXX */
	1147	u_int namelen = arg2 + 1; /* XXX */
	1148	struct vfsconf *vfsp;
	1149
	1150	#if 1 \|\| defined(COMPAT_PRELITE2)
	1151	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	1152	if (namelen == 1)
	1153	return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
	1154	#endif
	1155
	1156	#ifdef notyet
	1157	/* all sysctl names at this level are at least name and field */
	1158	if (namelen < 2)
	1159	return (ENOTDIR); /* overloaded */
	1160	if (name[0] != VFS_GENERIC) {
	1161	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	1162	if (vfsp->vfc_typenum == name[0])
	1163	break;
	1164	if (vfsp == NULL)
	1165	return (EOPNOTSUPP);
	1166	return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
	1167	oldp, oldlenp, newp, newlen, p));
	1168	}
	1169	#endif
	1170	switch (name[1]) {
	1171	case VFS_MAXTYPENUM:
	1172	if (namelen != 2)
	1173	return (ENOTDIR);
	1174	return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	1175	case VFS_CONF:
	1176	if (namelen != 3)
	1177	return (ENOTDIR); /* overloaded */
	1178	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	1179	if (vfsp->vfc_typenum == name[2])
	1180	break;
	1181	if (vfsp == NULL)
	1182	return (EOPNOTSUPP);
	1183	return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	1184	}
	1185	return (EOPNOTSUPP);
	1186	}
	1187
	1188	SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	1189	"Generic filesystem");
	1190
	1191	#if 1 \|\| defined(COMPAT_PRELITE2)
	1192
	1193	static int
	1194	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
	1195	{
	1196	int error;
	1197	struct vfsconf *vfsp;
	1198	struct ovfsconf ovfs;
	1199
	1200	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	1201	ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
	1202	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	1203	ovfs.vfc_index = vfsp->vfc_typenum;
	1204	ovfs.vfc_refcount = vfsp->vfc_refcount;
	1205	ovfs.vfc_flags = vfsp->vfc_flags;
	1206	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	1207	if (error)
	1208	return error;
	1209	}
	1210	return 0;
	1211	}
	1212
	1213	#endif /* 1 \|\| COMPAT_PRELITE2 */
	1214
	#if 0
	#define KINFO_VNODESLOP	10
	/*
	 * Dump vnode list (via sysctl).
	 * Copyout address of vnode followed by vnode.
	 *
	 * This code is compiled out.  When req->oldptr is not set only a size
	 * estimate is reported.  Returns 0 or the first error from
	 * SYSCTL_OUT().
	 */
	/* ARGSUSED */
	static int
	sysctl_vnode(SYSCTL_HANDLER_ARGS)
	{
		struct proc *p = curproc;	/* XXX */
		struct mount *mp, *nmp;
		struct vnode *nvp, *vp;
		lwkt_tokref ilock;
		lwkt_tokref jlock;
		int error;

	#define VPTRSZ	sizeof (struct vnode *)
	#define VNODESZ	sizeof (struct vnode)

		req->lock = 0;
		if (!req->oldptr) /* Make an estimate */
			return (SYSCTL_OUT(req, 0,
				(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

		lwkt_gettoken(&ilock, &mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			lwkt_gettoken(&jlock, &mntvnode_token);
	again:
			for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
			     vp != NULL;
			     vp = nvp) {
				/*
				 * Check that the vp is still associated with
				 * this filesystem.  RACE: could have been
				 * recycled onto the same filesystem.
				 */
				if (vp->v_mount != mp)
					goto again;
				nvp = TAILQ_NEXT(vp, v_nmntvnodes);
				if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
				    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
					lwkt_reltoken(&jlock);
					/*
					 * The mount was busied above; release
					 * it on the error path as well so it is
					 * not left busy.
					 */
					vfs_unbusy(mp, p);
					return (error);
				}
			}
			lwkt_reltoken(&jlock);
			lwkt_gettokref(&ilock);
			nmp = TAILQ_NEXT(mp, mnt_list);	/* ZZZ */
			vfs_unbusy(mp, p);
		}
		lwkt_reltoken(&ilock);

		return (0);
	}
	#endif
	1275
	/*
	 * XXX
	 * Exporting the vnode list on large systems causes them to crash.
	 * Exporting the vnode list on medium systems causes sysctl to coredump.
	 *
	 * The registration below is therefore compiled out.
	 */
	#if 0
	SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
		0, 0, sysctl_vnode, "S,vnode", "");
	#endif
	1285
	1286	/*
	1287	* Check to see if a filesystem is mounted on a block device.
	1288	*/
	1289	int
	1290	vfs_mountedon(struct vnode *vp)
	1291	{
	1292	dev_t dev;
	1293
	1294	if ((dev = vp->v_rdev) == NULL)
	1295	dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	1296	if (dev != NODEV && dev->si_mountpoint)
	1297	return (EBUSY);
	1298	return (0);
	1299	}
	1300
	1301	/*
	1302	* Unmount all filesystems. The list is traversed in reverse order
	1303	* of mounting to avoid dependencies.
	1304	*/
	1305	void
	1306	vfs_unmountall(void)
	1307	{
	1308	struct mount *mp;
	1309	struct thread *td = curthread;
	1310	int error;
	1311
	1312	if (td->td_proc == NULL)
	1313	td = initproc->p_thread; /* XXX XXX use proc0 instead? */
	1314
	1315	/*
	1316	* Since this only runs when rebooting, it is not interlocked.
	1317	*/
	1318	while(!TAILQ_EMPTY(&mountlist)) {
	1319	mp = TAILQ_LAST(&mountlist, mntlist);
	1320	error = dounmount(mp, MNT_FORCE, td);
	1321	if (error) {
	1322	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	1323	printf("unmount of %s failed (",
	1324	mp->mnt_stat.f_mntonname);
	1325	if (error == EBUSY)
	1326	printf("BUSY)\n");
	1327	else
	1328	printf("%d)\n", error);
	1329	} else {
	1330	/* The unmount has removed mp from the mountlist */
	1331	}
	1332	}
	1333	}
	1334
	1335	/*
	1336	* Build hash lists of net addresses and hang them off the mount point.
	1337	* Called by ufs_mount() to set up the lists of export addresses.
	1338	*/
	1339	static int
	1340	vfs_hang_addrlist(struct mount mp, struct netexport nep,
	1341	struct export_args *argp)
	1342	{
	1343	struct netcred *np;
	1344	struct radix_node_head *rnh;
	1345	int i;
	1346	struct radix_node *rn;
	1347	struct sockaddr saddr, smask = 0;
	1348	struct domain *dom;
	1349	int error;
	1350
	1351	if (argp->ex_addrlen == 0) {
	1352	if (mp->mnt_flag & MNT_DEFEXPORTED)
	1353	return (EPERM);
	1354	np = &nep->ne_defexported;
	1355	np->netc_exflags = argp->ex_flags;
	1356	np->netc_anon = argp->ex_anon;
	1357	np->netc_anon.cr_ref = 1;
	1358	mp->mnt_flag \|= MNT_DEFEXPORTED;
	1359	return (0);
	1360	}
	1361
	1362	if (argp->ex_addrlen < 0 \|\| argp->ex_addrlen > MLEN)
	1363	return (EINVAL);
	1364	if (argp->ex_masklen < 0 \|\| argp->ex_masklen > MLEN)
	1365	return (EINVAL);
	1366
	1367	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	1368	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	1369	bzero((caddr_t) np, i);
	1370	saddr = (struct sockaddr *) (np + 1);
	1371	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
	1372	goto out;
	1373	if (saddr->sa_len > argp->ex_addrlen)
	1374	saddr->sa_len = argp->ex_addrlen;
	1375	if (argp->ex_masklen) {
	1376	smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
	1377	error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
	1378	if (error)
	1379	goto out;
	1380	if (smask->sa_len > argp->ex_masklen)
	1381	smask->sa_len = argp->ex_masklen;
	1382	}
	1383	i = saddr->sa_family;
	1384	if ((rnh = nep->ne_rtable[i]) == 0) {
	1385	/*
	1386	* Seems silly to initialize every AF when most are not used,
	1387	* do so on demand here
	1388	*/
	1389	for (dom = domains; dom; dom = dom->dom_next)
	1390	if (dom->dom_family == i && dom->dom_rtattach) {
	1391	dom->dom_rtattach((void **) &nep->ne_rtable[i],
	1392	dom->dom_rtoffset);
	1393	break;
	1394	}
	1395	if ((rnh = nep->ne_rtable[i]) == 0) {
	1396	error = ENOBUFS;
	1397	goto out;
	1398	}
	1399	}
	1400	rn = (rnh->rnh_addaddr) ((char ) saddr, (char *) smask, rnh,
	1401	np->netc_rnodes);
	1402	if (rn == 0 \|\| np != (struct netcred ) rn) { / already exists */
	1403	error = EPERM;
	1404	goto out;
	1405	}
	1406	np->netc_exflags = argp->ex_flags;
	1407	np->netc_anon = argp->ex_anon;
	1408	np->netc_anon.cr_ref = 1;
	1409	return (0);
	1410	out:
	1411	free(np, M_NETADDR);
	1412	return (error);
	1413	}
	1414
	1415	/* ARGSUSED */
	1416	static int
	1417	vfs_free_netcred(struct radix_node rn, void w)
	1418	{
	1419	struct radix_node_head rnh = (struct radix_node_head ) w;
	1420
	1421	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	1422	free((caddr_t) rn, M_NETADDR);
	1423	return (0);
	1424	}
	1425
	1426	/*
	1427	* Free the net address hash lists that are hanging off the mount points.
	1428	*/
	1429	static void
	1430	vfs_free_addrlist(struct netexport *nep)
	1431	{
	1432	int i;
	1433	struct radix_node_head *rnh;
	1434
	1435	for (i = 0; i <= AF_MAX; i++)
	1436	if ((rnh = nep->ne_rtable[i])) {
	1437	(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
	1438	(caddr_t) rnh);
	1439	free((caddr_t) rnh, M_RTABLE);
	1440	nep->ne_rtable[i] = 0;
	1441	}
	1442	}
	1443
	1444	int
	1445	vfs_export(struct mount mp, struct netexport nep, struct export_args *argp)
	1446	{
	1447	int error;
	1448
	1449	if (argp->ex_flags & MNT_DELEXPORT) {
	1450	if (mp->mnt_flag & MNT_EXPUBLIC) {
	1451	vfs_setpublicfs(NULL, NULL, NULL);
	1452	mp->mnt_flag &= ~MNT_EXPUBLIC;
	1453	}
	1454	vfs_free_addrlist(nep);
	1455	mp->mnt_flag &= ~(MNT_EXPORTED \| MNT_DEFEXPORTED);
	1456	}
	1457	if (argp->ex_flags & MNT_EXPORTED) {
	1458	if (argp->ex_flags & MNT_EXPUBLIC) {
	1459	if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
	1460	return (error);
	1461	mp->mnt_flag \|= MNT_EXPUBLIC;
	1462	}
	1463	if ((error = vfs_hang_addrlist(mp, nep, argp)))
	1464	return (error);
	1465	mp->mnt_flag \|= MNT_EXPORTED;
	1466	}
	1467	return (0);
	1468	}
	1469
	1470
	1471	/*
	1472	* Set the publicly exported filesystem (WebNFS). Currently, only
	1473	* one public filesystem is possible in the spec (RFC 2054 and 2055)
	1474	*/
	1475	int
	1476	vfs_setpublicfs(struct mount mp, struct netexport nep,
	1477	struct export_args *argp)
	1478	{
	1479	int error;
	1480	struct vnode *rvp;
	1481	char *cp;
	1482
	1483	/*
	1484	* mp == NULL -> invalidate the current info, the FS is
	1485	* no longer exported. May be called from either vfs_export
	1486	* or unmount, so check if it hasn't already been done.
	1487	*/
	1488	if (mp == NULL) {
	1489	if (nfs_pub.np_valid) {
	1490	nfs_pub.np_valid = 0;
	1491	if (nfs_pub.np_index != NULL) {
	1492	FREE(nfs_pub.np_index, M_TEMP);
	1493	nfs_pub.np_index = NULL;
	1494	}
	1495	}
	1496	return (0);
	1497	}
	1498
	1499	/*
	1500	* Only one allowed at a time.
	1501	*/
	1502	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
	1503	return (EBUSY);
	1504
	1505	/*
	1506	* Get real filehandle for root of exported FS.
	1507	*/
	1508	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	1509	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
	1510
	1511	if ((error = VFS_ROOT(mp, &rvp)))
	1512	return (error);
	1513
	1514	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
	1515	return (error);
	1516
	1517	vput(rvp);
	1518
	1519	/*
	1520	* If an indexfile was specified, pull it in.
	1521	*/
	1522	if (argp->ex_indexfile != NULL) {
	1523	MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
	1524	M_WAITOK);
	1525	error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
	1526	MAXNAMLEN, (size_t *)0);
	1527	if (!error) {
	1528	/*
	1529	* Check for illegal filenames.
	1530	*/
	1531	for (cp = nfs_pub.np_index; *cp; cp++) {
	1532	if (*cp == '/') {
	1533	error = EINVAL;
	1534	break;
	1535	}
	1536	}
	1537	}
	1538	if (error) {
	1539	FREE(nfs_pub.np_index, M_TEMP);
	1540	return (error);
	1541	}
	1542	}
	1543
	1544	nfs_pub.np_mount = mp;
	1545	nfs_pub.np_valid = 1;
	1546	return (0);
	1547	}
	1548
	1549	struct netcred *
	1550	vfs_export_lookup(struct mount mp, struct netexport nep,
	1551	struct sockaddr *nam)
	1552	{
	1553	struct netcred *np;
	1554	struct radix_node_head *rnh;
	1555	struct sockaddr *saddr;
	1556
	1557	np = NULL;
	1558	if (mp->mnt_flag & MNT_EXPORTED) {
	1559	/*
	1560	* Lookup in the export list first.
	1561	*/
	1562	if (nam != NULL) {
	1563	saddr = nam;
	1564	rnh = nep->ne_rtable[saddr->sa_family];
	1565	if (rnh != NULL) {
	1566	np = (struct netcred *)
	1567	(rnh->rnh_matchaddr)((char )saddr,
	1568	rnh);
	1569	if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
	1570	np = NULL;
	1571	}
	1572	}
	1573	/*
	1574	* If no address match, use the default if it exists.
	1575	*/
	1576	if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
	1577	np = &nep->ne_defexported;
	1578	}
	1579	return (np);
	1580	}
	1581
	1582	/*
	1583	* perform msync on all vnodes under a mount point. The mount point must
	1584	* be locked. This code is also responsible for lazy-freeing unreferenced
	1585	* vnodes whos VM objects no longer contain pages.
	1586	*
	1587	* NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
	1588	*/
	1589	static int vfs_msync_scan1(struct mount mp, struct vnode vp, void *data);
	1590	static int vfs_msync_scan2(struct mount mp, struct vnode vp, void *data);
	1591
	1592	void
	1593	vfs_msync(struct mount *mp, int flags)
	1594	{
	1595	vmntvnodescan(mp, VMSC_REFVP, vfs_msync_scan1, vfs_msync_scan2,
	1596	(void *)flags);
	1597	}
	1598
	1599	/*
	1600	* scan1 is a fast pre-check. There could be hundreds of thousands of
	1601	* vnodes, we cannot afford to do anything heavy weight until we have a
	1602	* fairly good indication that there is work to do.
	1603	*/
	1604	static
	1605	int
	1606	vfs_msync_scan1(struct mount mp, struct vnode vp, void *data)
	1607	{
	1608	int flags = (int)data;
	1609
	1610	if ((vp->v_flag & VRECLAIMED) == 0) {
	1611	if (vshouldfree(vp, 0))
	1612	return(0); /* call scan2 */
	1613	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	1614	(vp->v_flag & VOBJDIRTY) &&
	1615	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	1616	return(0); /* call scan2 */
	1617	}
	1618	}
	1619
	1620	/*
	1621	* do not call scan2, continue the loop
	1622	*/
	1623	return(-1);
	1624	}
	1625
	1626	static
	1627	int
	1628	vfs_msync_scan2(struct mount mp, struct vnode vp, void *data)
	1629	{
	1630	vm_object_t obj;
	1631	int flags = (int)data;
	1632
	1633	if (vp->v_flag & VRECLAIMED)
	1634	return(0);
	1635
	1636	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	1637	(vp->v_flag & VOBJDIRTY) &&
	1638	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	1639	if (VOP_GETVOBJECT(vp, &obj) == 0) {
	1640	vm_object_page_clean(obj, 0, 0,
	1641	flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
	1642	}
	1643	}
	1644	return(0);
	1645	}
	1646
	/*
	 * Create the VM object needed for VMIO and mmap support.  This
	 * is done for all VREG files in the system.  Some filesystems might
	 * afford the additional metadata buffering capability of the
	 * VMIO code by making the device node be VMIO mode also.
	 *
	 * vp must be locked when vfs_object_create is called.
	 *
	 * Thin wrapper: returns whatever VOP_CREATEVOBJECT(vp, td) returns.
	 */
	int
	vfs_object_create(struct vnode *vp, struct thread *td)
	{
		return (VOP_CREATEVOBJECT(vp, td));
	}
	1660
	1661	/*
	1662	* Record a process's interest in events which might happen to
	1663	* a vnode. Because poll uses the historic select-style interface
	1664	* internally, this routine serves as both the ``check for any
	1665	* pending events'' and the ``record my interest in future events''
	1666	* functions. (These are done together, while the lock is held,
	1667	* to avoid race conditions.)
	1668	*/
	1669	int
	1670	vn_pollrecord(struct vnode vp, struct thread td, int events)
	1671	{
	1672	lwkt_tokref ilock;
	1673
	1674	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1675	if (vp->v_pollinfo.vpi_revents & events) {
	1676	/*
	1677	* This leaves events we are not interested
	1678	* in available for the other process which
	1679	* which presumably had requested them
	1680	* (otherwise they would never have been
	1681	* recorded).
	1682	*/
	1683	events &= vp->v_pollinfo.vpi_revents;
	1684	vp->v_pollinfo.vpi_revents &= ~events;
	1685
	1686	lwkt_reltoken(&ilock);
	1687	return events;
	1688	}
	1689	vp->v_pollinfo.vpi_events \|= events;
	1690	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	1691	lwkt_reltoken(&ilock);
	1692	return 0;
	1693	}
	1694
	1695	/*
	1696	* Note the occurrence of an event. If the VN_POLLEVENT macro is used,
	1697	* it is possible for us to miss an event due to race conditions, but
	1698	* that condition is expected to be rare, so for the moment it is the
	1699	* preferred interface.
	1700	*/
	1701	void
	1702	vn_pollevent(struct vnode *vp, int events)
	1703	{
	1704	lwkt_tokref ilock;
	1705
	1706	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1707	if (vp->v_pollinfo.vpi_events & events) {
	1708	/*
	1709	* We clear vpi_events so that we don't
	1710	* call selwakeup() twice if two events are
	1711	* posted before the polling process(es) is
	1712	* awakened. This also ensures that we take at
	1713	* most one selwakeup() if the polling process
	1714	* is no longer interested. However, it does
	1715	* mean that only one event can be noticed at
	1716	* a time. (Perhaps we should only clear those
	1717	* event bits which we note?) XXX
	1718	*/
	1719	vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
	1720	vp->v_pollinfo.vpi_revents \|= events;
	1721	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	1722	}
	1723	lwkt_reltoken(&ilock);
	1724	}
	1725
	1726	/*
	1727	* Wake up anyone polling on vp because it is being revoked.
	1728	* This depends on dead_poll() returning POLLHUP for correct
	1729	* behavior.
	1730	*/
	1731	void
	1732	vn_pollgone(struct vnode *vp)
	1733	{
	1734	lwkt_tokref ilock;
	1735
	1736	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1737	if (vp->v_pollinfo.vpi_events) {
	1738	vp->v_pollinfo.vpi_events = 0;
	1739	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	1740	}
	1741	lwkt_reltoken(&ilock);
	1742	}
	1743
	1744	/*
	1745	* extract the dev_t from a VBLK or VCHR. The vnode must have been opened
	1746	* (or v_rdev might be NULL).
	1747	*/
	1748	dev_t
	1749	vn_todev(struct vnode *vp)
	1750	{
	1751	if (vp->v_type != VBLK && vp->v_type != VCHR)
	1752	return (NODEV);
	1753	KKASSERT(vp->v_rdev != NULL);
	1754	return (vp->v_rdev);
	1755	}
	1756
	1757	/*
	1758	* Check if vnode represents a disk device. The vnode does not need to be
	1759	* opened.
	1760	*/
	1761	int
	1762	vn_isdisk(struct vnode vp, int errp)
	1763	{
	1764	dev_t dev;
	1765
	1766	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	1767	if (errp != NULL)
	1768	*errp = ENOTBLK;
	1769	return (0);
	1770	}
	1771
	1772	if ((dev = vp->v_rdev) == NULL)
	1773	dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	1774	if (dev == NULL \|\| dev == NODEV) {
	1775	if (errp != NULL)
	1776	*errp = ENXIO;
	1777	return (0);
	1778	}
	1779	if (dev_is_good(dev) == 0) {
	1780	if (errp != NULL)
	1781	*errp = ENXIO;
	1782	return (0);
	1783	}
	1784	if ((dev_dflags(dev) & D_DISK) == 0) {
	1785	if (errp != NULL)
	1786	*errp = ENOTBLK;
	1787	return (0);
	1788	}
	1789	if (errp != NULL)
	1790	*errp = 0;
	1791	return (1);
	1792	}
	1793
	1794	#ifdef DEBUG_VFS_LOCKS
	1795
	1796	void
	1797	assert_vop_locked(struct vnode vp, const char str)
	1798	{
	1799	if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
	1800	panic("%s: %p is not locked shared but should be", str, vp);
	1801	}
	1802	}
	1803
	1804	void
	1805	assert_vop_unlocked(struct vnode vp, const char str)
	1806	{
	1807	if (vp && IS_LOCKING_VFS(vp)) {
	1808	if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
	1809	panic("%s: %p is locked but should not be", str, vp);
	1810	}
	1811	}
	1812	}
	1813
	1814	#endif