| 1 | /* |
| 2 | * Copyright (c) 1989, 1993 |
| 3 | * The Regents of the University of California. All rights reserved. |
| 4 | * (c) UNIX System Laboratories, Inc. |
| 5 | * All or some portions of this file are derived from material licensed |
| 6 | * to the University of California by American Telephone and Telegraph |
| 7 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
| 8 | * the permission of UNIX System Laboratories, Inc. |
| 9 | * |
| 10 | * Redistribution and use in source and binary forms, with or without |
| 11 | * modification, are permitted provided that the following conditions |
| 12 | * are met: |
| 13 | * 1. Redistributions of source code must retain the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer. |
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
| 16 | * notice, this list of conditions and the following disclaimer in the |
| 17 | * documentation and/or other materials provided with the distribution. |
| 18 | * 3. All advertising materials mentioning features or use of this software |
| 19 | * must display the following acknowledgement: |
| 20 | * This product includes software developed by the University of |
| 21 | * California, Berkeley and its contributors. |
| 22 | * 4. Neither the name of the University nor the names of its contributors |
| 23 | * may be used to endorse or promote products derived from this software |
| 24 | * without specific prior written permission. |
| 25 | * |
| 26 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 27 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 28 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 29 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 30 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 31 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 32 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 33 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 34 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 36 | * SUCH DAMAGE. |
| 37 | * |
| 38 | * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 |
| 39 | * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $ |
| 40 | * $DragonFly: src/sys/kern/vfs_subr.c,v 1.33 2004/06/15 00:30:53 dillon Exp $ |
| 41 | */ |
| 42 | |
| 43 | /* |
| 44 | * External virtual filesystem routines |
| 45 | */ |
| 46 | #include "opt_ddb.h" |
| 47 | |
| 48 | #include <sys/param.h> |
| 49 | #include <sys/systm.h> |
| 50 | #include <sys/buf.h> |
| 51 | #include <sys/conf.h> |
| 52 | #include <sys/dirent.h> |
| 53 | #include <sys/domain.h> |
| 54 | #include <sys/eventhandler.h> |
| 55 | #include <sys/fcntl.h> |
| 56 | #include <sys/kernel.h> |
| 57 | #include <sys/kthread.h> |
| 58 | #include <sys/malloc.h> |
| 59 | #include <sys/mbuf.h> |
| 60 | #include <sys/mount.h> |
| 61 | #include <sys/proc.h> |
| 62 | #include <sys/namei.h> |
| 63 | #include <sys/reboot.h> |
| 64 | #include <sys/socket.h> |
| 65 | #include <sys/stat.h> |
| 66 | #include <sys/sysctl.h> |
| 67 | #include <sys/syslog.h> |
| 68 | #include <sys/vmmeter.h> |
| 69 | #include <sys/vnode.h> |
| 70 | |
| 71 | #include <machine/limits.h> |
| 72 | |
| 73 | #include <vm/vm.h> |
| 74 | #include <vm/vm_object.h> |
| 75 | #include <vm/vm_extern.h> |
| 76 | #include <vm/vm_kern.h> |
| 77 | #include <vm/pmap.h> |
| 78 | #include <vm/vm_map.h> |
| 79 | #include <vm/vm_page.h> |
| 80 | #include <vm/vm_pager.h> |
| 81 | #include <vm/vnode_pager.h> |
| 82 | #include <vm/vm_zone.h> |
| 83 | |
| 84 | #include <sys/buf2.h> |
| 85 | #include <sys/thread2.h> |
| 86 | |
| 87 | static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); |
| 88 | |
| 89 | static void insmntque (struct vnode *vp, struct mount *mp); |
| 90 | static void vclean (struct vnode *vp, lwkt_tokref_t vlock, |
| 91 | int flags, struct thread *td); |
| 92 | |
| 93 | static unsigned long numvnodes; |
| 94 | SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); |
| 95 | |
| 96 | enum vtype iftovt_tab[16] = { |
| 97 | VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, |
| 98 | VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, |
| 99 | }; |
| 100 | int vttoif_tab[9] = { |
| 101 | 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, |
| 102 | S_IFSOCK, S_IFIFO, S_IFMT, |
| 103 | }; |
| 104 | |
| 105 | static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ |
| 106 | |
| 107 | static u_long wantfreevnodes = 25; |
| 108 | SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, |
| 109 | &wantfreevnodes, 0, ""); |
| 110 | static u_long freevnodes = 0; |
| 111 | SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, |
| 112 | &freevnodes, 0, ""); |
| 113 | |
| 114 | static int reassignbufcalls; |
| 115 | SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, |
| 116 | &reassignbufcalls, 0, ""); |
| 117 | static int reassignbufloops; |
| 118 | SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, |
| 119 | &reassignbufloops, 0, ""); |
| 120 | static int reassignbufsortgood; |
| 121 | SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, |
| 122 | &reassignbufsortgood, 0, ""); |
| 123 | static int reassignbufsortbad; |
| 124 | SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, |
| 125 | &reassignbufsortbad, 0, ""); |
| 126 | static int reassignbufmethod = 1; |
| 127 | SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, |
| 128 | &reassignbufmethod, 0, ""); |
| 129 | |
| 130 | #ifdef ENABLE_VFS_IOOPT |
| 131 | int vfs_ioopt = 0; |
| 132 | SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); |
| 133 | #endif |
| 134 | |
| 135 | struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */ |
| 136 | struct lwkt_token mountlist_token; |
| 137 | struct lwkt_token mntvnode_token; |
| 138 | int nfs_mount_type = -1; |
| 139 | static struct lwkt_token mntid_token; |
| 140 | static struct lwkt_token vnode_free_list_token; |
| 141 | static struct lwkt_token spechash_token; |
| 142 | struct nfs_public nfs_pub; /* publicly exported FS */ |
| 143 | static vm_zone_t vnode_zone; |
| 144 | |
| 145 | /* |
| 146 | * The workitem queue. |
| 147 | */ |
| 148 | #define SYNCER_MAXDELAY 32 |
| 149 | static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ |
| 150 | time_t syncdelay = 30; /* max time to delay syncing data */ |
| 151 | SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, |
| 152 | &syncdelay, 0, "VFS data synchronization delay"); |
| 153 | time_t filedelay = 30; /* time to delay syncing files */ |
| 154 | SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, |
| 155 | &filedelay, 0, "File synchronization delay"); |
| 156 | time_t dirdelay = 29; /* time to delay syncing directories */ |
| 157 | SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, |
| 158 | &dirdelay, 0, "Directory synchronization delay"); |
| 159 | time_t metadelay = 28; /* time to delay syncing metadata */ |
| 160 | SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, |
| 161 | &metadelay, 0, "VFS metadata synchronization delay"); |
| 162 | static int rushjob; /* number of slots to run ASAP */ |
static int stat_rush_requests;	/* number of times I/O sped up */
| 164 | SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, |
| 165 | &stat_rush_requests, 0, ""); |
| 166 | |
| 167 | static int syncer_delayno = 0; |
| 168 | static long syncer_mask; |
| 169 | LIST_HEAD(synclist, vnode); |
| 170 | static struct synclist *syncer_workitem_pending; |
| 171 | |
| 172 | int desiredvnodes; |
| 173 | SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, |
| 174 | &desiredvnodes, 0, "Maximum number of vnodes"); |
| 175 | static int minvnodes; |
| 176 | SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, |
| 177 | &minvnodes, 0, "Minimum number of vnodes"); |
| 178 | static int vnlru_nowhere = 0; |
| 179 | SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, |
| 180 | &vnlru_nowhere, 0, |
| 181 | "Number of times the vnlru process ran without success"); |
| 182 | |
| 183 | static void vfs_free_addrlist (struct netexport *nep); |
| 184 | static int vfs_free_netcred (struct radix_node *rn, void *w); |
| 185 | static int vfs_hang_addrlist (struct mount *mp, struct netexport *nep, |
| 186 | struct export_args *argp); |
| 187 | |
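/*
 * Free-list predicates, summarized:
 *
 * VSHOULDFREE:	not already free or doomed, no holds or users, and the
 *		VM object (if any) has no references or resident pages,
 *		so the vnode can be placed on the free list.
 * VMIGHTFREE:	not free, doomed, or undergoing vclean (VXLOCK), a leaf
 *		in the namecache, and unused, so the vnode is a
 *		reclamation candidate.
 * VSHOULDBUSY:	on the free list but has regained a hold or user
 *		reference, so the vnode must be taken off the list.
 */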
| 188 | #define VSHOULDFREE(vp) \ |
| 189 | (!((vp)->v_flag & (VFREE|VDOOMED)) && \ |
| 190 | !(vp)->v_holdcnt && !(vp)->v_usecount && \ |
| 191 | (!(vp)->v_object || \ |
| 192 | !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count))) |
| 193 | |
| 194 | #define VMIGHTFREE(vp) \ |
| 195 | (((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) == 0 && \ |
| 196 | cache_leaf_test(vp) == 0 && (vp)->v_usecount == 0) |
| 197 | |
| 198 | #define VSHOULDBUSY(vp) \ |
| 199 | (((vp)->v_flag & VFREE) && \ |
| 200 | ((vp)->v_holdcnt || (vp)->v_usecount)) |
| 201 | |
| 202 | static void vbusy(struct vnode *vp); |
| 203 | static void vfree(struct vnode *vp); |
| 204 | static void vmaybefree(struct vnode *vp); |
| 205 | |
| 206 | extern int dev_ref_debug; |
| 207 | |
| 208 | /* |
| 209 | * NOTE: the vnode interlock must be held on call. |
| 210 | */ |
| 211 | static __inline void |
| 212 | vmaybefree(struct vnode *vp) |
| 213 | { |
| 214 | if (VSHOULDFREE(vp)) |
| 215 | vfree(vp); |
| 216 | } |
| 217 | |
| 218 | /* |
| 219 | * Initialize the vnode management data structures. |
| 220 | */ |
| 221 | void |
| 222 | vntblinit(void) |
| 223 | { |
| 224 | |
| 225 | /* |
	 * Desired vnodes is a result of the physical page count
	 * and the size of the kernel's heap.  It scales in proportion
	 * to the amount of available physical memory.  This can
	 * cause trouble on 64-bit and large-memory platforms.
| 230 | */ |
| 231 | /* desiredvnodes = maxproc + vmstats.v_page_count / 4; */ |
| 232 | desiredvnodes = |
	    min(maxproc + vmstats.v_page_count / 4,
| 234 | 2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / |
| 235 | (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); |
| 236 | |
| 237 | minvnodes = desiredvnodes / 4; |
| 238 | lwkt_token_init(&mountlist_token); |
| 239 | lwkt_token_init(&mntvnode_token); |
| 240 | lwkt_token_init(&mntid_token); |
| 241 | lwkt_token_init(&spechash_token); |
| 242 | TAILQ_INIT(&vnode_free_list); |
| 243 | lwkt_token_init(&vnode_free_list_token); |
| 244 | vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); |
| 245 | /* |
| 246 | * Initialize the filesystem syncer. |
| 247 | */ |
| 248 | syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, |
| 249 | &syncer_mask); |
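	/*
	 * Note: hashinit() sizes the table to a power of 2 and returns
	 * (size - 1) in syncer_mask, so recomputing syncer_maxdelay
	 * below keeps it consistent with the actual table size.
	 */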
| 250 | syncer_maxdelay = syncer_mask + 1; |
| 251 | } |
| 252 | |
| 253 | /* |
| 254 | * Mark a mount point as busy. Used to synchronize access and to delay |
| 255 | * unmounting. Interlock is not released on failure. |
| 256 | */ |
| 257 | int |
| 258 | vfs_busy(struct mount *mp, int flags, |
| 259 | lwkt_tokref_t interlkp, struct thread *td) |
| 260 | { |
| 261 | int lkflags; |
| 262 | |
| 263 | if (mp->mnt_kern_flag & MNTK_UNMOUNT) { |
| 264 | if (flags & LK_NOWAIT) |
| 265 | return (ENOENT); |
| 266 | mp->mnt_kern_flag |= MNTK_MWAIT; |
| 267 | /* |
| 268 | * Since all busy locks are shared except the exclusive |
| 269 | * lock granted when unmounting, the only place that a |
| 270 | * wakeup needs to be done is at the release of the |
| 271 | * exclusive lock at the end of dounmount. |
| 272 | * |
| 273 | * note: interlkp is a serializer and thus can be safely |
| 274 | * held through any sleep |
| 275 | */ |
| 276 | tsleep((caddr_t)mp, 0, "vfs_busy", 0); |
| 277 | return (ENOENT); |
| 278 | } |
| 279 | lkflags = LK_SHARED | LK_NOPAUSE; |
| 280 | if (interlkp) |
| 281 | lkflags |= LK_INTERLOCK; |
| 282 | if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) |
| 283 | panic("vfs_busy: unexpected lock failure"); |
| 284 | return (0); |
| 285 | } |
| 286 | |
| 287 | /* |
| 288 | * Free a busy filesystem. |
| 289 | */ |
| 290 | void |
| 291 | vfs_unbusy(struct mount *mp, struct thread *td) |
| 292 | { |
| 293 | lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); |
| 294 | } |
| 295 | |
| 296 | /* |
| 297 | * Lookup a filesystem type, and if found allocate and initialize |
| 298 | * a mount structure for it. |
| 299 | * |
| 300 | * Devname is usually updated by mount(8) after booting. |
| 301 | */ |
| 302 | int |
| 303 | vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp) |
| 304 | { |
| 305 | struct thread *td = curthread; /* XXX */ |
| 306 | struct vfsconf *vfsp; |
| 307 | struct mount *mp; |
| 308 | |
| 309 | if (fstypename == NULL) |
| 310 | return (ENODEV); |
| 311 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) |
| 312 | if (!strcmp(vfsp->vfc_name, fstypename)) |
| 313 | break; |
| 314 | if (vfsp == NULL) |
| 315 | return (ENODEV); |
| 316 | mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); |
| 317 | bzero((char *)mp, (u_long)sizeof(struct mount)); |
| 318 | lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE); |
| 319 | vfs_busy(mp, LK_NOWAIT, NULL, td); |
| 320 | TAILQ_INIT(&mp->mnt_nvnodelist); |
| 321 | TAILQ_INIT(&mp->mnt_reservedvnlist); |
| 322 | mp->mnt_nvnodelistsize = 0; |
| 323 | mp->mnt_vfc = vfsp; |
| 324 | mp->mnt_op = vfsp->vfc_vfsops; |
| 325 | mp->mnt_flag = MNT_RDONLY; |
| 326 | mp->mnt_vnodecovered = NULLVP; |
| 327 | vfsp->vfc_refcount++; |
| 328 | mp->mnt_iosize_max = DFLTPHYS; |
| 329 | mp->mnt_stat.f_type = vfsp->vfc_typenum; |
| 330 | mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; |
| 331 | strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); |
| 332 | mp->mnt_stat.f_mntonname[0] = '/'; |
| 333 | mp->mnt_stat.f_mntonname[1] = 0; |
| 334 | (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); |
| 335 | *mpp = mp; |
| 336 | return (0); |
| 337 | } |
| 338 | |
| 339 | /* |
| 340 | * Lookup a mount point by filesystem identifier. |
| 341 | */ |
| 342 | struct mount * |
| 343 | vfs_getvfs(fsid_t *fsid) |
| 344 | { |
| 345 | struct mount *mp; |
| 346 | lwkt_tokref ilock; |
| 347 | |
| 348 | lwkt_gettoken(&ilock, &mountlist_token); |
| 349 | TAILQ_FOREACH(mp, &mountlist, mnt_list) { |
| 350 | if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && |
| 351 | mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { |
| 352 | break; |
| 353 | } |
| 354 | } |
| 355 | lwkt_reltoken(&ilock); |
| 356 | return (mp); |
| 357 | } |
| 358 | |
| 359 | /* |
| 360 | * Get a new unique fsid. Try to make its val[0] unique, since this value |
| 361 | * will be used to create fake device numbers for stat(). Also try (but |
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
| 363 | * support 16-bit device numbers. We end up with unique val[0]'s for the |
| 364 | * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. |
| 365 | * |
| 366 | * Keep in mind that several mounts may be running in parallel. Starting |
| 367 | * the search one past where the previous search terminated is both a |
| 368 | * micro-optimization and a defense against returning the same fsid to |
| 369 | * different mounts. |
| 370 | */ |
| 371 | void |
| 372 | vfs_getnewfsid(struct mount *mp) |
| 373 | { |
| 374 | static u_int16_t mntid_base; |
| 375 | lwkt_tokref ilock; |
| 376 | fsid_t tfsid; |
| 377 | int mtype; |
| 378 | |
| 379 | lwkt_gettoken(&ilock, &mntid_token); |
| 380 | mtype = mp->mnt_vfc->vfc_typenum; |
| 381 | tfsid.val[1] = mtype; |
| 382 | mtype = (mtype & 0xFF) << 24; |
| 383 | for (;;) { |
| 384 | tfsid.val[0] = makeudev(255, |
| 385 | mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); |
| 386 | mntid_base++; |
| 387 | if (vfs_getvfs(&tfsid) == NULL) |
| 388 | break; |
| 389 | } |
| 390 | mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; |
| 391 | mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; |
| 392 | lwkt_reltoken(&ilock); |
| 393 | } |
| 394 | |
| 395 | /* |
| 396 | * Knob to control the precision of file timestamps: |
| 397 | * |
| 398 | * 0 = seconds only; nanoseconds zeroed. |
| 399 | * 1 = seconds and nanoseconds, accurate within 1/HZ. |
| 400 | * 2 = seconds and nanoseconds, truncated to microseconds. |
| 401 | * >=3 = seconds and nanoseconds, maximum precision. |
| 402 | */ |
| 403 | enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; |
| 404 | |
| 405 | static int timestamp_precision = TSP_SEC; |
| 406 | SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, |
    &timestamp_precision, 0, "");
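
/*
 * e.g. "sysctl vfs.timestamp_precision=3" selects full nanosecond
 * precision at a higher per-timestamp cost.
 */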
| 408 | |
| 409 | /* |
| 410 | * Get a current timestamp. |
| 411 | */ |
| 412 | void |
| 413 | vfs_timestamp(struct timespec *tsp) |
| 414 | { |
| 415 | struct timeval tv; |
| 416 | |
| 417 | switch (timestamp_precision) { |
| 418 | case TSP_SEC: |
| 419 | tsp->tv_sec = time_second; |
| 420 | tsp->tv_nsec = 0; |
| 421 | break; |
| 422 | case TSP_HZ: |
| 423 | getnanotime(tsp); |
| 424 | break; |
| 425 | case TSP_USEC: |
| 426 | microtime(&tv); |
| 427 | TIMEVAL_TO_TIMESPEC(&tv, tsp); |
| 428 | break; |
| 429 | case TSP_NSEC: |
| 430 | default: |
| 431 | nanotime(tsp); |
| 432 | break; |
| 433 | } |
| 434 | } |
| 435 | |
| 436 | /* |
| 437 | * Set vnode attributes to VNOVAL |
| 438 | */ |
| 439 | void |
| 440 | vattr_null(struct vattr *vap) |
| 441 | { |
| 442 | vap->va_type = VNON; |
| 443 | vap->va_size = VNOVAL; |
| 444 | vap->va_bytes = VNOVAL; |
| 445 | vap->va_mode = VNOVAL; |
| 446 | vap->va_nlink = VNOVAL; |
| 447 | vap->va_uid = VNOVAL; |
| 448 | vap->va_gid = VNOVAL; |
| 449 | vap->va_fsid = VNOVAL; |
| 450 | vap->va_fileid = VNOVAL; |
| 451 | vap->va_blocksize = VNOVAL; |
| 452 | vap->va_rdev = VNOVAL; |
| 453 | vap->va_atime.tv_sec = VNOVAL; |
| 454 | vap->va_atime.tv_nsec = VNOVAL; |
| 455 | vap->va_mtime.tv_sec = VNOVAL; |
| 456 | vap->va_mtime.tv_nsec = VNOVAL; |
| 457 | vap->va_ctime.tv_sec = VNOVAL; |
| 458 | vap->va_ctime.tv_nsec = VNOVAL; |
| 459 | vap->va_flags = VNOVAL; |
| 460 | vap->va_gen = VNOVAL; |
| 461 | vap->va_vaflags = 0; |
| 462 | } |
| 463 | |
| 464 | /* |
| 465 | * This routine is called when we have too many vnodes. It attempts |
| 466 | * to free <count> vnodes and will potentially free vnodes that still |
| 467 | * have VM backing store (VM backing store is typically the cause |
| 468 | * of a vnode blowout so we want to do this). Therefore, this operation |
| 469 | * is not considered cheap. |
| 470 | * |
| 471 | * A number of conditions may prevent a vnode from being reclaimed. |
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
| 476 | * number of vnodes to reach some minimum value regardless of what |
| 477 | * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. |
| 478 | */ |
| 479 | static int |
| 480 | vlrureclaim(struct mount *mp) |
| 481 | { |
| 482 | struct vnode *vp; |
| 483 | lwkt_tokref ilock; |
| 484 | lwkt_tokref vlock; |
| 485 | int done; |
| 486 | int trigger; |
| 487 | int usevnodes; |
| 488 | int count; |
| 489 | |
| 490 | /* |
| 491 | * Calculate the trigger point, don't allow user |
| 492 | * screwups to blow us up. This prevents us from |
| 493 | * recycling vnodes with lots of resident pages. We |
| 494 | * aren't trying to free memory, we are trying to |
| 495 | * free vnodes. |
| 496 | */ |
| 497 | usevnodes = desiredvnodes; |
| 498 | if (usevnodes <= 0) |
| 499 | usevnodes = 1; |
| 500 | trigger = vmstats.v_page_count * 2 / usevnodes; |
| 501 | |
| 502 | done = 0; |
| 503 | lwkt_gettoken(&ilock, &mntvnode_token); |
| 504 | count = mp->mnt_nvnodelistsize / 10 + 1; |
| 505 | while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { |
| 506 | /* |
| 507 | * __VNODESCAN__ |
| 508 | * |
| 509 | * The VP will stick around while we hold mntvnode_token, |
| 510 | * at least until we block, so we can safely do an initial |
| 511 | * check. But we have to check again after obtaining |
| 512 | * the vnode interlock. vp->v_interlock points to stable |
| 513 | * storage so it's ok if the vp gets ripped out from |
| 514 | * under us while we are blocked. |
| 515 | */ |
| 516 | if (vp->v_type == VNON || |
| 517 | vp->v_type == VBAD || |
| 518 | !VMIGHTFREE(vp) || /* critical path opt */ |
| 519 | (vp->v_object && |
| 520 | vp->v_object->resident_page_count >= trigger) |
| 521 | ) { |
| 522 | TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); |
			TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
| 524 | --count; |
| 525 | continue; |
| 526 | } |
| 527 | |
| 528 | /* |
| 529 | * Get the interlock, delay moving the node to the tail so |
| 530 | * we don't race against new additions to the mountlist. |
| 531 | */ |
| 532 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 533 | if (TAILQ_FIRST(&mp->mnt_nvnodelist) != vp) { |
| 534 | lwkt_reltoken(&vlock); |
| 535 | continue; |
| 536 | } |
| 537 | TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); |
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
| 539 | |
| 540 | /* |
| 541 | * Must check again |
| 542 | */ |
| 543 | if (vp->v_type == VNON || |
| 544 | vp->v_type == VBAD || |
| 545 | !VMIGHTFREE(vp) || /* critical path opt */ |
| 546 | (vp->v_object && |
| 547 | vp->v_object->resident_page_count >= trigger) |
| 548 | ) { |
| 549 | lwkt_reltoken(&vlock); |
| 550 | --count; |
| 551 | continue; |
| 552 | } |
| 553 | vgonel(vp, &vlock, curthread); |
| 554 | ++done; |
| 555 | --count; |
| 556 | } |
| 557 | lwkt_reltoken(&ilock); |
| 558 | return done; |
| 559 | } |
| 560 | |
| 561 | /* |
| 562 | * Attempt to recycle vnodes in a context that is always safe to block. |
 * Calling vlrureclaim() from the bowels of file system code has some
| 564 | * interesting deadlock problems. |
| 565 | */ |
| 566 | static struct thread *vnlruthread; |
| 567 | static int vnlruproc_sig; |
| 568 | |
| 569 | static void |
| 570 | vnlru_proc(void) |
| 571 | { |
| 572 | struct mount *mp, *nmp; |
| 573 | lwkt_tokref ilock; |
| 574 | int s; |
| 575 | int done; |
| 576 | struct thread *td = curthread; |
| 577 | |
| 578 | EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td, |
| 579 | SHUTDOWN_PRI_FIRST); |
| 580 | |
| 581 | s = splbio(); |
| 582 | for (;;) { |
| 583 | kproc_suspend_loop(); |
| 584 | if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { |
| 585 | vnlruproc_sig = 0; |
| 586 | wakeup(&vnlruproc_sig); |
| 587 | tsleep(td, 0, "vlruwt", hz); |
| 588 | continue; |
| 589 | } |
| 590 | done = 0; |
| 591 | lwkt_gettoken(&ilock, &mountlist_token); |
| 592 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
| 593 | if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) { |
| 594 | nmp = TAILQ_NEXT(mp, mnt_list); |
| 595 | continue; |
| 596 | } |
| 597 | done += vlrureclaim(mp); |
| 598 | lwkt_gettokref(&ilock); |
| 599 | nmp = TAILQ_NEXT(mp, mnt_list); |
| 600 | vfs_unbusy(mp, td); |
| 601 | } |
| 602 | lwkt_reltoken(&ilock); |
| 603 | if (done == 0) { |
| 604 | vnlru_nowhere++; |
| 605 | tsleep(td, 0, "vlrup", hz * 3); |
| 606 | } |
| 607 | } |
| 608 | splx(s); |
| 609 | } |
| 610 | |
| 611 | static struct kproc_desc vnlru_kp = { |
| 612 | "vnlru", |
| 613 | vnlru_proc, |
| 614 | &vnlruthread |
| 615 | }; |
| 616 | SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) |
| 617 | |
| 618 | /* |
| 619 | * Routines having to do with the management of the vnode table. |
| 620 | */ |
| 621 | extern vop_t **dead_vnodeop_p; |
| 622 | |
| 623 | /* |
| 624 | * Return the next vnode from the free list. |
| 625 | */ |
| 626 | int |
| 627 | getnewvnode(enum vtagtype tag, struct mount *mp, |
| 628 | vop_t **vops, struct vnode **vpp) |
| 629 | { |
| 630 | int s; |
| 631 | struct thread *td = curthread; /* XXX */ |
| 632 | struct vnode *vp = NULL; |
| 633 | struct vnode *xvp; |
| 634 | vm_object_t object; |
| 635 | lwkt_tokref ilock; |
| 636 | lwkt_tokref vlock; |
| 637 | |
| 638 | s = splbio(); |
| 639 | |
| 640 | /* |
| 641 | * Try to reuse vnodes if we hit the max. This situation only |
	 * occurs on certain large-memory (2G+) systems.  We cannot
| 643 | * attempt to directly reclaim vnodes due to nasty recursion |
| 644 | * problems. |
| 645 | */ |
| 646 | while (numvnodes - freevnodes > desiredvnodes) { |
| 647 | if (vnlruproc_sig == 0) { |
| 648 | vnlruproc_sig = 1; /* avoid unnecessary wakeups */ |
| 649 | wakeup(vnlruthread); |
| 650 | } |
| 651 | tsleep(&vnlruproc_sig, 0, "vlruwk", hz); |
| 652 | } |
| 653 | |
| 654 | |
| 655 | /* |
| 656 | * Attempt to reuse a vnode already on the free list, allocating |
	 * a new vnode if we can't find one or if we have not yet reached
	 * the minimum needed for good LRU performance.
| 659 | */ |
| 660 | lwkt_gettoken(&ilock, &vnode_free_list_token); |
| 661 | if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { |
| 662 | int count; |
| 663 | |
| 664 | for (count = 0; count < freevnodes; count++) { |
| 665 | /* |
| 666 | * __VNODESCAN__ |
| 667 | * |
| 668 | * Pull the next vnode off the free list and do some |
| 669 | * sanity checks. Note that regardless of how we |
| 670 | * block, if freevnodes is non-zero there had better |
| 671 | * be something on the list. |
| 672 | */ |
| 673 | vp = TAILQ_FIRST(&vnode_free_list); |
| 674 | if (vp == NULL) |
| 675 | panic("getnewvnode: free vnode isn't"); |
| 676 | |
| 677 | /* |
| 678 | * Move the vnode to the end of the list so other |
| 679 | * processes do not double-block trying to recycle |
| 680 | * the same vnode (as an optimization), then get |
| 681 | * the interlock. |
| 682 | */ |
| 683 | TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); |
| 684 | TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); |
| 685 | |
| 686 | /* |
| 687 | * Skip vnodes that are in the process of being |
| 688 | * held or referenced. Since the act of adding or |
| 689 | * removing a vnode on the freelist requires a token |
| 690 | * and may block, the ref count may be adjusted |
| 691 | * prior to its addition or removal. |
| 692 | */ |
| 693 | if (VSHOULDBUSY(vp)) { |
| 694 | vp = NULL; |
| 695 | continue; |
| 696 | } |
| 697 | |
| 698 | |
| 699 | /* |
| 700 | * Obtain the vnode interlock and check that the |
| 701 | * vnode is still on the free list. |
| 702 | * |
| 703 | * This normally devolves into a degenerate case so |
| 704 | * it is optimal. Loop up if it isn't. Note that |
| 705 | * the vnode could be in the middle of being moved |
| 706 | * off the free list (the VSHOULDBUSY() check) and |
| 707 | * must be skipped if so. |
| 708 | */ |
| 709 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 710 | TAILQ_FOREACH_REVERSE(xvp, &vnode_free_list, |
| 711 | freelst, v_freelist) { |
| 712 | if (vp == xvp) |
| 713 | break; |
| 714 | } |
| 715 | if (vp != xvp || VSHOULDBUSY(vp)) { |
| 716 | vp = NULL; |
| 717 | continue; |
| 718 | } |
| 719 | |
| 720 | /* |
		 * We now safely own the vnode.  Do not recycle it
		 * if its VM object still has resident pages or
		 * references.
| 724 | */ |
| 725 | if ((VOP_GETVOBJECT(vp, &object) == 0 && |
| 726 | (object->resident_page_count || object->ref_count)) |
| 727 | ) { |
| 728 | lwkt_reltoken(&vlock); |
| 729 | vp = NULL; |
| 730 | continue; |
| 731 | } |
| 732 | |
| 733 | /* |
| 734 | * We can almost reuse this vnode. But we don't want |
| 735 | * to recycle it if the vnode has children in the |
| 736 | * namecache because that breaks the namecache's |
| 737 | * path element chain. (YYY use nc_refs for the |
| 738 | * check?) |
| 739 | */ |
| 740 | KKASSERT(vp->v_flag & VFREE); |
| 741 | TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); |
| 742 | |
| 743 | if (TAILQ_FIRST(&vp->v_namecache) == NULL || |
| 744 | cache_leaf_test(vp) >= 0) { |
| 745 | /* ok, we can reuse this vnode */ |
| 746 | break; |
| 747 | } |
| 748 | lwkt_reltoken(&vlock); |
| 749 | TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); |
| 750 | vp = NULL; |
| 751 | } |
| 752 | } |
| 753 | |
| 754 | /* |
	 * If vp is non-NULL we hold its interlock.
| 756 | */ |
| 757 | if (vp) { |
| 758 | vp->v_flag |= VDOOMED; |
| 759 | vp->v_flag &= ~VFREE; |
| 760 | freevnodes--; |
| 761 | lwkt_reltoken(&ilock); |
| 762 | cache_purge(vp); /* YYY may block */ |
| 763 | vp->v_lease = NULL; |
| 764 | if (vp->v_type != VBAD) { |
| 765 | vgonel(vp, &vlock, td); |
| 766 | } else { |
| 767 | lwkt_reltoken(&vlock); |
| 768 | } |
| 769 | |
| 770 | #ifdef INVARIANTS |
| 771 | { |
| 772 | int s; |
| 773 | |
| 774 | if (vp->v_data) |
| 775 | panic("cleaned vnode isn't"); |
| 776 | s = splbio(); |
| 777 | if (vp->v_numoutput) |
| 778 | panic("Clean vnode has pending I/O's"); |
| 779 | splx(s); |
| 780 | } |
| 781 | #endif |
| 782 | vp->v_flag = 0; |
| 783 | vp->v_lastw = 0; |
| 784 | vp->v_lasta = 0; |
| 785 | vp->v_cstart = 0; |
| 786 | vp->v_clen = 0; |
| 787 | vp->v_socket = 0; |
| 788 | vp->v_writecount = 0; /* XXX */ |
| 789 | } else { |
| 790 | lwkt_reltoken(&ilock); |
| 791 | vp = zalloc(vnode_zone); |
| 792 | bzero(vp, sizeof(*vp)); |
| 793 | vp->v_interlock = lwkt_token_pool_get(vp); |
| 794 | lwkt_token_init(&vp->v_pollinfo.vpi_token); |
| 795 | cache_purge(vp); |
| 796 | TAILQ_INIT(&vp->v_namecache); |
| 797 | numvnodes++; |
| 798 | } |
| 799 | |
| 800 | TAILQ_INIT(&vp->v_cleanblkhd); |
| 801 | TAILQ_INIT(&vp->v_dirtyblkhd); |
| 802 | vp->v_type = VNON; |
| 803 | vp->v_tag = tag; |
| 804 | vp->v_op = vops; |
| 805 | insmntque(vp, mp); |
| 806 | *vpp = vp; |
| 807 | vp->v_usecount = 1; |
| 808 | vp->v_data = 0; |
| 809 | splx(s); |
| 810 | |
| 811 | vfs_object_create(vp, td); |
| 812 | return (0); |
| 813 | } |
| 814 | |
| 815 | /* |
| 816 | * Move a vnode from one mount queue to another. |
| 817 | */ |
| 818 | static void |
| 819 | insmntque(struct vnode *vp, struct mount *mp) |
| 820 | { |
| 821 | lwkt_tokref ilock; |
| 822 | |
| 823 | lwkt_gettoken(&ilock, &mntvnode_token); |
| 824 | /* |
| 825 | * Delete from old mount point vnode list, if on one. |
| 826 | */ |
| 827 | if (vp->v_mount != NULL) { |
| 828 | KASSERT(vp->v_mount->mnt_nvnodelistsize > 0, |
| 829 | ("bad mount point vnode list size")); |
| 830 | TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); |
| 831 | vp->v_mount->mnt_nvnodelistsize--; |
| 832 | } |
| 833 | /* |
| 834 | * Insert into list of vnodes for the new mount point, if available. |
| 835 | */ |
| 836 | if ((vp->v_mount = mp) == NULL) { |
| 837 | lwkt_reltoken(&ilock); |
| 838 | return; |
| 839 | } |
| 840 | TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); |
| 841 | mp->mnt_nvnodelistsize++; |
| 842 | lwkt_reltoken(&ilock); |
| 843 | } |
| 844 | |
| 845 | /* |
| 846 | * Update outstanding I/O count and do wakeup if requested. |
| 847 | */ |
| 848 | void |
| 849 | vwakeup(struct buf *bp) |
| 850 | { |
| 851 | struct vnode *vp; |
| 852 | |
| 853 | bp->b_flags &= ~B_WRITEINPROG; |
| 854 | if ((vp = bp->b_vp)) { |
| 855 | vp->v_numoutput--; |
| 856 | if (vp->v_numoutput < 0) |
| 857 | panic("vwakeup: neg numoutput"); |
| 858 | if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { |
| 859 | vp->v_flag &= ~VBWAIT; |
| 860 | wakeup((caddr_t) &vp->v_numoutput); |
| 861 | } |
| 862 | } |
| 863 | } |
| 864 | |
| 865 | /* |
| 866 | * Flush out and invalidate all buffers associated with a vnode. |
| 867 | * Called with the underlying object locked. |
| 868 | */ |
| 869 | int |
| 870 | vinvalbuf(struct vnode *vp, int flags, struct thread *td, |
| 871 | int slpflag, int slptimeo) |
| 872 | { |
| 873 | struct buf *bp; |
| 874 | struct buf *nbp, *blist; |
| 875 | int s, error; |
| 876 | vm_object_t object; |
| 877 | lwkt_tokref vlock; |
| 878 | |
| 879 | if (flags & V_SAVE) { |
| 880 | s = splbio(); |
| 881 | while (vp->v_numoutput) { |
| 882 | vp->v_flag |= VBWAIT; |
| 883 | error = tsleep((caddr_t)&vp->v_numoutput, |
| 884 | slpflag, "vinvlbuf", slptimeo); |
| 885 | if (error) { |
| 886 | splx(s); |
| 887 | return (error); |
| 888 | } |
| 889 | } |
| 890 | if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { |
| 891 | splx(s); |
| 892 | if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0) |
| 893 | return (error); |
| 894 | s = splbio(); |
| 895 | if (vp->v_numoutput > 0 || |
| 896 | !TAILQ_EMPTY(&vp->v_dirtyblkhd)) |
| 897 | panic("vinvalbuf: dirty bufs"); |
| 898 | } |
| 899 | splx(s); |
| 900 | } |
| 901 | s = splbio(); |
| 902 | for (;;) { |
| 903 | blist = TAILQ_FIRST(&vp->v_cleanblkhd); |
| 904 | if (!blist) |
| 905 | blist = TAILQ_FIRST(&vp->v_dirtyblkhd); |
| 906 | if (!blist) |
| 907 | break; |
| 908 | |
| 909 | for (bp = blist; bp; bp = nbp) { |
| 910 | nbp = TAILQ_NEXT(bp, b_vnbufs); |
| 911 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 912 | error = BUF_TIMELOCK(bp, |
| 913 | LK_EXCLUSIVE | LK_SLEEPFAIL, |
| 914 | "vinvalbuf", slpflag, slptimeo); |
| 915 | if (error == ENOLCK) |
| 916 | break; |
| 917 | splx(s); |
| 918 | return (error); |
| 919 | } |
| 920 | /* |
| 921 | * XXX Since there are no node locks for NFS, I |
| 922 | * believe there is a slight chance that a delayed |
| 923 | * write will occur while sleeping just above, so |
| 924 | * check for it. Note that vfs_bio_awrite expects |
| 925 | * buffers to reside on a queue, while VOP_BWRITE and |
| 926 | * brelse do not. |
| 927 | */ |
| 928 | if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && |
| 929 | (flags & V_SAVE)) { |
| 930 | |
| 931 | if (bp->b_vp == vp) { |
| 932 | if (bp->b_flags & B_CLUSTEROK) { |
| 933 | BUF_UNLOCK(bp); |
| 934 | vfs_bio_awrite(bp); |
| 935 | } else { |
| 936 | bremfree(bp); |
| 937 | bp->b_flags |= B_ASYNC; |
| 938 | VOP_BWRITE(bp->b_vp, bp); |
| 939 | } |
| 940 | } else { |
| 941 | bremfree(bp); |
| 942 | (void) VOP_BWRITE(bp->b_vp, bp); |
| 943 | } |
| 944 | break; |
| 945 | } |
| 946 | bremfree(bp); |
| 947 | bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); |
| 948 | bp->b_flags &= ~B_ASYNC; |
| 949 | brelse(bp); |
| 950 | } |
| 951 | } |
| 952 | |
| 953 | /* |
| 954 | * Wait for I/O to complete. XXX needs cleaning up. The vnode can |
| 955 | * have write I/O in-progress but if there is a VM object then the |
| 956 | * VM object can also have read-I/O in-progress. |
| 957 | */ |
| 958 | do { |
| 959 | while (vp->v_numoutput > 0) { |
| 960 | vp->v_flag |= VBWAIT; |
| 961 | tsleep(&vp->v_numoutput, 0, "vnvlbv", 0); |
| 962 | } |
| 963 | if (VOP_GETVOBJECT(vp, &object) == 0) { |
| 964 | while (object->paging_in_progress) |
| 965 | vm_object_pip_sleep(object, "vnvlbx"); |
| 966 | } |
| 967 | } while (vp->v_numoutput > 0); |
| 968 | |
| 969 | splx(s); |
| 970 | |
| 971 | /* |
| 972 | * Destroy the copy in the VM cache, too. |
| 973 | */ |
| 974 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 975 | if (VOP_GETVOBJECT(vp, &object) == 0) { |
| 976 | vm_object_page_remove(object, 0, 0, |
| 977 | (flags & V_SAVE) ? TRUE : FALSE); |
| 978 | } |
| 979 | lwkt_reltoken(&vlock); |
| 980 | |
| 981 | if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) |
| 982 | panic("vinvalbuf: flush failed"); |
| 983 | return (0); |
| 984 | } |
| 985 | |
| 986 | /* |
| 987 | * Truncate a file's buffer and pages to a specified length. This |
| 988 | * is in lieu of the old vinvalbuf mechanism, which performed unneeded |
| 989 | * sync activity. |
| 990 | */ |
| 991 | int |
| 992 | vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize) |
| 993 | { |
| 994 | struct buf *bp; |
| 995 | struct buf *nbp; |
| 996 | int s, anyfreed; |
| 997 | int trunclbn; |
| 998 | |
| 999 | /* |
| 1000 | * Round up to the *next* lbn. |
| 1001 | */ |
| 1002 | trunclbn = (length + blksize - 1) / blksize; |
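
	/*
	 * e.g. a length of 10000 with a blksize of 4096 yields a
	 * trunclbn of 3: buffers at lblkno 3 and beyond are invalidated
	 * while blocks 0-2, which still hold valid data, are kept.
	 */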
| 1003 | |
| 1004 | s = splbio(); |
| 1005 | restart: |
| 1006 | anyfreed = 1; |
| 1007 | for (;anyfreed;) { |
| 1008 | anyfreed = 0; |
| 1009 | for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { |
| 1010 | nbp = TAILQ_NEXT(bp, b_vnbufs); |
| 1011 | if (bp->b_lblkno >= trunclbn) { |
| 1012 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 1013 | BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); |
| 1014 | goto restart; |
| 1015 | } else { |
| 1016 | bremfree(bp); |
| 1017 | bp->b_flags |= (B_INVAL | B_RELBUF); |
| 1018 | bp->b_flags &= ~B_ASYNC; |
| 1019 | brelse(bp); |
| 1020 | anyfreed = 1; |
| 1021 | } |
| 1022 | if (nbp && |
| 1023 | (((nbp->b_xflags & BX_VNCLEAN) == 0) || |
| 1024 | (nbp->b_vp != vp) || |
| 1025 | (nbp->b_flags & B_DELWRI))) { |
| 1026 | goto restart; |
| 1027 | } |
| 1028 | } |
| 1029 | } |
| 1030 | |
| 1031 | for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { |
| 1032 | nbp = TAILQ_NEXT(bp, b_vnbufs); |
| 1033 | if (bp->b_lblkno >= trunclbn) { |
| 1034 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 1035 | BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); |
| 1036 | goto restart; |
| 1037 | } else { |
| 1038 | bremfree(bp); |
| 1039 | bp->b_flags |= (B_INVAL | B_RELBUF); |
| 1040 | bp->b_flags &= ~B_ASYNC; |
| 1041 | brelse(bp); |
| 1042 | anyfreed = 1; |
| 1043 | } |
| 1044 | if (nbp && |
| 1045 | (((nbp->b_xflags & BX_VNDIRTY) == 0) || |
| 1046 | (nbp->b_vp != vp) || |
| 1047 | (nbp->b_flags & B_DELWRI) == 0)) { |
| 1048 | goto restart; |
| 1049 | } |
| 1050 | } |
| 1051 | } |
| 1052 | } |
| 1053 | |
| 1054 | if (length > 0) { |
| 1055 | restartsync: |
| 1056 | for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { |
| 1057 | nbp = TAILQ_NEXT(bp, b_vnbufs); |
| 1058 | if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { |
| 1059 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 1060 | BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); |
| 1061 | goto restart; |
| 1062 | } else { |
| 1063 | bremfree(bp); |
| 1064 | if (bp->b_vp == vp) { |
| 1065 | bp->b_flags |= B_ASYNC; |
| 1066 | } else { |
| 1067 | bp->b_flags &= ~B_ASYNC; |
| 1068 | } |
| 1069 | VOP_BWRITE(bp->b_vp, bp); |
| 1070 | } |
| 1071 | goto restartsync; |
| 1072 | } |
| 1073 | |
| 1074 | } |
| 1075 | } |
| 1076 | |
| 1077 | while (vp->v_numoutput > 0) { |
| 1078 | vp->v_flag |= VBWAIT; |
| 1079 | tsleep(&vp->v_numoutput, 0, "vbtrunc", 0); |
| 1080 | } |
| 1081 | |
| 1082 | splx(s); |
| 1083 | |
| 1084 | vnode_pager_setsize(vp, length); |
| 1085 | |
| 1086 | return (0); |
| 1087 | } |
| 1088 | |
| 1089 | /* |
| 1090 | * Associate a buffer with a vnode. |
| 1091 | */ |
| 1092 | void |
| 1093 | bgetvp(struct vnode *vp, struct buf *bp) |
| 1094 | { |
| 1095 | int s; |
| 1096 | |
| 1097 | KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); |
| 1098 | |
| 1099 | vhold(vp); |
| 1100 | bp->b_vp = vp; |
| 1101 | bp->b_dev = vn_todev(vp); |
| 1102 | /* |
| 1103 | * Insert onto list for new vnode. |
| 1104 | */ |
| 1105 | s = splbio(); |
| 1106 | bp->b_xflags |= BX_VNCLEAN; |
| 1107 | bp->b_xflags &= ~BX_VNDIRTY; |
| 1108 | TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); |
| 1109 | splx(s); |
| 1110 | } |
| 1111 | |
| 1112 | /* |
| 1113 | * Disassociate a buffer from a vnode. |
| 1114 | */ |
| 1115 | void |
| 1116 | brelvp(struct buf *bp) |
| 1117 | { |
| 1118 | struct vnode *vp; |
| 1119 | struct buflists *listheadp; |
| 1120 | int s; |
| 1121 | |
| 1122 | KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); |
| 1123 | |
| 1124 | /* |
| 1125 | * Delete from old vnode list, if on one. |
| 1126 | */ |
| 1127 | vp = bp->b_vp; |
| 1128 | s = splbio(); |
| 1129 | if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { |
| 1130 | if (bp->b_xflags & BX_VNDIRTY) |
| 1131 | listheadp = &vp->v_dirtyblkhd; |
| 1132 | else |
| 1133 | listheadp = &vp->v_cleanblkhd; |
| 1134 | TAILQ_REMOVE(listheadp, bp, b_vnbufs); |
| 1135 | bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); |
| 1136 | } |
| 1137 | if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { |
| 1138 | vp->v_flag &= ~VONWORKLST; |
| 1139 | LIST_REMOVE(vp, v_synclist); |
| 1140 | } |
| 1141 | splx(s); |
| 1142 | bp->b_vp = (struct vnode *) 0; |
| 1143 | vdrop(vp); |
| 1144 | } |
| 1145 | |
| 1146 | /* |
| 1147 | * The workitem queue. |
| 1148 | * |
| 1149 | * It is useful to delay writes of file data and filesystem metadata |
| 1150 | * for tens of seconds so that quickly created and deleted files need |
| 1151 | * not waste disk bandwidth being created and removed. To realize this, |
| 1152 | * we append vnodes to a "workitem" queue. When running with a soft |
| 1153 | * updates implementation, most pending metadata dependencies should |
 * not wait for more than a few seconds.  Thus, metadata on mounted
 * block devices is delayed only about half the time that file data is
 * delayed.  Similarly, directory updates are more critical, so they
 * are delayed only about a third of the time that file data is
 * delayed.  Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of
| 1159 | * one each second (driven off the filesystem syncer process). The |
| 1160 | * syncer_delayno variable indicates the next queue that is to be processed. |
| 1161 | * Items that need to be processed soon are placed in this queue: |
| 1162 | * |
| 1163 | * syncer_workitem_pending[syncer_delayno] |
| 1164 | * |
| 1165 | * A delay of fifteen seconds is done by placing the request fifteen |
| 1166 | * entries later in the queue: |
| 1167 | * |
| 1168 | * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] |
| 1169 | * |
| 1170 | */ |
| 1171 | |
| 1172 | /* |
| 1173 | * Add an item to the syncer work queue. |
| 1174 | */ |
| 1175 | static void |
| 1176 | vn_syncer_add_to_worklist(struct vnode *vp, int delay) |
| 1177 | { |
| 1178 | int s, slot; |
| 1179 | |
| 1180 | s = splbio(); |
| 1181 | |
| 1182 | if (vp->v_flag & VONWORKLST) { |
| 1183 | LIST_REMOVE(vp, v_synclist); |
| 1184 | } |
| 1185 | |
| 1186 | if (delay > syncer_maxdelay - 2) |
| 1187 | delay = syncer_maxdelay - 2; |
| 1188 | slot = (syncer_delayno + delay) & syncer_mask; |
| 1189 | |
| 1190 | LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); |
| 1191 | vp->v_flag |= VONWORKLST; |
| 1192 | splx(s); |
| 1193 | } |
| 1194 | |
| 1195 | struct thread *updatethread; |
| 1196 | static void sched_sync (void); |
| 1197 | static struct kproc_desc up_kp = { |
| 1198 | "syncer", |
| 1199 | sched_sync, |
| 1200 | &updatethread |
| 1201 | }; |
| 1202 | SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) |
| 1203 | |
| 1204 | /* |
| 1205 | * System filesystem synchronizer daemon. |
| 1206 | */ |
| 1207 | void |
| 1208 | sched_sync(void) |
| 1209 | { |
| 1210 | struct synclist *slp; |
| 1211 | struct vnode *vp; |
| 1212 | long starttime; |
| 1213 | int s; |
| 1214 | struct thread *td = curthread; |
| 1215 | |
| 1216 | EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td, |
| 1217 | SHUTDOWN_PRI_LAST); |
| 1218 | |
| 1219 | for (;;) { |
| 1220 | kproc_suspend_loop(); |
| 1221 | |
| 1222 | starttime = time_second; |
| 1223 | |
| 1224 | /* |
| 1225 | * Push files whose dirty time has expired. Be careful |
| 1226 | * of interrupt race on slp queue. |
| 1227 | */ |
| 1228 | s = splbio(); |
| 1229 | slp = &syncer_workitem_pending[syncer_delayno]; |
| 1230 | syncer_delayno += 1; |
| 1231 | if (syncer_delayno == syncer_maxdelay) |
| 1232 | syncer_delayno = 0; |
| 1233 | splx(s); |
| 1234 | |
| 1235 | while ((vp = LIST_FIRST(slp)) != NULL) { |
| 1236 | if (VOP_ISLOCKED(vp, NULL) == 0) { |
| 1237 | vn_lock(vp, NULL, LK_EXCLUSIVE | LK_RETRY, td); |
| 1238 | (void) VOP_FSYNC(vp, MNT_LAZY, td); |
| 1239 | VOP_UNLOCK(vp, NULL, 0, td); |
| 1240 | } |
| 1241 | s = splbio(); |
| 1242 | if (LIST_FIRST(slp) == vp) { |
| 1243 | /* |
| 1244 | * Note: v_tag VT_VFS vps can remain on the |
| 1245 | * worklist too with no dirty blocks, but |
| 1246 | * since sync_fsync() moves it to a different |
| 1247 | * slot we are safe. |
| 1248 | */ |
| 1249 | if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && |
| 1250 | !vn_isdisk(vp, NULL)) |
| 1251 | panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); |
| 1252 | /* |
| 1253 | * Put us back on the worklist. The worklist |
| 1254 | * routine will remove us from our current |
| 1255 | * position and then add us back in at a later |
| 1256 | * position. |
| 1257 | */ |
| 1258 | vn_syncer_add_to_worklist(vp, syncdelay); |
| 1259 | } |
| 1260 | splx(s); |
| 1261 | } |
| 1262 | |
| 1263 | /* |
| 1264 | * Do soft update processing. |
| 1265 | */ |
| 1266 | if (bioops.io_sync) |
| 1267 | (*bioops.io_sync)(NULL); |
| 1268 | |
| 1269 | /* |
| 1270 | * The variable rushjob allows the kernel to speed up the |
| 1271 | * processing of the filesystem syncer process. A rushjob |
| 1272 | * value of N tells the filesystem syncer to process the next |
| 1273 | * N seconds worth of work on its queue ASAP. Currently rushjob |
| 1274 | * is used by the soft update code to speed up the filesystem |
| 1275 | * syncer process when the incore state is getting so far |
| 1276 | * ahead of the disk that the kernel memory pool is being |
| 1277 | * threatened with exhaustion. |
| 1278 | */ |
| 1279 | if (rushjob > 0) { |
| 1280 | rushjob -= 1; |
| 1281 | continue; |
| 1282 | } |
| 1283 | /* |
| 1284 | * If it has taken us less than a second to process the |
| 1285 | * current work, then wait. Otherwise start right over |
| 1286 | * again. We can still lose time if any single round |
| 1287 | * takes more than two seconds, but it does not really |
| 1288 | * matter as we are just trying to generally pace the |
| 1289 | * filesystem activity. |
| 1290 | */ |
| 1291 | if (time_second == starttime) |
| 1292 | tsleep(&lbolt, 0, "syncer", 0); |
| 1293 | } |
| 1294 | } |
| 1295 | |
| 1296 | /* |
| 1297 | * Request the syncer daemon to speed up its work. |
| 1298 | * We never push it to speed up more than half of its |
 * normal turn time; otherwise it could take over the cpu.
| 1300 | * |
| 1301 | * YYY wchan field protected by the BGL. |
| 1302 | */ |
| 1303 | int |
| 1304 | speedup_syncer(void) |
| 1305 | { |
| 1306 | crit_enter(); |
| 1307 | if (updatethread->td_wchan == &lbolt) { /* YYY */ |
| 1308 | unsleep(updatethread); |
| 1309 | lwkt_schedule(updatethread); |
| 1310 | } |
| 1311 | crit_exit(); |
| 1312 | if (rushjob < syncdelay / 2) { |
| 1313 | rushjob += 1; |
| 1314 | stat_rush_requests += 1; |
| 1315 | return (1); |
| 1316 | } |
| 1317 | return(0); |
| 1318 | } |
| 1319 | |
| 1320 | /* |
| 1321 | * Associate a p-buffer with a vnode. |
| 1322 | * |
| 1323 | * Also sets B_PAGING flag to indicate that vnode is not fully associated |
| 1324 | * with the buffer. i.e. the bp has not been linked into the vnode or |
| 1325 | * ref-counted. |
| 1326 | */ |
| 1327 | void |
| 1328 | pbgetvp(struct vnode *vp, struct buf *bp) |
| 1329 | { |
| 1330 | KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); |
| 1331 | |
| 1332 | bp->b_vp = vp; |
| 1333 | bp->b_flags |= B_PAGING; |
| 1334 | bp->b_dev = vn_todev(vp); |
| 1335 | } |
| 1336 | |
| 1337 | /* |
| 1338 | * Disassociate a p-buffer from a vnode. |
| 1339 | */ |
| 1340 | void |
| 1341 | pbrelvp(struct buf *bp) |
| 1342 | { |
| 1343 | KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); |
| 1344 | |
| 1345 | /* XXX REMOVE ME */ |
| 1346 | if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { |
| 1347 | panic( |
| 1348 | "relpbuf(): b_vp was probably reassignbuf()d %p %x", |
| 1349 | bp, |
| 1350 | (int)bp->b_flags |
| 1351 | ); |
| 1352 | } |
| 1353 | bp->b_vp = (struct vnode *) 0; |
| 1354 | bp->b_flags &= ~B_PAGING; |
| 1355 | } |
| 1356 | |
| 1357 | void |
| 1358 | pbreassignbuf(struct buf *bp, struct vnode *newvp) |
| 1359 | { |
| 1360 | if ((bp->b_flags & B_PAGING) == 0) { |
| 1361 | panic( |
| 1362 | "pbreassignbuf() on non phys bp %p", |
| 1363 | bp |
| 1364 | ); |
| 1365 | } |
| 1366 | bp->b_vp = newvp; |
| 1367 | } |
| 1368 | |
| 1369 | /* |
| 1370 | * Reassign a buffer from one vnode to another. |
| 1371 | * Used to assign file specific control information |
| 1372 | * (indirect blocks) to the vnode to which they belong. |
| 1373 | */ |
| 1374 | void |
| 1375 | reassignbuf(struct buf *bp, struct vnode *newvp) |
| 1376 | { |
| 1377 | struct buflists *listheadp; |
| 1378 | int delay; |
| 1379 | int s; |
| 1380 | |
| 1381 | if (newvp == NULL) { |
| 1382 | printf("reassignbuf: NULL"); |
| 1383 | return; |
| 1384 | } |
| 1385 | ++reassignbufcalls; |
| 1386 | |
| 1387 | /* |
| 1388 | * B_PAGING flagged buffers cannot be reassigned because their vp |
| 1389 | * is not fully linked in. |
| 1390 | */ |
| 1391 | if (bp->b_flags & B_PAGING) |
| 1392 | panic("cannot reassign paging buffer"); |
| 1393 | |
| 1394 | s = splbio(); |
| 1395 | /* |
| 1396 | * Delete from old vnode list, if on one. |
| 1397 | */ |
| 1398 | if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { |
| 1399 | if (bp->b_xflags & BX_VNDIRTY) |
| 1400 | listheadp = &bp->b_vp->v_dirtyblkhd; |
| 1401 | else |
| 1402 | listheadp = &bp->b_vp->v_cleanblkhd; |
| 1403 | TAILQ_REMOVE(listheadp, bp, b_vnbufs); |
| 1404 | bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); |
| 1405 | if (bp->b_vp != newvp) { |
| 1406 | vdrop(bp->b_vp); |
| 1407 | bp->b_vp = NULL; /* for clarification */ |
| 1408 | } |
| 1409 | } |
| 1410 | /* |
| 1411 | * If dirty, put on list of dirty buffers; otherwise insert onto list |
| 1412 | * of clean buffers. |
| 1413 | */ |
| 1414 | if (bp->b_flags & B_DELWRI) { |
| 1415 | struct buf *tbp; |
| 1416 | |
| 1417 | listheadp = &newvp->v_dirtyblkhd; |
| 1418 | if ((newvp->v_flag & VONWORKLST) == 0) { |
| 1419 | switch (newvp->v_type) { |
| 1420 | case VDIR: |
| 1421 | delay = dirdelay; |
| 1422 | break; |
| 1423 | case VCHR: |
| 1424 | case VBLK: |
| 1425 | if (newvp->v_rdev && |
| 1426 | newvp->v_rdev->si_mountpoint != NULL) { |
| 1427 | delay = metadelay; |
| 1428 | break; |
| 1429 | } |
| 1430 | /* fall through */ |
| 1431 | default: |
| 1432 | delay = filedelay; |
| 1433 | } |
| 1434 | vn_syncer_add_to_worklist(newvp, delay); |
| 1435 | } |
| 1436 | bp->b_xflags |= BX_VNDIRTY; |
| 1437 | tbp = TAILQ_FIRST(listheadp); |
| 1438 | if (tbp == NULL || |
| 1439 | bp->b_lblkno == 0 || |
| 1440 | (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || |
| 1441 | (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { |
| 1442 | TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); |
| 1443 | ++reassignbufsortgood; |
| 1444 | } else if (bp->b_lblkno < 0) { |
| 1445 | TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); |
| 1446 | ++reassignbufsortgood; |
| 1447 | } else if (reassignbufmethod == 1) { |
| 1448 | /* |
| 1449 | * New sorting algorithm, only handle sequential case, |
| 1450 | * otherwise append to end (but before metadata) |
| 1451 | */ |
| 1452 | if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && |
| 1453 | (tbp->b_xflags & BX_VNDIRTY)) { |
| 1454 | /* |
| 1455 | * Found the best place to insert the buffer |
| 1456 | */ |
| 1457 | TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); |
| 1458 | ++reassignbufsortgood; |
| 1459 | } else { |
| 1460 | /* |
| 1461 | * Missed, append to end, but before meta-data. |
| 1462 | * We know that the head buffer in the list is |
| 1463 | * not meta-data due to prior conditionals. |
| 1464 | * |
| 1465 | * Indirect effects: NFS second stage write |
| 1466 | * tends to wind up here, giving maximum |
| 1467 | * distance between the unstable write and the |
| 1468 | * commit rpc. |
| 1469 | */ |
| 1470 | tbp = TAILQ_LAST(listheadp, buflists); |
| 1471 | while (tbp && tbp->b_lblkno < 0) |
| 1472 | tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); |
| 1473 | TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); |
| 1474 | ++reassignbufsortbad; |
| 1475 | } |
| 1476 | } else { |
| 1477 | /* |
| 1478 | * Old sorting algorithm, scan queue and insert |
| 1479 | */ |
| 1480 | struct buf *ttbp; |
| 1481 | while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && |
| 1482 | (ttbp->b_lblkno < bp->b_lblkno)) { |
| 1483 | ++reassignbufloops; |
| 1484 | tbp = ttbp; |
| 1485 | } |
| 1486 | TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); |
| 1487 | } |
| 1488 | } else { |
| 1489 | bp->b_xflags |= BX_VNCLEAN; |
| 1490 | TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); |
| 1491 | if ((newvp->v_flag & VONWORKLST) && |
| 1492 | TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { |
| 1493 | newvp->v_flag &= ~VONWORKLST; |
| 1494 | LIST_REMOVE(newvp, v_synclist); |
| 1495 | } |
| 1496 | } |
| 1497 | if (bp->b_vp != newvp) { |
| 1498 | bp->b_vp = newvp; |
| 1499 | vhold(bp->b_vp); |
| 1500 | } |
| 1501 | splx(s); |
| 1502 | } |
| 1503 | |
| 1504 | /* |
| 1505 | * Create a vnode for a block device. |
| 1506 | * Used for mounting the root file system. |
| 1507 | */ |
| 1508 | int |
| 1509 | bdevvp(dev_t dev, struct vnode **vpp) |
| 1510 | { |
| 1511 | struct vnode *vp; |
| 1512 | struct vnode *nvp; |
| 1513 | int error; |
| 1514 | |
| 1515 | if (dev == NODEV) { |
| 1516 | *vpp = NULLVP; |
| 1517 | return (ENXIO); |
| 1518 | } |
| 1519 | error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); |
| 1520 | if (error) { |
| 1521 | *vpp = NULLVP; |
| 1522 | return (error); |
| 1523 | } |
| 1524 | vp = nvp; |
| 1525 | vp->v_type = VCHR; |
| 1526 | vp->v_udev = dev->si_udev; |
| 1527 | *vpp = vp; |
| 1528 | return (0); |
| 1529 | } |
| 1530 | |
| 1531 | int |
| 1532 | v_associate_rdev(struct vnode *vp, dev_t dev) |
| 1533 | { |
| 1534 | lwkt_tokref ilock; |
| 1535 | |
| 1536 | if (dev == NULL || dev == NODEV) |
| 1537 | return(ENXIO); |
| 1538 | if (dev_is_good(dev) == 0) |
| 1539 | return(ENXIO); |
| 1540 | KKASSERT(vp->v_rdev == NULL); |
| 1541 | if (dev_ref_debug) |
| 1542 | printf("Z1"); |
| 1543 | vp->v_rdev = reference_dev(dev); |
| 1544 | lwkt_gettoken(&ilock, &spechash_token); |
| 1545 | SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext); |
| 1546 | lwkt_reltoken(&ilock); |
| 1547 | return(0); |
| 1548 | } |
| 1549 | |
| 1550 | void |
| 1551 | v_release_rdev(struct vnode *vp) |
| 1552 | { |
| 1553 | lwkt_tokref ilock; |
| 1554 | dev_t dev; |
| 1555 | |
| 1556 | if ((dev = vp->v_rdev) != NULL) { |
| 1557 | lwkt_gettoken(&ilock, &spechash_token); |
| 1558 | SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext); |
| 1559 | if (dev_ref_debug && vp->v_opencount != 0) { |
| 1560 | printf("releasing rdev with non-0 " |
| 1561 | "v_opencount(%d) (revoked?)\n", |
| 1562 | vp->v_opencount); |
| 1563 | } |
| 1564 | vp->v_rdev = NULL; |
| 1565 | vp->v_opencount = 0; |
| 1566 | release_dev(dev); |
| 1567 | lwkt_reltoken(&ilock); |
| 1568 | } |
| 1569 | } |
| 1570 | |
| 1571 | /* |
| 1572 | * Add a vnode to the alias list hung off the dev_t. We only associate |
| 1573 | * the device number with the vnode. The actual device is not associated |
| 1574 | * until the vnode is opened (usually in spec_open()), and will be |
| 1575 | * disassociated on last close. |
| 1576 | */ |
| 1577 | void |
| 1578 | addaliasu(struct vnode *nvp, udev_t nvp_udev) |
| 1579 | { |
| 1580 | if (nvp->v_type != VBLK && nvp->v_type != VCHR) |
| 1581 | panic("addaliasu on non-special vnode"); |
| 1582 | nvp->v_udev = nvp_udev; |
| 1583 | } |
| 1584 | |
| 1585 | /* |
| 1586 | * Grab a particular vnode from the free list, increment its |
| 1587 | * reference count and lock it. The vnode lock bit is set if the |
| 1588 | * vnode is being eliminated in vgone. The process is awakened |
| 1589 | * when the transition is completed, and an error returned to |
| 1590 | * indicate that the vnode is no longer usable (possibly having |
| 1591 | * been changed to a new file system type). |
| 1592 | * |
| 1593 | * This code is very sensitive. We are depending on the vnode interlock |
| 1594 | * to be maintained through to the vn_lock() call, which means that we |
 * cannot block, which means that we cannot call vbusy() until after vn_lock().
| 1596 | * If the interlock is not maintained, the VXLOCK check will not properly |
| 1597 | * interlock against a vclean()'s LK_DRAIN operation on the lock. |
| 1598 | */ |
| 1599 | int |
| 1600 | vget(struct vnode *vp, lwkt_tokref_t vlock, int flags, thread_t td) |
| 1601 | { |
| 1602 | int error; |
| 1603 | lwkt_tokref vvlock; |
| 1604 | |
| 1605 | /* |
| 1606 | * We need the interlock to safely modify the v_ fields. ZZZ it is |
| 1607 | * only legal to pass (1) the vnode's interlock and (2) only pass |
| 1608 | * NULL w/o LK_INTERLOCK if the vnode is *ALREADY* referenced or |
| 1609 | * held. |
| 1610 | */ |
| 1611 | if ((flags & LK_INTERLOCK) == 0) { |
| 1612 | lwkt_gettoken(&vvlock, vp->v_interlock); |
| 1613 | vlock = &vvlock; |
| 1614 | } |
| 1615 | |
| 1616 | /* |
| 1617 | * If the vnode is in the process of being cleaned out for |
| 1618 | * another use, we wait for the cleaning to finish and then |
| 1619 | * return failure. Cleaning is determined by checking that |
| 1620 | * the VXLOCK flag is set. It is possible for the vnode to be |
| 1621 | * self-referenced during the cleaning operation. |
| 1622 | */ |
| 1623 | if (vp->v_flag & VXLOCK) { |
| 1624 | if (vp->v_vxthread == curthread) { |
| 1625 | #if 0 |
| 1626 | /* this can now occur in normal operation */ |
| 1627 | log(LOG_INFO, "VXLOCK interlock avoided\n"); |
| 1628 | #endif |
| 1629 | } else { |
| 1630 | vp->v_flag |= VXWANT; |
| 1631 | lwkt_reltoken(vlock); |
| 1632 | tsleep((caddr_t)vp, 0, "vget", 0); |
| 1633 | return (ENOENT); |
| 1634 | } |
| 1635 | } |
| 1636 | |
| 1637 | /* |
| 1638 | * Bump v_usecount to prevent the vnode from being recycled. The |
| 1639 | * usecount needs to be bumped before we successfully get our lock. |
| 1640 | */ |
| 1641 | vp->v_usecount++; |
| 1642 | if (flags & LK_TYPE_MASK) { |
| 1643 | if ((error = vn_lock(vp, vlock, flags | LK_INTERLOCK, td)) != 0) { |
| 1644 | /* |
| 1645 | * must expand vrele here because we do not want |
| 1646 | * to call VOP_INACTIVE if the reference count |
| 1647 | * drops back to zero since it was never really |
| 1648 | * active. We must remove it from the free list |
| 1649 | * before sleeping so that multiple processes do |
| 1650 | * not try to recycle it. |
| 1651 | */ |
| 1652 | lwkt_gettokref(vlock); |
| 1653 | vp->v_usecount--; |
| 1654 | vmaybefree(vp); |
| 1655 | lwkt_reltoken(vlock); |
| 1656 | } |
| 1657 | return (error); |
| 1658 | } |
| 1659 | if (VSHOULDBUSY(vp)) |
| 1660 | vbusy(vp); /* interlock must be held on call */ |
| 1661 | lwkt_reltoken(vlock); |
| 1662 | return (0); |
| 1663 | } |
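| |
| | /*
| | * Illustrative usage sketch (not part of the original source): a caller
| | * that already holds a reference or the interlock typically pairs
| | * vget() with vput():
| | *
| | *	if (vget(vp, NULL, LK_EXCLUSIVE, curthread) == 0) {
| | *		...operate on the referenced, locked vnode...
| | *		vput(vp);
| | *	}
| | *
| | * A non-zero return (e.g. ENOENT) generally means the vnode was
| | * reclaimed for another use and must not be touched.
| | */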
| 1664 | |
| 1665 | void |
| 1666 | vref(struct vnode *vp) |
| 1667 | { |
| 1668 | crit_enter(); /* YYY use crit section for moment / BGL protected */ |
| 1669 | vp->v_usecount++; |
| 1670 | crit_exit(); |
| 1671 | } |
| 1672 | |
| 1673 | /* |
| 1674 | * Vnode put/release. |
| 1675 | * If count drops to zero, call inactive routine and return to freelist. |
| 1676 | */ |
| 1677 | void |
| 1678 | vrele(struct vnode *vp) |
| 1679 | { |
| 1680 | struct thread *td = curthread; /* XXX */ |
| 1681 | lwkt_tokref vlock; |
| 1682 | |
| 1683 | KASSERT(vp != NULL && vp->v_usecount >= 0,
| 1684 | ("vrele: null vp or negative v_usecount"));
| 1685 | |
| 1686 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 1687 | |
| 1688 | if (vp->v_usecount > 1) { |
| 1689 | vp->v_usecount--; |
| 1690 | lwkt_reltoken(&vlock); |
| 1691 | return; |
| 1692 | } |
| 1693 | |
| 1694 | if (vp->v_usecount == 1) { |
| 1695 | vp->v_usecount--; |
| 1696 | /* |
| 1697 | * We must call VOP_INACTIVE with the node locked and the |
| 1698 | * usecount 0. If we are doing a vput, the node is already
| 1699 | * locked, but in the case of vrele we must explicitly lock
| 1700 | * the vnode before calling VOP_INACTIVE. |
| 1701 | */ |
| 1702 | |
| 1703 | if (vn_lock(vp, NULL, LK_EXCLUSIVE, td) == 0) |
| 1704 | VOP_INACTIVE(vp, td); |
| 1705 | vmaybefree(vp); |
| 1706 | lwkt_reltoken(&vlock); |
| 1707 | } else { |
| 1708 | #ifdef DIAGNOSTIC |
| 1709 | vprint("vrele: negative ref count", vp); |
| 1710 | #endif |
| 1711 | lwkt_reltoken(&vlock); |
| 1712 | panic("vrele: negative ref cnt"); |
| 1713 | } |
| 1714 | } |
| 1715 | |
| 1716 | void |
| 1717 | vput(struct vnode *vp) |
| 1718 | { |
| 1719 | struct thread *td = curthread; /* XXX */ |
| 1720 | lwkt_tokref vlock; |
| 1721 | |
| 1722 | KASSERT(vp != NULL, ("vput: null vp")); |
| 1723 | |
| 1724 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 1725 | |
| 1726 | if (vp->v_usecount > 1) { |
| 1727 | vp->v_usecount--; |
| 1728 | VOP_UNLOCK(vp, &vlock, LK_INTERLOCK, td); |
| 1729 | return; |
| 1730 | } |
| 1731 | |
| 1732 | if (vp->v_usecount == 1) { |
| 1733 | vp->v_usecount--; |
| 1734 | /* |
| 1735 | * We must call VOP_INACTIVE with the node locked. |
| 1736 | * If we are doing a vput, the node is already locked,
| 1737 | * so we just need to release the vnode interlock.
| 1738 | */ |
| 1739 | VOP_INACTIVE(vp, td); |
| 1740 | vmaybefree(vp); |
| 1741 | lwkt_reltoken(&vlock); |
| 1742 | } else { |
| 1743 | #ifdef DIAGNOSTIC |
| 1744 | vprint("vput: negative ref count", vp); |
| 1745 | #endif |
| 1746 | lwkt_reltoken(&vlock); |
| 1747 | panic("vput: negative ref cnt"); |
| 1748 | } |
| 1749 | } |
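| |
| | /*
| | * Illustrative contrast (sketch, not original text): vput() is for a
| | * caller holding both a reference and the vnode lock, and drops them
| | * together; an unlocked reference is dropped with plain vrele():
| | *
| | *	vn_lock(vp, NULL, LK_EXCLUSIVE, td);
| | *	...modify the vnode...
| | *	vput(vp);
| | */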
| 1750 | |
| 1751 | /* |
| 1752 | * Somebody doesn't want the vnode recycled. ZZZ vnode interlock should |
| 1753 | * be held but isn't. |
| 1754 | */ |
| 1755 | void |
| 1756 | vhold(struct vnode *vp) |
| 1757 | { |
| 1758 | int s; |
| 1759 | |
| 1760 | s = splbio(); |
| 1761 | vp->v_holdcnt++; |
| 1762 | if (VSHOULDBUSY(vp)) |
| 1763 | vbusy(vp); /* interlock must be held on call */ |
| 1764 | splx(s); |
| 1765 | } |
| 1766 | |
| 1767 | /* |
| 1768 | * One less who cares about this vnode. |
| 1769 | */ |
| 1770 | void |
| 1771 | vdrop(struct vnode *vp) |
| 1772 | { |
| 1773 | lwkt_tokref vlock; |
| 1774 | |
| 1775 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 1776 | if (vp->v_holdcnt <= 0) |
| 1777 | panic("vdrop: holdcnt"); |
| 1778 | vp->v_holdcnt--; |
| 1779 | vmaybefree(vp); |
| 1780 | lwkt_reltoken(&vlock); |
| 1781 | } |
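| |
| | /*
| | * Illustrative sketch (assumption, not original text): the hold count
| | * keeps a vnode off the free list without granting a full reference,
| | * e.g. while a buffer remains associated with it:
| | *
| | *	vhold(vp);		vp cannot be recycled from here...
| | *	...attach a buffer to vp...
| | *	vdrop(vp);		...until the hold is dropped here
| | */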
| 1782 | |
| 1783 | int |
| 1784 | vmntvnodescan( |
| 1785 | struct mount *mp, |
| 1786 | int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data), |
| 1787 | int (*slowfunc)(struct mount *mp, struct vnode *vp, |
| 1788 | lwkt_tokref_t vlock, void *data), |
| 1789 | void *data |
| 1790 | ) { |
| 1791 | lwkt_tokref ilock; |
| 1792 | lwkt_tokref vlock; |
| 1793 | struct vnode *pvp; |
| 1794 | struct vnode *vp; |
| 1795 | int r = 0; |
| 1796 | |
| 1797 | /* |
| 1798 | * Scan the vnodes on the mount's vnode list, using a placemarker to hold our place since the callbacks may block.
| 1799 | */ |
| 1800 | pvp = zalloc(vnode_zone); |
| 1801 | pvp->v_flag |= VPLACEMARKER; |
| 1802 | |
| 1803 | lwkt_gettoken(&ilock, &mntvnode_token); |
| 1804 | TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, pvp, v_nmntvnodes); |
| 1805 | |
| 1806 | while ((vp = TAILQ_NEXT(pvp, v_nmntvnodes)) != NULL) { |
| 1807 | /* |
| 1808 | * Move the placemarker and skip other placemarkers we |
| 1809 | * encounter. Nothing can get in our way, so the
| 1810 | * mount point on the vp must be valid. |
| 1811 | */ |
| 1812 | TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes); |
| 1813 | TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, pvp, v_nmntvnodes); |
| 1814 | if (vp->v_flag & VPLACEMARKER) |
| 1815 | continue; |
| 1816 | KKASSERT(vp->v_mount == mp); |
| 1817 | |
| 1818 | /* |
| 1819 | * Quick test |
| 1820 | */ |
| 1821 | if (fastfunc) { |
| 1822 | if ((r = fastfunc(mp, vp, data)) < 0) |
| 1823 | continue; |
| 1824 | if (r) |
| 1825 | break; |
| 1826 | } |
| 1827 | |
| 1828 | /* |
| 1829 | * Get the vnode's interlock and make sure it is still on the
| 1830 | * mount list. Skip it if it has moved (we may encounter it |
| 1831 | * later). Then do the with-interlock test. The callback |
| 1832 | * is responsible for releasing the vnode interlock. |
| 1833 | * |
| 1834 | * The interlock is type-stable. |
| 1835 | */ |
| 1836 | if (slowfunc) { |
| 1837 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 1838 | if (vp != TAILQ_PREV(pvp, vnodelst, v_nmntvnodes)) { |
| 1839 | printf("vmntvnodescan (debug info only): f=%p vp=%p vnode ripped out from under us\n", slowfunc, vp); |
| 1840 | lwkt_reltoken(&vlock); |
| 1841 | continue; |
| 1842 | } |
| 1843 | if ((r = slowfunc(mp, vp, &vlock, data)) != 0) { |
| 1844 | KKASSERT(lwkt_havetokref(&vlock) == 0); |
| 1845 | break; |
| 1846 | } |
| 1847 | KKASSERT(lwkt_havetokref(&vlock) == 0); |
| 1848 | } |
| 1849 | } |
| 1850 | TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes); |
| 1851 | zfree(vnode_zone, pvp); |
| 1852 | lwkt_reltoken(&ilock); |
| 1853 | return(r); |
| 1854 | } |
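| |
| | /*
| | * Illustrative consumer sketch (hypothetical helper, not in the original
| | * source). The fast function is called without the vnode interlock; a
| | * negative return skips the vnode and a positive return ends the scan.
| | * The slow function is handed the interlock and must release it (or
| | * hand it off, e.g. to vget()); a non-zero return ends the scan:
| | *
| | *	static int
| | *	count_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock,
| | *	    void *data)
| | *	{
| | *		++*(int *)data;
| | *		lwkt_reltoken(vlock);
| | *		return(0);
| | *	}
| | *
| | *	int count = 0;
| | *	vmntvnodescan(mp, NULL, count_scan, &count);
| | */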
| 1855 | |
| 1856 | /* |
| 1857 | * Remove any vnodes in the vnode table belonging to mount point mp. |
| 1858 | * |
| 1859 | * If FORCECLOSE is not specified, there should not be any active ones, |
| 1860 | * return error if any are found (nb: this is a user error, not a |
| 1861 | * system error). If FORCECLOSE is specified, detach any active vnodes |
| 1862 | * that are found. |
| 1863 | * |
| 1864 | * If WRITECLOSE is set, only flush out regular file vnodes open for |
| 1865 | * writing. |
| 1866 | * |
| 1867 | * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. |
| 1868 | * |
| 1869 | * `rootrefs' specifies the base reference count for the root vnode |
| 1870 | * of this filesystem. The root vnode is considered busy if its |
| 1871 | * v_usecount exceeds this value. On a successful return, vflush() |
| 1872 | * will call vrele() on the root vnode exactly rootrefs times. |
| 1873 | * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must |
| 1874 | * be zero. |
| 1875 | */ |
| 1876 | #ifdef DIAGNOSTIC |
| 1877 | static int busyprt = 0; /* print out busy vnodes */ |
| 1878 | SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); |
| 1879 | #endif |
| 1880 | |
| 1881 | static int vflush_scan(struct mount *mp, struct vnode *vp, |
| 1882 | lwkt_tokref_t vlock, void *data); |
| 1883 | |
| 1884 | struct vflush_info { |
| 1885 | int flags; |
| 1886 | int busy; |
| 1887 | thread_t td; |
| 1888 | }; |
| 1889 | |
| 1890 | int |
| 1891 | vflush(struct mount *mp, int rootrefs, int flags) |
| 1892 | { |
| 1893 | struct thread *td = curthread; /* XXX */ |
| 1894 | struct vnode *rootvp = NULL; |
| 1895 | int error; |
| 1896 | lwkt_tokref vlock; |
| 1897 | struct vflush_info vflush_info; |
| 1898 | |
| 1899 | if (rootrefs > 0) { |
| 1900 | KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, |
| 1901 | ("vflush: bad args")); |
| 1902 | /* |
| 1903 | * Get the filesystem root vnode. We can vput() it |
| 1904 | * immediately, since with rootrefs > 0, it won't go away. |
| 1905 | */ |
| 1906 | if ((error = VFS_ROOT(mp, &rootvp)) != 0) |
| 1907 | return (error); |
| 1908 | vput(rootvp); |
| 1909 | } |
| 1910 | |
| 1911 | vflush_info.busy = 0; |
| 1912 | vflush_info.flags = flags; |
| 1913 | vflush_info.td = td; |
| 1914 | vmntvnodescan(mp, NULL, vflush_scan, &vflush_info); |
| 1915 | |
| 1916 | if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { |
| 1917 | /* |
| 1918 | * If just the root vnode is busy, and if its refcount |
| 1919 | * is equal to `rootrefs', then go ahead and kill it. |
| 1920 | */ |
| 1921 | lwkt_gettoken(&vlock, rootvp->v_interlock); |
| 1922 | KASSERT(vflush_info.busy > 0, ("vflush: not busy")); |
| 1923 | KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); |
| 1924 | if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) { |
| 1925 | vgonel(rootvp, &vlock, td); |
| 1926 | vflush_info.busy = 0; |
| 1927 | } else { |
| 1928 | lwkt_reltoken(&vlock); |
| 1929 | } |
| 1930 | } |
| 1931 | if (vflush_info.busy) |
| 1932 | return (EBUSY); |
| 1933 | for (; rootrefs > 0; rootrefs--) |
| 1934 | vrele(rootvp); |
| 1935 | return (0); |
| 1936 | } |
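| |
| | /*
| | * Illustrative call (sketch, not original text): an unmount path that
| | * holds one reference on the root vnode might flush everything with:
| | *
| | *	error = vflush(mp, 1, FORCECLOSE);
| | */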
| 1937 | |
| 1938 | /* |
| 1939 | * The scan callback is made with an interlocked vnode. |
| 1940 | */ |
| 1941 | static int |
| 1942 | vflush_scan(struct mount *mp, struct vnode *vp, |
| 1943 | lwkt_tokref_t vlock, void *data) |
| 1944 | { |
| 1945 | struct vflush_info *info = data; |
| 1946 | struct vattr vattr; |
| 1947 | |
| 1948 | /* |
| 1949 | * Skip over vnodes marked VSYSTEM.
| 1950 | */ |
| 1951 | if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { |
| 1952 | lwkt_reltoken(vlock); |
| 1953 | return(0); |
| 1954 | } |
| 1955 | |
| 1956 | /* |
| 1957 | * If WRITECLOSE is set, flush out unlinked but still open |
| 1958 | * files (even if open only for reading) and regular file |
| 1959 | * vnodes open for writing. |
| 1960 | */ |
| 1961 | if ((info->flags & WRITECLOSE) && |
| 1962 | (vp->v_type == VNON || |
| 1963 | (VOP_GETATTR(vp, &vattr, info->td) == 0 && |
| 1964 | vattr.va_nlink > 0)) && |
| 1965 | (vp->v_writecount == 0 || vp->v_type != VREG)) { |
| 1966 | lwkt_reltoken(vlock); |
| 1967 | return(0); |
| 1968 | } |
| 1969 | |
| 1970 | /* |
| 1971 | * With v_usecount == 0, all we need to do is clear out the |
| 1972 | * vnode data structures and we are done. |
| 1973 | */ |
| 1974 | if (vp->v_usecount == 0) { |
| 1975 | vgonel(vp, vlock, info->td); |
| 1976 | return(0); |
| 1977 | } |
| 1978 | |
| 1979 | /* |
| 1980 | * If FORCECLOSE is set, forcibly close the vnode. For block |
| 1981 | * or character devices, revert to an anonymous device. For |
| 1982 | * all other files, just kill them. |
| 1983 | */ |
| 1984 | if (info->flags & FORCECLOSE) { |
| 1985 | if (vp->v_type != VBLK && vp->v_type != VCHR) { |
| 1986 | vgonel(vp, vlock, info->td); |
| 1987 | } else { |
| 1988 | vclean(vp, vlock, 0, info->td); |
| 1989 | vp->v_op = spec_vnodeop_p; |
| 1990 | insmntque(vp, (struct mount *) 0); |
| 1991 | } |
| 1992 | return(0); |
| 1993 | } |
| 1994 | #ifdef DIAGNOSTIC |
| 1995 | if (busyprt) |
| 1996 | vprint("vflush: busy vnode", vp); |
| 1997 | #endif |
| 1998 | lwkt_reltoken(vlock); |
| 1999 | ++info->busy; |
| 2000 | return(0); |
| 2001 | } |
| 2002 | |
| 2003 | /* |
| 2004 | * Disassociate the underlying file system from a vnode. |
| 2005 | */ |
| 2006 | static void |
| 2007 | vclean(struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td) |
| 2008 | { |
| 2009 | int active; |
| 2010 | |
| 2011 | /* |
| 2012 | * Check to see if the vnode is in use. If so we have to reference it |
| 2013 | * before we clean it out so that its count cannot fall to zero and |
| 2014 | * generate a race against ourselves to recycle it. |
| 2015 | */ |
| 2016 | if ((active = vp->v_usecount)) |
| 2017 | vp->v_usecount++; |
| 2018 | |
| 2019 | /* |
| 2020 | * Prevent the vnode from being recycled or brought into use while we |
| 2021 | * clean it out. |
| 2022 | */ |
| 2023 | if (vp->v_flag & VXLOCK) |
| 2024 | panic("vclean: deadlock"); |
| 2025 | vp->v_flag |= VXLOCK; |
| 2026 | vp->v_vxthread = curthread; |
| 2027 | |
| 2028 | /* |
| 2029 | * Even if the count is zero, the VOP_INACTIVE routine may still |
| 2030 | * have the object locked while it cleans it out. The VOP_LOCK |
| 2031 | * ensures that the VOP_INACTIVE routine is done with its work. |
| 2032 | * For active vnodes, it ensures that no other activity can |
| 2033 | * occur while the underlying object is being cleaned out. |
| 2034 | * |
| 2035 | * NOTE: we continue to hold the vnode interlock through to the |
| 2036 | * end of vclean(). |
| 2037 | */ |
| 2038 | VOP_LOCK(vp, NULL, LK_DRAIN, td); |
| 2039 | |
| 2040 | /* |
| 2041 | * Clean out any buffers associated with the vnode. |
| 2042 | */ |
| 2043 | vinvalbuf(vp, V_SAVE, td, 0, 0); |
| 2044 | VOP_DESTROYVOBJECT(vp); |
| 2045 | |
| 2046 | /* |
| 2047 | * If purging an active vnode, it must be closed and |
| 2048 | * deactivated before being reclaimed. Note that the |
| 2049 | * VOP_INACTIVE will unlock the vnode. |
| 2050 | */ |
| 2051 | if (active) { |
| 2052 | if (flags & DOCLOSE) |
| 2053 | VOP_CLOSE(vp, FNONBLOCK, td); |
| 2054 | VOP_INACTIVE(vp, td); |
| 2055 | } else { |
| 2056 | /* |
| 2057 | * Any other processes trying to obtain this lock must first |
| 2058 | * wait for VXLOCK to clear, then call the new lock operation. |
| 2059 | */ |
| 2060 | VOP_UNLOCK(vp, NULL, 0, td); |
| 2061 | } |
| 2062 | /* |
| 2063 | * Reclaim the vnode. |
| 2064 | */ |
| 2065 | if (VOP_RECLAIM(vp, td)) |
| 2066 | panic("vclean: cannot reclaim"); |
| 2067 | |
| 2068 | if (active) { |
| 2069 | /* |
| 2070 | * Inline copy of vrele() since VOP_INACTIVE |
| 2071 | * has already been called. |
| 2072 | */ |
| 2073 | if (--vp->v_usecount <= 0) { |
| 2074 | #ifdef DIAGNOSTIC |
| 2075 | if (vp->v_usecount < 0 || vp->v_writecount != 0) { |
| 2076 | vprint("vclean: bad ref count", vp); |
| 2077 | panic("vclean: ref cnt"); |
| 2078 | } |
| 2079 | #endif |
| 2080 | vfree(vp); |
| 2081 | } |
| 2082 | } |
| 2083 | |
| 2084 | cache_purge(vp); |
| 2085 | vp->v_vnlock = NULL; |
| 2086 | vmaybefree(vp); |
| 2087 | |
| 2088 | /* |
| 2089 | * Done with purge, notify sleepers of the grim news. |
| 2090 | */ |
| 2091 | vp->v_op = dead_vnodeop_p; |
| 2092 | vn_pollgone(vp); |
| 2093 | vp->v_tag = VT_NON; |
| 2094 | vp->v_flag &= ~VXLOCK; |
| 2095 | vp->v_vxthread = NULL; |
| 2096 | if (vp->v_flag & VXWANT) { |
| 2097 | vp->v_flag &= ~VXWANT; |
| 2098 | wakeup((caddr_t) vp); |
| 2099 | } |
| 2100 | lwkt_reltoken(vlock); |
| 2101 | } |
| 2102 | |
| 2103 | /* |
| 2104 | * Eliminate all activity associated with the requested vnode |
| 2105 | * and with all vnodes aliased to the requested vnode. |
| 2106 | * |
| 2107 | * revoke { struct vnode *a_vp, int a_flags } |
| 2108 | */ |
| 2109 | int |
| 2110 | vop_revoke(struct vop_revoke_args *ap) |
| 2111 | { |
| 2112 | struct vnode *vp, *vq; |
| 2113 | lwkt_tokref ilock; |
| 2114 | dev_t dev; |
| 2115 | |
| 2116 | KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); |
| 2117 | |
| 2118 | vp = ap->a_vp; |
| 2119 | /* |
| 2120 | * If a vgone (or vclean) is already in progress, |
| 2121 | * wait until it is done and return. |
| 2122 | */ |
| 2123 | if (vp->v_flag & VXLOCK) { |
| 2124 | vp->v_flag |= VXWANT; |
| 2125 | /*lwkt_reltoken(vlock); ZZZ */ |
| 2126 | tsleep((caddr_t)vp, 0, "vop_revokeall", 0); |
| 2127 | return (0); |
| 2128 | } |
| 2129 | |
| 2130 | /* |
| 2131 | * If the vnode has a device association, scrap all vnodes associated |
| 2132 | * with the device. Don't let the device disappear on us while we |
| 2133 | * are scrapping the vnodes. |
| 2134 | */ |
| 2135 | if (vp->v_type != VCHR && vp->v_type != VBLK) |
| 2136 | return(0); |
| 2137 | if ((dev = vp->v_rdev) == NULL) { |
| 2138 | if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV) |
| 2139 | return(0); |
| 2140 | } |
| 2141 | reference_dev(dev); |
| 2142 | for (;;) { |
| 2143 | lwkt_gettoken(&ilock, &spechash_token); |
| 2144 | vq = SLIST_FIRST(&dev->si_hlist); |
| 2145 | lwkt_reltoken(&ilock); |
| 2146 | if (vq == NULL) |
| 2147 | break; |
| 2148 | vgone(vq); |
| 2149 | } |
| 2150 | release_dev(dev); |
| 2151 | return (0); |
| 2152 | } |
| 2153 | |
| 2154 | /* |
| 2155 | * Recycle an unused vnode to the front of the free list. |
| 2156 | * Release the passed interlock if the vnode will be recycled. |
| 2157 | */ |
| 2158 | int |
| 2159 | vrecycle(struct vnode *vp, lwkt_tokref_t inter_lkp, struct thread *td) |
| 2160 | { |
| 2161 | lwkt_tokref vlock; |
| 2162 | |
| 2163 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 2164 | if (vp->v_usecount == 0) { |
| 2165 | if (inter_lkp) |
| 2166 | lwkt_reltoken(inter_lkp); |
| 2167 | vgonel(vp, &vlock, td); |
| 2168 | return (1); |
| 2169 | } |
| 2170 | lwkt_reltoken(&vlock); |
| 2171 | return (0); |
| 2172 | } |
| 2173 | |
| 2174 | /* |
| 2175 | * Eliminate all activity associated with a vnode |
| 2176 | * in preparation for reuse. |
| 2177 | */ |
| 2178 | void |
| 2179 | vgone(struct vnode *vp) |
| 2180 | { |
| 2181 | struct thread *td = curthread; /* XXX */ |
| 2182 | lwkt_tokref vlock; |
| 2183 | |
| 2184 | lwkt_gettoken(&vlock, vp->v_interlock); |
| 2185 | vgonel(vp, &vlock, td); |
| 2186 | } |
| 2187 | |
| 2188 | /* |
| 2189 | * vgone, with the vp interlock held. |
| 2190 | */ |
| 2191 | void |
| 2192 | vgonel(struct vnode *vp, lwkt_tokref_t vlock, struct thread *td) |
| 2193 | { |
| 2194 | lwkt_tokref ilock; |
| 2195 | int s; |
| 2196 | |
| 2197 | /* |
| 2198 | * If a vgone (or vclean) is already in progress, |
| 2199 | * wait until it is done and return. |
| 2200 | */ |
| 2201 | if (vp->v_flag & VXLOCK) { |
| 2202 | vp->v_flag |= VXWANT; |
| 2203 | lwkt_reltoken(vlock); |
| 2204 | tsleep((caddr_t)vp, 0, "vgone", 0); |
| 2205 | return; |
| 2206 | } |
| 2207 | |
| 2208 | /* |
| 2209 | * Clean out the filesystem specific data. |
| 2210 | */ |
| 2211 | vclean(vp, vlock, DOCLOSE, td); |
| 2212 | lwkt_gettokref(vlock); |
| 2213 | |
| 2214 | /* |
| 2215 | * Delete from old mount point vnode list, if on one. |
| 2216 | */ |
| 2217 | if (vp->v_mount != NULL) |
| 2218 | insmntque(vp, (struct mount *)0); |
| 2219 | |
| 2220 | /* |
| 2221 | * If special device, remove it from special device alias list |
| 2222 | * if it is on one. This should normally only occur if a vnode is |
| 2223 | * being revoked as the device should otherwise have been released |
| 2224 | * naturally. |
| 2225 | */ |
| 2226 | if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) { |
| 2227 | v_release_rdev(vp); |
| 2228 | } |
| 2229 | |
| 2230 | /* |
| 2231 | * If it is on the freelist and not already at the head, |
| 2232 | * move it to the head of the list. The test of the |
| 2233 | * VDOOMED flag and the reference count of zero is because |
| 2234 | * it will be removed from the free list by getnewvnode, |
| 2235 | * but will not have its reference count incremented until |
| 2236 | * after calling vgone. If the reference count were |
| 2237 | * incremented first, vgone would (incorrectly) try to |
| 2238 | * close the previous instance of the underlying object. |
| 2239 | */ |
| 2240 | if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { |
| 2241 | s = splbio(); |
| 2242 | lwkt_gettoken(&ilock, &vnode_free_list_token); |
| 2243 | if (vp->v_flag & VFREE) |
| 2244 | TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); |
| 2245 | else |
| 2246 | freevnodes++; |
| 2247 | vp->v_flag |= VFREE; |
| 2248 | TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); |
| 2249 | lwkt_reltoken(&ilock); |
| 2250 | splx(s); |
| 2251 | } |
| 2252 | vp->v_type = VBAD; |
| 2253 | lwkt_reltoken(vlock); |
| 2254 | } |
| 2255 | |
| 2256 | /* |
| 2257 | * Lookup a vnode by device number. |
| 2258 | */ |
| 2259 | int |
| 2260 | vfinddev(dev_t dev, enum vtype type, struct vnode **vpp) |
| 2261 | { |
| 2262 | lwkt_tokref ilock; |
| 2263 | struct vnode *vp; |
| 2264 | |
| 2265 | lwkt_gettoken(&ilock, &spechash_token); |
| 2266 | SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { |
| 2267 | if (type == vp->v_type) { |
| 2268 | *vpp = vp; |
| 2269 | lwkt_reltoken(&ilock); |
| 2270 | return (1); |
| 2271 | } |
| 2272 | } |
| 2273 | lwkt_reltoken(&ilock); |
| 2274 | return (0); |
| 2275 | } |
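| |
| | /*
| | * Illustrative usage (sketch): check whether some vnode is currently
| | * associated with a given character device:
| | *
| | *	struct vnode *vp;
| | *	if (vfinddev(dev, VCHR, &vp))
| | *		vprint("in use", vp);
| | */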
| 2276 | |
| 2277 | /* |
| 2278 | * Calculate the total number of references to a special device. This |
| 2279 | * routine may only be called for VBLK and VCHR vnodes since v_rdev is |
| 2280 | * an overloaded field. Since udev2dev can now return NODEV, we have |
| 2281 | * to check for a NULL v_rdev. |
| 2282 | */ |
| 2283 | int |
| 2284 | count_dev(dev_t dev) |
| 2285 | { |
| 2286 | lwkt_tokref ilock; |
| 2287 | struct vnode *vp; |
| 2288 | int count = 0; |
| 2289 | |
| 2290 | if (SLIST_FIRST(&dev->si_hlist)) { |
| 2291 | lwkt_gettoken(&ilock, &spechash_token); |
| 2292 | SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { |
| 2293 | count += vp->v_usecount; |
| 2294 | } |
| 2295 | lwkt_reltoken(&ilock); |
| 2296 | } |
| 2297 | return(count); |
| 2298 | } |
| 2299 | |
| 2300 | int |
| 2301 | count_udev(udev_t udev) |
| 2302 | { |
| 2303 | dev_t dev; |
| 2304 | |
| 2305 | if ((dev = udev2dev(udev, 0)) == NODEV) |
| 2306 | return(0); |
| 2307 | return(count_dev(dev)); |
| 2308 | } |
| 2309 | |
| 2310 | int |
| 2311 | vcount(struct vnode *vp) |
| 2312 | { |
| 2313 | if (vp->v_rdev == NULL) |
| 2314 | return(0); |
| 2315 | return(count_dev(vp->v_rdev)); |
| 2316 | } |
| 2317 | |
| 2318 | /* |
| 2319 | * Print out a description of a vnode. |
| 2320 | */ |
| 2321 | static char *typename[] = |
| 2322 | {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; |
| 2323 | |
| 2324 | void |
| 2325 | vprint(char *label, struct vnode *vp) |
| 2326 | { |
| 2327 | char buf[96]; |
| 2328 | |
| 2329 | if (label != NULL) |
| 2330 | printf("%s: %p: ", label, (void *)vp); |
| 2331 | else |
| 2332 | printf("%p: ", (void *)vp); |
| 2333 | printf("type %s, usecount %d, writecount %d, refcount %d,", |
| 2334 | typename[vp->v_type], vp->v_usecount, vp->v_writecount, |
| 2335 | vp->v_holdcnt); |
| 2336 | buf[0] = '\0'; |
| 2337 | if (vp->v_flag & VROOT) |
| 2338 | strcat(buf, "|VROOT"); |
| 2339 | if (vp->v_flag & VTEXT) |
| 2340 | strcat(buf, "|VTEXT"); |
| 2341 | if (vp->v_flag & VSYSTEM) |
| 2342 | strcat(buf, "|VSYSTEM"); |
| 2343 | if (vp->v_flag & VXLOCK) |
| 2344 | strcat(buf, "|VXLOCK"); |
| 2345 | if (vp->v_flag & VXWANT) |
| 2346 | strcat(buf, "|VXWANT"); |
| 2347 | if (vp->v_flag & VBWAIT) |
| 2348 | strcat(buf, "|VBWAIT"); |
| 2349 | if (vp->v_flag & VDOOMED) |
| 2350 | strcat(buf, "|VDOOMED"); |
| 2351 | if (vp->v_flag & VFREE) |
| 2352 | strcat(buf, "|VFREE"); |
| 2353 | if (vp->v_flag & VOBJBUF) |
| 2354 | strcat(buf, "|VOBJBUF"); |
| 2355 | if (buf[0] != '\0') |
| 2356 | printf(" flags (%s)", &buf[1]); |
| 2357 | if (vp->v_data == NULL) { |
| 2358 | printf("\n"); |
| 2359 | } else { |
| 2360 | printf("\n\t"); |
| 2361 | VOP_PRINT(vp); |
| 2362 | } |
| 2363 | } |
| 2364 | |
| 2365 | #ifdef DDB |
| 2366 | #include <ddb/ddb.h> |
| 2367 | /* |
| 2368 | * List all of the locked vnodes in the system. |
| 2369 | * Called when debugging the kernel. |
| 2370 | */ |
| 2371 | DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) |
| 2372 | { |
| 2373 | struct thread *td = curthread; /* XXX */ |
| 2374 | lwkt_tokref ilock; |
| 2375 | struct mount *mp, *nmp; |
| 2376 | struct vnode *vp; |
| 2377 | |
| 2378 | printf("Locked vnodes\n"); |
| 2379 | lwkt_gettoken(&ilock, &mountlist_token); |
| 2380 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
| 2381 | if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) { |
| 2382 | nmp = TAILQ_NEXT(mp, mnt_list); |
| 2383 | continue; |
| 2384 | } |
| 2385 | TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { |
| 2386 | if (VOP_ISLOCKED(vp, NULL)) |
| 2387 | vprint((char *)0, vp); |
| 2388 | } |
| 2389 | lwkt_gettokref(&ilock); |
| 2390 | nmp = TAILQ_NEXT(mp, mnt_list); |
| 2391 | vfs_unbusy(mp, td); |
| 2392 | } |
| 2393 | lwkt_reltoken(&ilock); |
| 2394 | } |
| 2395 | #endif |
| 2396 | |
| 2397 | /* |
| 2398 | * Top level filesystem related information gathering. |
| 2399 | */ |
| 2400 | static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS); |
| 2401 | |
| 2402 | static int |
| 2403 | vfs_sysctl(SYSCTL_HANDLER_ARGS) |
| 2404 | { |
| 2405 | int *name = (int *)arg1 - 1; /* XXX */ |
| 2406 | u_int namelen = arg2 + 1; /* XXX */ |
| 2407 | struct vfsconf *vfsp; |
| 2408 | |
| 2409 | #if 1 || defined(COMPAT_PRELITE2) |
| 2410 | /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ |
| 2411 | if (namelen == 1) |
| 2412 | return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); |
| 2413 | #endif |
| 2414 | |
| 2415 | #ifdef notyet |
| 2416 | /* all sysctl names at this level are at least name and field */ |
| 2417 | if (namelen < 2) |
| 2418 | return (ENOTDIR); /* overloaded */ |
| 2419 | if (name[0] != VFS_GENERIC) { |
| 2420 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) |
| 2421 | if (vfsp->vfc_typenum == name[0]) |
| 2422 | break; |
| 2423 | if (vfsp == NULL) |
| 2424 | return (EOPNOTSUPP); |
| 2425 | return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, |
| 2426 | oldp, oldlenp, newp, newlen, p)); |
| 2427 | } |
| 2428 | #endif |
| 2429 | switch (name[1]) { |
| 2430 | case VFS_MAXTYPENUM: |
| 2431 | if (namelen != 2) |
| 2432 | return (ENOTDIR); |
| 2433 | return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); |
| 2434 | case VFS_CONF: |
| 2435 | if (namelen != 3) |
| 2436 | return (ENOTDIR); /* overloaded */ |
| 2437 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) |
| 2438 | if (vfsp->vfc_typenum == name[2]) |
| 2439 | break; |
| 2440 | if (vfsp == NULL) |
| 2441 | return (EOPNOTSUPP); |
| 2442 | return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); |
| 2443 | } |
| 2444 | return (EOPNOTSUPP); |
| 2445 | } |
| 2446 | |
| 2447 | SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, |
| 2448 | "Generic filesystem"); |
| 2449 | |
| 2450 | #if 1 || defined(COMPAT_PRELITE2) |
| 2451 | |
| 2452 | static int |
| 2453 | sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) |
| 2454 | { |
| 2455 | int error; |
| 2456 | struct vfsconf *vfsp; |
| 2457 | struct ovfsconf ovfs; |
| 2458 | |
| 2459 | for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { |
| 2460 | ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ |
| 2461 | strcpy(ovfs.vfc_name, vfsp->vfc_name); |
| 2462 | ovfs.vfc_index = vfsp->vfc_typenum; |
| 2463 | ovfs.vfc_refcount = vfsp->vfc_refcount; |
| 2464 | ovfs.vfc_flags = vfsp->vfc_flags; |
| 2465 | error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); |
| 2466 | if (error) |
| 2467 | return error; |
| 2468 | } |
| 2469 | return 0; |
| 2470 | } |
| 2471 | |
| 2472 | #endif /* 1 || COMPAT_PRELITE2 */ |
| 2473 | |
| 2474 | #if 0 |
| 2475 | #define KINFO_VNODESLOP 10 |
| 2476 | /* |
| 2477 | * Dump vnode list (via sysctl). |
| 2478 | * Copyout address of vnode followed by vnode. |
| 2479 | */ |
| 2480 | /* ARGSUSED */ |
| 2481 | static int |
| 2482 | sysctl_vnode(SYSCTL_HANDLER_ARGS) |
| 2483 | { |
| 2484 | struct proc *p = curproc; /* XXX */ |
| 2485 | struct mount *mp, *nmp; |
| 2486 | struct vnode *nvp, *vp; |
| 2487 | lwkt_tokref ilock; |
| 2488 | lwkt_tokref jlock; |
| 2489 | int error; |
| 2490 | |
| 2491 | #define VPTRSZ sizeof (struct vnode *) |
| 2492 | #define VNODESZ sizeof (struct vnode) |
| 2493 | |
| 2494 | req->lock = 0; |
| 2495 | if (!req->oldptr) /* Make an estimate */ |
| 2496 | return (SYSCTL_OUT(req, 0, |
| 2497 | (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); |
| 2498 | |
| 2499 | lwkt_gettoken(&ilock, &mountlist_token); |
| 2500 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
| 2501 | if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) { |
| 2502 | nmp = TAILQ_NEXT(mp, mnt_list); |
| 2503 | continue; |
| 2504 | } |
| 2505 | lwkt_gettoken(&jlock, &mntvnode_token); |
| 2506 | again: |
| 2507 | for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); |
| 2508 | vp != NULL; |
| 2509 | vp = nvp) { |
| 2510 | /* |
| 2511 | * Check that the vp is still associated with |
| 2512 | * this filesystem. RACE: could have been |
| 2513 | * recycled onto the same filesystem. |
| 2514 | */ |
| 2515 | if (vp->v_mount != mp) |
| 2516 | goto again; |
| 2517 | nvp = TAILQ_NEXT(vp, v_nmntvnodes); |
| 2518 | if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || |
| 2519 | (error = SYSCTL_OUT(req, vp, VNODESZ))) { |
| 2520 | lwkt_reltoken(&jlock); |
| 2521 | return (error); |
| 2522 | } |
| 2523 | } |
| 2524 | lwkt_reltoken(&jlock); |
| 2525 | lwkt_gettokref(&ilock); |
| 2526 | nmp = TAILQ_NEXT(mp, mnt_list); /* ZZZ */ |
| 2527 | vfs_unbusy(mp, p); |
| 2528 | } |
| 2529 | lwkt_reltoken(&ilock); |
| 2530 | |
| 2531 | return (0); |
| 2532 | } |
| 2533 | #endif |
| 2534 | |
| 2535 | /* |
| 2536 | * XXX |
| 2537 | * Exporting the vnode list on large systems causes them to crash. |
| 2538 | * Exporting the vnode list on medium systems causes sysctl to coredump. |
| 2539 | */ |
| 2540 | #if 0 |
| 2541 | SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, |
| 2542 | 0, 0, sysctl_vnode, "S,vnode", ""); |
| 2543 | #endif |
| 2544 | |
| 2545 | /* |
| 2546 | * Check to see if a filesystem is mounted on a block device. |
| 2547 | */ |
| 2548 | int |
| 2549 | vfs_mountedon(struct vnode *vp) |
| 2550 | { |
| 2551 | dev_t dev; |
| 2552 | |
| 2553 | if ((dev = vp->v_rdev) == NULL) |
| 2554 | dev = udev2dev(vp->v_udev, (vp->v_type == VBLK)); |
| 2555 | if (dev != NODEV && dev->si_mountpoint) |
| 2556 | return (EBUSY); |
| 2557 | return (0); |
| 2558 | } |
| 2559 | |
| 2560 | /* |
| 2561 | * Unmount all filesystems. The list is traversed in reverse order |
| 2562 | * of mounting to avoid dependencies. |
| 2563 | */ |
| 2564 | void |
| 2565 | vfs_unmountall(void) |
| 2566 | { |
| 2567 | struct mount *mp; |
| 2568 | struct thread *td = curthread; |
| 2569 | int error; |
| 2570 | |
| 2571 | if (td->td_proc == NULL) |
| 2572 | td = initproc->p_thread; /* XXX XXX use proc0 instead? */ |
| 2573 | |
| 2574 | /* |
| 2575 | * Since this only runs when rebooting, it is not interlocked. |
| 2576 | */ |
| 2577 | while(!TAILQ_EMPTY(&mountlist)) { |
| 2578 | mp = TAILQ_LAST(&mountlist, mntlist); |
| 2579 | error = dounmount(mp, MNT_FORCE, td); |
| 2580 | if (error) { |
| 2581 | TAILQ_REMOVE(&mountlist, mp, mnt_list); |
| 2582 | printf("unmount of %s failed (", |
| 2583 | mp->mnt_stat.f_mntonname); |
| 2584 | if (error == EBUSY) |
| 2585 | printf("BUSY)\n"); |
| 2586 | else |
| 2587 | printf("%d)\n", error); |
| 2588 | } else { |
| 2589 | /* The unmount has removed mp from the mountlist */ |
| 2590 | } |
| 2591 | } |
| 2592 | } |
| 2593 | |
| 2594 | /* |
| 2595 | * Build hash lists of net addresses and hang them off the mount point. |
| 2596 | * Called by ufs_mount() to set up the lists of export addresses. |
| 2597 | */ |
| 2598 | static int |
| 2599 | vfs_hang_addrlist(struct mount *mp, struct netexport *nep, |
| 2600 | struct export_args *argp) |
| 2601 | { |
| 2602 | struct netcred *np; |
| 2603 | struct radix_node_head *rnh; |
| 2604 | int i; |
| 2605 | struct radix_node *rn; |
| 2606 | struct sockaddr *saddr, *smask = 0; |
| 2607 | struct domain *dom; |
| 2608 | int error; |
| 2609 | |
| 2610 | if (argp->ex_addrlen == 0) { |
| 2611 | if (mp->mnt_flag & MNT_DEFEXPORTED) |
| 2612 | return (EPERM); |
| 2613 | np = &nep->ne_defexported; |
| 2614 | np->netc_exflags = argp->ex_flags; |
| 2615 | np->netc_anon = argp->ex_anon; |
| 2616 | np->netc_anon.cr_ref = 1; |
| 2617 | mp->mnt_flag |= MNT_DEFEXPORTED; |
| 2618 | return (0); |
| 2619 | } |
| 2620 | |
| 2621 | if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN) |
| 2622 | return (EINVAL); |
| 2623 | if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN) |
| 2624 | return (EINVAL); |
| 2625 | |
| 2626 | i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; |
| 2627 | np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); |
| 2628 | bzero((caddr_t) np, i); |
| 2629 | saddr = (struct sockaddr *) (np + 1); |
| 2630 | if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) |
| 2631 | goto out; |
| 2632 | if (saddr->sa_len > argp->ex_addrlen) |
| 2633 | saddr->sa_len = argp->ex_addrlen; |
| 2634 | if (argp->ex_masklen) { |
| 2635 | smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); |
| 2636 | error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen); |
| 2637 | if (error) |
| 2638 | goto out; |
| 2639 | if (smask->sa_len > argp->ex_masklen) |
| 2640 | smask->sa_len = argp->ex_masklen; |
| 2641 | } |
| 2642 | i = saddr->sa_family; |
| 2643 | if ((rnh = nep->ne_rtable[i]) == 0) { |
| 2644 | /* |
| 2645 | * It seems silly to initialize every AF when most are not used,
| 2646 | * so do so on demand here.
| 2647 | */ |
| 2648 | for (dom = domains; dom; dom = dom->dom_next) |
| 2649 | if (dom->dom_family == i && dom->dom_rtattach) { |
| 2650 | dom->dom_rtattach((void **) &nep->ne_rtable[i], |
| 2651 | dom->dom_rtoffset); |
| 2652 | break; |
| 2653 | } |
| 2654 | if ((rnh = nep->ne_rtable[i]) == 0) { |
| 2655 | error = ENOBUFS; |
| 2656 | goto out; |
| 2657 | } |
| 2658 | } |
| 2659 | rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, |
| 2660 | np->netc_rnodes); |
| 2661 | if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ |
| 2662 | error = EPERM; |
| 2663 | goto out; |
| 2664 | } |
| 2665 | np->netc_exflags = argp->ex_flags; |
| 2666 | np->netc_anon = argp->ex_anon; |
| 2667 | np->netc_anon.cr_ref = 1; |
| 2668 | return (0); |
| 2669 | out: |
| 2670 | free(np, M_NETADDR); |
| 2671 | return (error); |
| 2672 | } |
| 2673 | |
| 2674 | /* ARGSUSED */ |
| 2675 | static int |
| 2676 | vfs_free_netcred(struct radix_node *rn, void *w) |
| 2677 | { |
| 2678 | struct radix_node_head *rnh = (struct radix_node_head *) w; |
| 2679 | |
| 2680 | (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); |
| 2681 | free((caddr_t) rn, M_NETADDR); |
| 2682 | return (0); |
| 2683 | } |
| 2684 | |
| 2685 | /* |
| 2686 | * Free the net address hash lists that are hanging off the mount points. |
| 2687 | */ |
| 2688 | static void |
| 2689 | vfs_free_addrlist(struct netexport *nep) |
| 2690 | { |
| 2691 | int i; |
| 2692 | struct radix_node_head *rnh; |
| 2693 | |
| 2694 | for (i = 0; i <= AF_MAX; i++) |
| 2695 | if ((rnh = nep->ne_rtable[i])) { |
| 2696 | (*rnh->rnh_walktree) (rnh, vfs_free_netcred, |
| 2697 | (caddr_t) rnh); |
| 2698 | free((caddr_t) rnh, M_RTABLE); |
| 2699 | nep->ne_rtable[i] = 0; |
| 2700 | } |
| 2701 | } |
| 2702 | |
| 2703 | int |
| 2704 | vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp) |
| 2705 | { |
| 2706 | int error; |
| 2707 | |
| 2708 | if (argp->ex_flags & MNT_DELEXPORT) { |
| 2709 | if (mp->mnt_flag & MNT_EXPUBLIC) { |
| 2710 | vfs_setpublicfs(NULL, NULL, NULL); |
| 2711 | mp->mnt_flag &= ~MNT_EXPUBLIC; |
| 2712 | } |
| 2713 | vfs_free_addrlist(nep); |
| 2714 | mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); |
| 2715 | } |
| 2716 | if (argp->ex_flags & MNT_EXPORTED) { |
| 2717 | if (argp->ex_flags & MNT_EXPUBLIC) { |
| 2718 | if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) |
| 2719 | return (error); |
| 2720 | mp->mnt_flag |= MNT_EXPUBLIC; |
| 2721 | } |
| 2722 | if ((error = vfs_hang_addrlist(mp, nep, argp))) |
| 2723 | return (error); |
| 2724 | mp->mnt_flag |= MNT_EXPORTED; |
| 2725 | } |
| 2726 | return (0); |
| 2727 | } |
| 2728 | |
| 2729 | |
| 2730 | /* |
| 2731 | * Set the publicly exported filesystem (WebNFS). Currently, only |
| 2732 | * one public filesystem is possible in the spec (RFC 2054 and 2055) |
| 2733 | */ |
| 2734 | int |
| 2735 | vfs_setpublicfs(struct mount *mp, struct netexport *nep, |
| 2736 | struct export_args *argp) |
| 2737 | { |
| 2738 | int error; |
| 2739 | struct vnode *rvp; |
| 2740 | char *cp; |
| 2741 | |
| 2742 | /* |
| 2743 | * mp == NULL -> invalidate the current info, the FS is |
| 2744 | * no longer exported. May be called from either vfs_export |
| 2745 | * or unmount, so check if it hasn't already been done. |
| 2746 | */ |
| 2747 | if (mp == NULL) { |
| 2748 | if (nfs_pub.np_valid) { |
| 2749 | nfs_pub.np_valid = 0; |
| 2750 | if (nfs_pub.np_index != NULL) { |
| 2751 | FREE(nfs_pub.np_index, M_TEMP); |
| 2752 | nfs_pub.np_index = NULL; |
| 2753 | } |
| 2754 | } |
| 2755 | return (0); |
| 2756 | } |
| 2757 | |
| 2758 | /* |
| 2759 | * Only one allowed at a time. |
| 2760 | */ |
| 2761 | if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) |
| 2762 | return (EBUSY); |
| 2763 | |
| 2764 | /* |
| 2765 | * Get real filehandle for root of exported FS. |
| 2766 | */ |
| 2767 | bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); |
| 2768 | nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; |
| 2769 | |
| 2770 | if ((error = VFS_ROOT(mp, &rvp))) |
| 2771 | return (error); |
| 2772 | |
| 2773 | if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) |
| 2774 | return (error); |
| 2775 | |
| 2776 | vput(rvp); |
| 2777 | |
| 2778 | /* |
| 2779 | * If an indexfile was specified, pull it in. |
| 2780 | */ |
| 2781 | if (argp->ex_indexfile != NULL) { |
| 2782 | MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, |
| 2783 | M_WAITOK); |
| 2784 | error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, |
| 2785 | MAXNAMLEN, (size_t *)0); |
| 2786 | if (!error) { |
| 2787 | /* |
| 2788 | * Check for illegal filenames. |
| 2789 | */ |
| 2790 | for (cp = nfs_pub.np_index; *cp; cp++) { |
| 2791 | if (*cp == '/') { |
| 2792 | error = EINVAL; |
| 2793 | break; |
| 2794 | } |
| 2795 | } |
| 2796 | } |
| 2797 | if (error) { |
| 2798 | FREE(nfs_pub.np_index, M_TEMP); |
| 2799 | return (error); |
| 2800 | } |
| 2801 | } |
| 2802 | |
| 2803 | nfs_pub.np_mount = mp; |
| 2804 | nfs_pub.np_valid = 1; |
| 2805 | return (0); |
| 2806 | } |
| 2807 | |
| 2808 | struct netcred * |
| 2809 | vfs_export_lookup(struct mount *mp, struct netexport *nep, |
| 2810 | struct sockaddr *nam) |
| 2811 | { |
| 2812 | struct netcred *np; |
| 2813 | struct radix_node_head *rnh; |
| 2814 | struct sockaddr *saddr; |
| 2815 | |
| 2816 | np = NULL; |
| 2817 | if (mp->mnt_flag & MNT_EXPORTED) { |
| 2818 | /* |
| 2819 | * Lookup in the export list first. |
| 2820 | */ |
| 2821 | if (nam != NULL) { |
| 2822 | saddr = nam; |
| 2823 | rnh = nep->ne_rtable[saddr->sa_family]; |
| 2824 | if (rnh != NULL) { |
| 2825 | np = (struct netcred *) |
| 2826 | (*rnh->rnh_matchaddr)((caddr_t)saddr, |
| 2827 | rnh); |
| 2828 | if (np && np->netc_rnodes->rn_flags & RNF_ROOT) |
| 2829 | np = NULL; |
| 2830 | } |
| 2831 | } |
| 2832 | /* |
| 2833 | * If no address match, use the default if it exists. |
| 2834 | */ |
| 2835 | if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) |
| 2836 | np = &nep->ne_defexported; |
| 2837 | } |
| 2838 | return (np); |
| 2839 | } |
| 2840 | |
| 2841 | /* |
| 2842 | * Perform msync on all vnodes under a mount point. The mount point must
| 2843 | * be locked. This code is also responsible for lazy-freeing unreferenced
| 2844 | * vnodes whose VM objects no longer contain pages.
| 2845 | * |
| 2846 | * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state. |
| 2847 | */ |
| 2848 | static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data); |
| 2849 | static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, |
| 2850 | lwkt_tokref_t vlock, void *data); |
| 2851 | |
| 2852 | void |
| 2853 | vfs_msync(struct mount *mp, int flags) |
| 2854 | { |
| 2855 | vmntvnodescan(mp, vfs_msync_scan1, vfs_msync_scan2, (void *)flags); |
| 2856 | } |
| 2857 | |
| 2858 | /* |
| 2859 | * scan1 is a fast pre-check. There could be hundreds of thousands of |
| 2860 | * vnodes, so we cannot afford to do anything heavyweight until we have a
| 2861 | * fairly good indication that there is work to do. |
| 2862 | */ |
| 2863 | static |
| 2864 | int |
| 2865 | vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data) |
| 2866 | { |
| 2867 | int flags = (int)data; |
| 2868 | |
| 2869 | if ((vp->v_flag & VXLOCK) == 0) { |
| 2870 | if (VSHOULDFREE(vp)) |
| 2871 | return(0); |
| 2872 | if ((mp->mnt_flag & MNT_RDONLY) == 0 && |
| 2873 | (vp->v_flag & VOBJDIRTY) && |
| 2874 | (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { |
| 2875 | return(0); |
| 2876 | } |
| 2877 | } |
| 2878 | return(-1); |
| 2879 | } |
| 2880 | |
| 2881 | static |
| 2882 | int |
| 2883 | vfs_msync_scan2(struct mount *mp, struct vnode *vp, |
| 2884 | lwkt_tokref_t vlock, void *data) |
| 2885 | { |
| 2886 | vm_object_t obj; |
| 2887 | int error; |
| 2888 | int flags = (int)data; |
| 2889 | |
| 2890 | if (vp->v_flag & VXLOCK) |
| 2891 | return(0); |
| 2892 | |
| 2893 | if ((mp->mnt_flag & MNT_RDONLY) == 0 && |
| 2894 | (vp->v_flag & VOBJDIRTY) && |
| 2895 | (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { |
| 2896 | error = vget(vp, vlock, LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ | LK_INTERLOCK, curthread); |
| 2897 | if (error == 0) { |
| 2898 | if (VOP_GETVOBJECT(vp, &obj) == 0) { |
| 2899 | vm_object_page_clean(obj, 0, 0, |
| 2900 | flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); |
| 2901 | } |
| 2902 | vput(vp); |
| 2903 | } |
| 2904 | return(0); |
| 2905 | } |
| 2906 | vmaybefree(vp); |
| 2907 | lwkt_reltoken(vlock); |
| 2908 | return(0); |
| 2909 | } |
| 2910 | |
| 2911 | /* |
| 2912 | * Create the VM object needed for VMIO and mmap support. This |
| 2913 | * is done for all VREG files in the system. Some filesystems might |
| 2914 | * afford the additional metadata buffering capability of the |
| 2915 | * VMIO code by making the device node be VMIO mode also. |
| 2916 | * |
| 2917 | * vp must be locked when vfs_object_create is called. |
| 2918 | */ |
| 2919 | int |
| 2920 | vfs_object_create(struct vnode *vp, struct thread *td) |
| 2921 | { |
| 2922 | return (VOP_CREATEVOBJECT(vp, td)); |
| 2923 | } |
| 2924 | |
| 2925 | /* |
| 2926 | * NOTE: the vnode interlock must be held during the call. We have to recheck |
| 2927 | * the VFREE flag since the vnode may have been removed from the free list |
| 2928 | * while we were blocked on vnode_free_list_token. The use or hold count |
| 2929 | * must have already been bumped by the caller. |
| 2930 | */ |
| 2931 | static void |
| 2932 | vbusy(struct vnode *vp) |
| 2933 | { |
| 2934 | lwkt_tokref ilock; |
| 2935 | |
| 2936 | lwkt_gettoken(&ilock, &vnode_free_list_token); |
| 2937 | if ((vp->v_flag & VFREE) != 0) { |
| 2938 | TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); |
| 2939 | freevnodes--; |
| 2940 | vp->v_flag &= ~(VFREE|VAGE); |
| 2941 | } |
| 2942 | lwkt_reltoken(&ilock); |
| 2943 | } |
| 2944 | |
| 2945 | /* |
| 2946 | * NOTE: the vnode interlock must be held during the call. The use or hold |
| 2947 | * count must have already been bumped by the caller. We use a VINFREE to |
| 2948 | * interlock against other calls to vfree() which might occur while we |
| 2949 | * are blocked. The vnode cannot be reused until it has actually been |
| 2950 | * placed on the free list, so there are no other races even though the |
| 2951 | * use and hold counts are 0. |
| 2952 | */ |
| 2953 | static void |
| 2954 | vfree(struct vnode *vp) |
| 2955 | { |
| 2956 | lwkt_tokref ilock; |
| 2957 | |
| 2958 | if ((vp->v_flag & VINFREE) == 0) { |
| 2959 | vp->v_flag |= VINFREE; |
| 2960 | lwkt_gettoken(&ilock, &vnode_free_list_token); /* can block */ |
| 2961 | KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); |
| 2962 | if (vp->v_flag & VAGE) { |
| 2963 | TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); |
| 2964 | } else { |
| 2965 | TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); |
| 2966 | } |
| 2967 | freevnodes++; |
| 2968 | vp->v_flag &= ~(VAGE|VINFREE); |
| 2969 | vp->v_flag |= VFREE; |
| 2970 | lwkt_reltoken(&ilock); /* can block */ |
| 2971 | } |
| 2972 | } |
| 2973 | |
| 2974 | |
| 2975 | /* |
| 2976 | * Record a process's interest in events which might happen to |
| 2977 | * a vnode. Because poll uses the historic select-style interface |
| 2978 | * internally, this routine serves as both the ``check for any |
| 2979 | * pending events'' and the ``record my interest in future events'' |
| 2980 | * functions. (These are done together, while the lock is held, |
| 2981 | * to avoid race conditions.) |
| 2982 | */ |
| 2983 | int |
| 2984 | vn_pollrecord(struct vnode *vp, struct thread *td, int events) |
| 2985 | { |
| 2986 | lwkt_tokref ilock; |
| 2987 | |
| 2988 | lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); |
| 2989 | if (vp->v_pollinfo.vpi_revents & events) { |
| 2990 | /* |
| 2991 | * This leaves events we are not interested |
| 2992 | * in available for the other process, which
| 2993 | * presumably had requested them
| 2994 | * (otherwise they would never have been |
| 2995 | * recorded). |
| 2996 | */ |
| 2997 | events &= vp->v_pollinfo.vpi_revents; |
| 2998 | vp->v_pollinfo.vpi_revents &= ~events; |
| 2999 | |
| 3000 | lwkt_reltoken(&ilock); |
| 3001 | return events; |
| 3002 | } |
| 3003 | vp->v_pollinfo.vpi_events |= events; |
| 3004 | selrecord(td, &vp->v_pollinfo.vpi_selinfo); |
| 3005 | lwkt_reltoken(&ilock); |
| 3006 | return 0; |
| 3007 | } |
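| |
| | /*
| | * Illustrative sketch (hypothetical routine, assuming the standard
| | * vop_poll_args layout): a filesystem's VOP_POLL implementation can
| | * typically reduce to a single call:
| | *
| | *	static int
| | *	xxx_poll(struct vop_poll_args *ap)
| | *	{
| | *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
| | *	}
| | */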
| 3008 | |
| 3009 | /* |
| 3010 | * Note the occurrence of an event. If the VN_POLLEVENT macro is used, |
| 3011 | * it is possible for us to miss an event due to race conditions, but |
| 3012 | * that condition is expected to be rare, so for the moment it is the |
| 3013 | * preferred interface. |
| 3014 | */ |
| 3015 | void |
| 3016 | vn_pollevent(struct vnode *vp, int events) |
| 3017 | { |
| 3018 | lwkt_tokref ilock; |
| 3019 | |
| 3020 | lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); |
| 3021 | if (vp->v_pollinfo.vpi_events & events) { |
| 3022 | /* |
| 3023 | * We clear vpi_events so that we don't |
| 3024 | * call selwakeup() twice if two events are |
| 3025 | * posted before the polling process(es) is |
| 3026 | * awakened. This also ensures that we take at |
| 3027 | * most one selwakeup() if the polling process |
| 3028 | * is no longer interested. However, it does |
| 3029 | * mean that only one event can be noticed at |
| 3030 | * a time. (Perhaps we should only clear those |
| 3031 | * event bits which we note?) XXX |
| 3032 | */ |
| 3033 | vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ |
| 3034 | vp->v_pollinfo.vpi_revents |= events; |
| 3035 | selwakeup(&vp->v_pollinfo.vpi_selinfo); |
| 3036 | } |
| 3037 | lwkt_reltoken(&ilock); |
| 3038 | } |
| 3039 | |
| 3040 | /* |
| 3041 | * Wake up anyone polling on vp because it is being revoked. |
| 3042 | * This depends on dead_poll() returning POLLHUP for correct |
| 3043 | * behavior. |
| 3044 | */ |
| 3045 | void |
| 3046 | vn_pollgone(struct vnode *vp) |
| 3047 | { |
| 3048 | lwkt_tokref ilock; |
| 3049 | |
| 3050 | lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); |
| 3051 | if (vp->v_pollinfo.vpi_events) { |
| 3052 | vp->v_pollinfo.vpi_events = 0; |
| 3053 | selwakeup(&vp->v_pollinfo.vpi_selinfo); |
| 3054 | } |
| 3055 | lwkt_reltoken(&ilock); |
| 3056 | } |
| 3057 | |
| 3058 | |
| 3059 | |
| 3060 | /* |
| 3061 | * Routine to create and manage a filesystem syncer vnode. |
| 3062 | */ |
| 3063 | #define sync_close ((int (*) (struct vop_close_args *))nullop) |
| 3064 | static int sync_fsync (struct vop_fsync_args *); |
| 3065 | static int sync_inactive (struct vop_inactive_args *); |
| 3066 | static int sync_reclaim (struct vop_reclaim_args *); |
| 3067 | #define sync_lock ((int (*) (struct vop_lock_args *))vop_nolock) |
| 3068 | #define sync_unlock ((int (*) (struct vop_unlock_args *))vop_nounlock) |
| 3069 | static int sync_print (struct vop_print_args *); |
| 3070 | #define sync_islocked ((int(*) (struct vop_islocked_args *))vop_noislocked) |
| 3071 | |
| 3072 | static vop_t **sync_vnodeop_p; |
| 3073 | static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { |
| 3074 | { &vop_default_desc, (vop_t *) vop_eopnotsupp }, |
| 3075 | { &vop_close_desc, (vop_t *) sync_close }, /* close */ |
| 3076 | { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ |
| 3077 | { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ |
| 3078 | { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ |
| 3079 | { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ |
| 3080 | { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ |
| 3081 | { &vop_print_desc, (vop_t *) sync_print }, /* print */ |
| 3082 | { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ |
| 3083 | { NULL, NULL } |
| 3084 | }; |
| 3085 | static struct vnodeopv_desc sync_vnodeop_opv_desc = |
| 3086 | { &sync_vnodeop_p, sync_vnodeop_entries }; |
| 3087 | |
| 3088 | VNODEOP_SET(sync_vnodeop_opv_desc); |
| 3089 | |
| 3090 | /* |
| 3091 | * Create a new filesystem syncer vnode for the specified mount point. |
| 3092 | * This vnode is placed on the worklist and is responsible for sync'ing |
| 3093 | * the filesystem. |
| 3094 | * |
| 3095 | * NOTE: read-only mounts are also placed on the worklist. The filesystem |
| 3096 | * sync code is also responsible for cleaning up vnodes. |
| 3097 | */ |
| 3098 | int |
| 3099 | vfs_allocate_syncvnode(struct mount *mp) |
| 3100 | { |
| 3101 | struct vnode *vp; |
| 3102 | static long start, incr, next; |
| 3103 | int error; |
| 3104 | |
| 3105 | /* Allocate a new vnode */ |
| 3106 | if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { |
| 3107 | mp->mnt_syncer = NULL; |
| 3108 | return (error); |
| 3109 | } |
| 3110 | vp->v_type = VNON; |
| 3111 | /* |
| 3112 | * Place the vnode onto the syncer worklist. We attempt to |
| 3113 | * scatter them about on the list so that they will go off |
| 3114 | * at evenly distributed times even if all the filesystems |
| 3115 | * are mounted at once. |
| 3116 | */ |
| 3117 | next += incr; |
| 3118 | if (next == 0 || next > syncer_maxdelay) { |
| 3119 | start /= 2; |
| 3120 | incr /= 2; |
| 3121 | if (start == 0) { |
| 3122 | start = syncer_maxdelay / 2; |
| 3123 | incr = syncer_maxdelay; |
| 3124 | } |
| 3125 | next = start; |
| 3126 | } |
| 3127 | vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); |
| 3128 | mp->mnt_syncer = vp; |
| 3129 | return (0); |
| 3130 | } |
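| |
| | /*
| | * Worked example of the scatter logic above (assuming syncer_maxdelay
| | * is 32): the first allocation initializes start = 16 and incr = 32 and
| | * lands on slot 16; later allocations land on 8, 24, 4, 12, 20, 28, ...
| | * (each taken modulo syncdelay), a binary subdivision of the syncer
| | * wheel, so syncer vnodes fire at well-separated times even when many
| | * filesystems are mounted at once.
| | */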
| 3131 | |
| 3132 | /* |
| 3133 | * Do a lazy sync of the filesystem. |
| 3134 | * |
| 3135 | * sync_fsync { struct vnode *a_vp, struct ucred *a_cred, int a_waitfor, |
| 3136 | * struct thread *a_td } |
| 3137 | */ |
| 3138 | static int |
| 3139 | sync_fsync(struct vop_fsync_args *ap) |
| 3140 | { |
| 3141 | struct vnode *syncvp = ap->a_vp; |
| 3142 | struct mount *mp = syncvp->v_mount; |
| 3143 | struct thread *td = ap->a_td; |
| 3144 | lwkt_tokref ilock; |
| 3145 | int asyncflag; |
| 3146 | |
| 3147 | /* |
| 3148 | * We only need to do something if this is a lazy evaluation. |
| 3149 | */ |
| 3150 | if (ap->a_waitfor != MNT_LAZY) |
| 3151 | return (0); |
| 3152 | |
| 3153 | /* |
| 3154 | * Move ourselves to the back of the sync list. |
| 3155 | */ |
| 3156 | vn_syncer_add_to_worklist(syncvp, syncdelay); |
| 3157 | |
| 3158 | /* |
| 3159 | * Walk the list of vnodes pushing all that are dirty and |
| 3160 | * not already on the sync list, and freeing vnodes which have |
| 3161 | * no refs and whose VM objects are empty. vfs_msync() handles
| 3162 | * the VM issues and must be called whether the mount is readonly |
| 3163 | * or not. |
| 3164 | */ |
| 3165 | lwkt_gettoken(&ilock, &mountlist_token); |
| 3166 | if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &ilock, td) != 0) { |
| 3167 | lwkt_reltoken(&ilock); |
| 3168 | return (0); |
| 3169 | } |
| 3170 | if (mp->mnt_flag & MNT_RDONLY) { |
| 3171 | vfs_msync(mp, MNT_NOWAIT); |
| 3172 | } else { |
| 3173 | asyncflag = mp->mnt_flag & MNT_ASYNC; |
| 3174 | mp->mnt_flag &= ~MNT_ASYNC; /* ZZZ hack */ |
| 3175 | vfs_msync(mp, MNT_NOWAIT); |
| 3176 | VFS_SYNC(mp, MNT_LAZY, td); |
| 3177 | if (asyncflag) |
| 3178 | mp->mnt_flag |= MNT_ASYNC; |
| 3179 | } |
| 3180 | vfs_unbusy(mp, td); |
| 3181 | return (0); |
| 3182 | } |
| 3183 | |
| 3184 | /* |
| 3185 | * The syncer vnode is no longer referenced.
| 3186 | * |
| 3187 | * sync_inactive { struct vnode *a_vp, struct proc *a_p } |
| 3188 | */ |
| 3189 | static int |
| 3190 | sync_inactive(struct vop_inactive_args *ap) |
| 3191 | { |
| 3192 | vgone(ap->a_vp); |
| 3193 | return (0); |
| 3194 | } |
| 3195 | |
| 3196 | /* |
| 3197 | * The syncer vnode is no longer needed and is being decommissioned. |
| 3198 | * |
| 3199 | * Modifications to the worklist must be protected at splbio(). |
| 3200 | * |
| 3201 | * sync_reclaim { struct vnode *a_vp } |
| 3202 | */ |
| 3203 | static int |
| 3204 | sync_reclaim(struct vop_reclaim_args *ap) |
| 3205 | { |
| 3206 | struct vnode *vp = ap->a_vp; |
| 3207 | int s; |
| 3208 | |
| 3209 | s = splbio(); |
| 3210 | vp->v_mount->mnt_syncer = NULL; |
| 3211 | if (vp->v_flag & VONWORKLST) { |
| 3212 | LIST_REMOVE(vp, v_synclist); |
| 3213 | vp->v_flag &= ~VONWORKLST; |
| 3214 | } |
| 3215 | splx(s); |
| 3216 | |
| 3217 | return (0); |
| 3218 | } |
| 3219 | |
| 3220 | /* |
| 3221 | * Print out a syncer vnode. |
| 3222 | * |
| 3223 | * sync_print { struct vnode *a_vp } |
| 3224 | */ |
| 3225 | static int |
| 3226 | sync_print(struct vop_print_args *ap) |
| 3227 | { |
| 3228 | struct vnode *vp = ap->a_vp; |
| 3229 | |
| 3230 | printf("syncer vnode"); |
| 3231 | if (vp->v_vnlock != NULL) |
| 3232 | lockmgr_printinfo(vp->v_vnlock); |
| 3233 | printf("\n"); |
| 3234 | return (0); |
| 3235 | } |
| 3236 | |
| 3237 | /* |
| 3238 | * Extract the dev_t from a VBLK or VCHR. The vnode must have been opened
| 3239 | * (or v_rdev might be NULL). |
| 3240 | */ |
| 3241 | dev_t |
| 3242 | vn_todev(struct vnode *vp) |
| 3243 | { |
| 3244 | if (vp->v_type != VBLK && vp->v_type != VCHR) |
| 3245 | return (NODEV); |
| 3246 | KKASSERT(vp->v_rdev != NULL); |
| 3247 | return (vp->v_rdev); |
| 3248 | } |
| 3249 | |
| 3250 | /* |
| 3251 | * Check if vnode represents a disk device. The vnode does not need to be |
| 3252 | * opened. |
| 3253 | */ |
| 3254 | int |
| 3255 | vn_isdisk(struct vnode *vp, int *errp) |
| 3256 | { |
| 3257 | dev_t dev; |
| 3258 | |
| 3259 | if (vp->v_type != VBLK && vp->v_type != VCHR) { |
| 3260 | if (errp != NULL) |
| 3261 | *errp = ENOTBLK; |
| 3262 | return (0); |
| 3263 | } |
| 3264 | |
| 3265 | if ((dev = vp->v_rdev) == NULL) |
| 3266 | dev = udev2dev(vp->v_udev, (vp->v_type == VBLK)); |
| 3267 | if (dev == NULL || dev == NODEV) { |
| 3268 | if (errp != NULL) |
| 3269 | *errp = ENXIO; |
| 3270 | return (0); |
| 3271 | } |
| 3272 | if (dev_is_good(dev) == 0) { |
| 3273 | if (errp != NULL) |
| 3274 | *errp = ENXIO; |
| 3275 | return (0); |
| 3276 | } |
| 3277 | if ((dev_dflags(dev) & D_DISK) == 0) { |
| 3278 | if (errp != NULL) |
| 3279 | *errp = ENOTBLK; |
| 3280 | return (0); |
| 3281 | } |
| 3282 | if (errp != NULL) |
| 3283 | *errp = 0; |
| 3284 | return (1); |
| 3285 | } |
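| |
| | /*
| | * Illustrative usage (sketch): mount-time code commonly rejects
| | * non-disk vnodes this way:
| | *
| | *	int error;
| | *	if (!vn_isdisk(devvp, &error))
| | *		return (error);
| | */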
| 3286 | |
| 3287 | void |
| 3288 | NDFREE(struct nameidata *ndp, const uint flags) |
| 3289 | { |
| 3290 | if (!(flags & NDF_NO_FREE_PNBUF) && |
| 3291 | (ndp->ni_cnd.cn_flags & CNP_HASBUF)) { |
| 3292 | zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); |
| 3293 | ndp->ni_cnd.cn_flags &= ~CNP_HASBUF; |
| 3294 | } |
| 3295 | if (!(flags & NDF_NO_DNCP_RELE) && |
| 3296 | (ndp->ni_cnd.cn_flags & CNP_WANTDNCP) && |
| 3297 | ndp->ni_dncp) { |
| 3298 | cache_drop(ndp->ni_dncp); |
| 3299 | ndp->ni_dncp = NULL; |
| 3300 | } |
| 3301 | if (!(flags & NDF_NO_NCP_RELE) && |
| 3302 | (ndp->ni_cnd.cn_flags & CNP_WANTNCP) && |
| 3303 | ndp->ni_ncp) { |
| 3304 | cache_drop(ndp->ni_ncp); |
| 3305 | ndp->ni_ncp = NULL; |
| 3306 | } |
| 3307 | if (!(flags & NDF_NO_DVP_UNLOCK) && |
| 3308 | (ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) && |
| 3309 | ndp->ni_dvp != ndp->ni_vp) { |
| 3310 | VOP_UNLOCK(ndp->ni_dvp, NULL, 0, ndp->ni_cnd.cn_td); |
| 3311 | } |
| 3312 | if (!(flags & NDF_NO_DVP_RELE) && |
| 3313 | (ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT|CNP_WANTPARENT))) { |
| 3314 | vrele(ndp->ni_dvp); |
| 3315 | ndp->ni_dvp = NULL; |
| 3316 | } |
| 3317 | if (!(flags & NDF_NO_VP_UNLOCK) && |
| 3318 | (ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) { |
| 3319 | VOP_UNLOCK(ndp->ni_vp, NULL, 0, ndp->ni_cnd.cn_td); |
| 3320 | } |
| 3321 | if (!(flags & NDF_NO_VP_RELE) && |
| 3322 | ndp->ni_vp) { |
| 3323 | vrele(ndp->ni_vp); |
| 3324 | ndp->ni_vp = NULL; |
| 3325 | } |
| 3326 | if (!(flags & NDF_NO_STARTDIR_RELE) && |
| 3327 | (ndp->ni_cnd.cn_flags & CNP_SAVESTART)) { |
| 3328 | vrele(ndp->ni_startdir); |
| 3329 | ndp->ni_startdir = NULL; |
| 3330 | } |
| 3331 | } |
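| |
| | /*
| | * Illustrative usage (sketch): after a successful namei(), a caller
| | * that wants to keep the resulting vnode can exempt it with the
| | * NDF_NO_* flags while releasing everything else:
| | *
| | *	NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
| | *
| | * leaving nd.ni_vp referenced (and still locked, if CNP_LOCKLEAF
| | * was used).
| | */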
| 3332 | |
| 3333 | #ifdef DEBUG_VFS_LOCKS |
| 3334 | |
| 3335 | void |
| 3336 | assert_vop_locked(struct vnode *vp, const char *str) |
| 3337 | { |
| 3338 | if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) { |
| 3339 | panic("%s: %p is not locked shared but should be", str, vp); |
| 3340 | } |
| 3341 | } |
| 3342 | |
| 3343 | void |
| 3344 | assert_vop_unlocked(struct vnode *vp, const char *str) |
| 3345 | { |
| 3346 | if (vp && IS_LOCKING_VFS(vp)) { |
| 3347 | if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) { |
| 3348 | panic("%s: %p is locked but should not be", str, vp); |
| 3349 | } |
| 3350 | } |
| 3351 | } |
| 3352 | |
| 3353 | #endif |