gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	39	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	40	* $DragonFly: src/sys/kern/vfs_subr.c,v 1.25 2004/02/10 07:34:42 dillon Exp $
	41	*/
	42
	43	/*
	44	* External virtual filesystem routines
	45	*/
	46	#include "opt_ddb.h"
	47
	48	#include <sys/param.h>
	49	#include <sys/systm.h>
	50	#include <sys/buf.h>
	51	#include <sys/conf.h>
	52	#include <sys/dirent.h>
	53	#include <sys/domain.h>
	54	#include <sys/eventhandler.h>
	55	#include <sys/fcntl.h>
	56	#include <sys/kernel.h>
	57	#include <sys/kthread.h>
	58	#include <sys/malloc.h>
	59	#include <sys/mbuf.h>
	60	#include <sys/mount.h>
	61	#include <sys/proc.h>
	62	#include <sys/namei.h>
	63	#include <sys/reboot.h>
	64	#include <sys/socket.h>
	65	#include <sys/stat.h>
	66	#include <sys/sysctl.h>
	67	#include <sys/syslog.h>
	68	#include <sys/vmmeter.h>
	69	#include <sys/vnode.h>
	70
	71	#include <machine/limits.h>
	72
	73	#include <vm/vm.h>
	74	#include <vm/vm_object.h>
	75	#include <vm/vm_extern.h>
	76	#include <vm/vm_kern.h>
	77	#include <vm/pmap.h>
	78	#include <vm/vm_map.h>
	79	#include <vm/vm_page.h>
	80	#include <vm/vm_pager.h>
	81	#include <vm/vnode_pager.h>
	82	#include <vm/vm_zone.h>
	83
	84	#include <sys/buf2.h>
	85	#include <sys/thread2.h>
	86
	87	static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
	88
	89	static void insmntque (struct vnode vp, struct mount mp);
	90	static void vclean (struct vnode vp, int flags, struct thread td);
	91	static unsigned long numvnodes;
	92	static void vlruvp(struct vnode *vp);
	93	SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
	94
	95	enum vtype iftovt_tab[16] = {
	96	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	97	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
	98	};
	99	int vttoif_tab[9] = {
	100	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	101	S_IFSOCK, S_IFIFO, S_IFMT,
	102	};
	103
	104	static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
	105
	106	static u_long wantfreevnodes = 25;
	107	SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
	108	static u_long freevnodes = 0;
	109	SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
	110
	111	static int reassignbufcalls;
	112	SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
	113	static int reassignbufloops;
	114	SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
	115	static int reassignbufsortgood;
	116	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
	117	static int reassignbufsortbad;
	118	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
	119	static int reassignbufmethod = 1;
	120	SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
	121
	122	#ifdef ENABLE_VFS_IOOPT
	123	int vfs_ioopt = 0;
	124	SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
	125	#endif
	126
	127	struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
	128	struct lwkt_token mountlist_token;
	129	struct lwkt_token mntvnode_token;
	130	int nfs_mount_type = -1;
	131	static struct lwkt_token mntid_token;
	132	static struct lwkt_token vnode_free_list_token;
	133	static struct lwkt_token spechash_token;
	134	struct nfs_public nfs_pub; /* publicly exported FS */
	135	static vm_zone_t vnode_zone;
	136
	137	/*
	138	* The workitem queue.
	139	*/
	140	#define SYNCER_MAXDELAY 32
	141	static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
	142	time_t syncdelay = 30; /* max time to delay syncing data */
	143	SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
	144	"VFS data synchronization delay");
	145	time_t filedelay = 30; /* time to delay syncing files */
	146	SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
	147	"File synchronization delay");
	148	time_t dirdelay = 29; /* time to delay syncing directories */
	149	SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
	150	"Directory synchronization delay");
	151	time_t metadelay = 28; /* time to delay syncing metadata */
	152	SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
	153	"VFS metadata synchronization delay");
	154	static int rushjob; /* number of slots to run ASAP */
	155	static int stat_rush_requests; /* number of times I/O speeded up */
	156	SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
	157
	158	static int syncer_delayno = 0;
	159	static long syncer_mask;
	160	LIST_HEAD(synclist, vnode);
	161	static struct synclist *syncer_workitem_pending;
	162
	163	int desiredvnodes;
	164	SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	165	&desiredvnodes, 0, "Maximum number of vnodes");
	166	static int minvnodes;
	167	SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	168	&minvnodes, 0, "Minimum number of vnodes");
	169	static int vnlru_nowhere = 0;
	170	SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
	171	"Number of times the vnlru process ran without success");
	172
	/*
	 * Forward declarations of the export-list helpers (defined outside
	 * this part of the file).  All structure arguments are pointers.
	 */
	static void	vfs_free_addrlist(struct netexport *nep);
	static int	vfs_free_netcred(struct radix_node *rn, void *w);
	static int	vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
			    struct export_args *argp);
	177
	178	/*
	179	* Initialize the vnode management data structures.
	180	*/
	181	void
	182	vntblinit()
	183	{
	184
	185	/*
	186	* Desired vnodes is a result of the physical page count
	187	* and the size of kernel's heap. It scales in proportion
	188	* to the amount of available physical memory. This can
	189	* cause trouble on 64-bit and large memory platforms.
	190	*/
	191	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	192	desiredvnodes =
	193	min(maxproc + vmstats.v_page_count /4,
	194	2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
	195	(5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	196
	197	minvnodes = desiredvnodes / 4;
	198	lwkt_inittoken(&mountlist_token);
	199	lwkt_inittoken(&mntvnode_token);
	200	lwkt_inittoken(&mntid_token);
	201	lwkt_inittoken(&spechash_token);
	202	TAILQ_INIT(&vnode_free_list);
	203	lwkt_inittoken(&vnode_free_list_token);
	204	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	205	/*
	206	* Initialize the filesystem syncer.
	207	*/
	208	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	209	&syncer_mask);
	210	syncer_maxdelay = syncer_mask + 1;
	211	}
	212
	213	/*
	214	* Mark a mount point as busy. Used to synchronize access and to delay
	215	* unmounting. Interlock is not released on failure.
	216	*/
	217	int
	218	vfs_busy(struct mount mp, int flags, struct lwkt_token interlkp,
	219	struct thread *td)
	220	{
	221	int lkflags;
	222
	223	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
	224	if (flags & LK_NOWAIT)
	225	return (ENOENT);
	226	mp->mnt_kern_flag \|= MNTK_MWAIT;
	227	if (interlkp) {
	228	lwkt_reltoken(interlkp);
	229	}
	230	/*
	231	* Since all busy locks are shared except the exclusive
	232	* lock granted when unmounting, the only place that a
	233	* wakeup needs to be done is at the release of the
	234	* exclusive lock at the end of dounmount.
	235	*/
	236	tsleep((caddr_t)mp, 0, "vfs_busy", 0);
	237	if (interlkp) {
	238	lwkt_gettoken(interlkp);
	239	}
	240	return (ENOENT);
	241	}
	242	lkflags = LK_SHARED \| LK_NOPAUSE;
	243	if (interlkp)
	244	lkflags \|= LK_INTERLOCK;
	245	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
	246	panic("vfs_busy: unexpected lock failure");
	247	return (0);
	248	}
	249
	250	/*
	251	* Free a busy filesystem.
	252	*/
	253	void
	254	vfs_unbusy(struct mount mp, struct thread td)
	255	{
	256	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
	257	}
	258
	259	/*
	260	* Lookup a filesystem type, and if found allocate and initialize
	261	* a mount structure for it.
	262	*
	263	* Devname is usually updated by mount(8) after booting.
	264	*/
	265	int
	266	vfs_rootmountalloc(char fstypename, char devname, struct mount **mpp)
	267	{
	268	struct thread td = curthread; / XXX */
	269	struct vfsconf *vfsp;
	270	struct mount *mp;
	271
	272	if (fstypename == NULL)
	273	return (ENODEV);
	274	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	275	if (!strcmp(vfsp->vfc_name, fstypename))
	276	break;
	277	if (vfsp == NULL)
	278	return (ENODEV);
	279	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	280	bzero((char *)mp, (u_long)sizeof(struct mount));
	281	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	282	(void)vfs_busy(mp, LK_NOWAIT, 0, td);
	283	TAILQ_INIT(&mp->mnt_nvnodelist);
	284	TAILQ_INIT(&mp->mnt_reservedvnlist);
	285	mp->mnt_nvnodelistsize = 0;
	286	mp->mnt_vfc = vfsp;
	287	mp->mnt_op = vfsp->vfc_vfsops;
	288	mp->mnt_flag = MNT_RDONLY;
	289	mp->mnt_vnodecovered = NULLVP;
	290	vfsp->vfc_refcount++;
	291	mp->mnt_iosize_max = DFLTPHYS;
	292	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	293	mp->mnt_flag \|= vfsp->vfc_flags & MNT_VISFLAGMASK;
	294	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	295	mp->mnt_stat.f_mntonname[0] = '/';
	296	mp->mnt_stat.f_mntonname[1] = 0;
	297	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	298	*mpp = mp;
	299	return (0);
	300	}
	301
	/*
	 * Pick a filesystem for the root.  When lite2_mountroot has been
	 * preselected it is used directly; otherwise each configured
	 * filesystem that provides a mountroot routine is tried in list order
	 * until one succeeds.  Returns 0 on success, or ENODEV when every
	 * candidate failed.
	 */
	#ifdef notdef /* XXX JH */
	int
	lite2_vfs_mountroot()
	{
		extern int (*lite2_mountroot) (void);
		struct vfsconf *vfc;
		int rc;

		if (lite2_mountroot != NULL)
			return ((*lite2_mountroot)());

		vfc = vfsconf;
		while (vfc != NULL) {
			if (vfc->vfc_mountroot != NULL) {
				rc = (*vfc->vfc_mountroot)();
				if (rc == 0)
					return (0);
				printf("%s_mountroot failed: %d\n", vfc->vfc_name, rc);
			}
			vfc = vfc->vfc_next;
		}
		return (ENODEV);
	}
	#endif
	328
	329	/*
	330	* Lookup a mount point by filesystem identifier.
	331	*/
	332	struct mount *
	333	vfs_getvfs(fsid)
	334	fsid_t *fsid;
	335	{
	336	struct mount *mp;
	337
	338	lwkt_gettoken(&mountlist_token);
	339	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	340	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	341	mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
	342	lwkt_reltoken(&mountlist_token);
	343	return (mp);
	344	}
	345	}
	346	lwkt_reltoken(&mountlist_token);
	347	return ((struct mount *) 0);
	348	}
	349
	350	/*
	351	* Get a new unique fsid. Try to make its val[0] unique, since this value
	352	* will be used to create fake device numbers for stat(). Also try (but
	353	* not so hard) make its val[0] unique mod 2^16, since some emulators only
	354	* support 16-bit device numbers. We end up with unique val[0]'s for the
	355	* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
	356	*
	357	* Keep in mind that several mounts may be running in parallel. Starting
	358	* the search one past where the previous search terminated is both a
	359	* micro-optimization and a defense against returning the same fsid to
	360	* different mounts.
	361	*/
	362	void
	363	vfs_getnewfsid(mp)
	364	struct mount *mp;
	365	{
	366	static u_int16_t mntid_base;
	367	fsid_t tfsid;
	368	int mtype;
	369
	370	lwkt_gettoken(&mntid_token);
	371	mtype = mp->mnt_vfc->vfc_typenum;
	372	tfsid.val[1] = mtype;
	373	mtype = (mtype & 0xFF) << 24;
	374	for (;;) {
	375	tfsid.val[0] = makeudev(255,
	376	mtype \| ((mntid_base & 0xFF00) << 8) \| (mntid_base & 0xFF));
	377	mntid_base++;
	378	if (vfs_getvfs(&tfsid) == NULL)
	379	break;
	380	}
	381	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	382	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	383	lwkt_reltoken(&mntid_token);
	384	}
	385
	386	/*
	387	* Knob to control the precision of file timestamps:
	388	*
	389	* 0 = seconds only; nanoseconds zeroed.
	390	* 1 = seconds and nanoseconds, accurate within 1/HZ.
	391	* 2 = seconds and nanoseconds, truncated to microseconds.
	392	* >=3 = seconds and nanoseconds, maximum precision.
	393	*/
	394	enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
	395
	396	static int timestamp_precision = TSP_SEC;
	397	SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	398	&timestamp_precision, 0, "");
	399
	400	/*
	401	* Get a current timestamp.
	402	*/
	403	void
	404	vfs_timestamp(tsp)
	405	struct timespec *tsp;
	406	{
	407	struct timeval tv;
	408
	409	switch (timestamp_precision) {
	410	case TSP_SEC:
	411	tsp->tv_sec = time_second;
	412	tsp->tv_nsec = 0;
	413	break;
	414	case TSP_HZ:
	415	getnanotime(tsp);
	416	break;
	417	case TSP_USEC:
	418	microtime(&tv);
	419	TIMEVAL_TO_TIMESPEC(&tv, tsp);
	420	break;
	421	case TSP_NSEC:
	422	default:
	423	nanotime(tsp);
	424	break;
	425	}
	426	}
	427
	428	/*
	429	* Set vnode attributes to VNOVAL
	430	*/
	431	void
	432	vattr_null(vap)
	433	struct vattr *vap;
	434	{
	435
	436	vap->va_type = VNON;
	437	vap->va_size = VNOVAL;
	438	vap->va_bytes = VNOVAL;
	439	vap->va_mode = VNOVAL;
	440	vap->va_nlink = VNOVAL;
	441	vap->va_uid = VNOVAL;
	442	vap->va_gid = VNOVAL;
	443	vap->va_fsid = VNOVAL;
	444	vap->va_fileid = VNOVAL;
	445	vap->va_blocksize = VNOVAL;
	446	vap->va_rdev = VNOVAL;
	447	vap->va_atime.tv_sec = VNOVAL;
	448	vap->va_atime.tv_nsec = VNOVAL;
	449	vap->va_mtime.tv_sec = VNOVAL;
	450	vap->va_mtime.tv_nsec = VNOVAL;
	451	vap->va_ctime.tv_sec = VNOVAL;
	452	vap->va_ctime.tv_nsec = VNOVAL;
	453	vap->va_flags = VNOVAL;
	454	vap->va_gen = VNOVAL;
	455	vap->va_vaflags = 0;
	456	}
	457
	458	/*
	459	* This routine is called when we have too many vnodes. It attempts
	460	* to free <count> vnodes and will potentially free vnodes that still
	461	* have VM backing store (VM backing store is typically the cause
	462	* of a vnode blowout so we want to do this). Therefore, this operation
	463	* is not considered cheap.
	464	*
	465	* A number of conditions may prevent a vnode from being reclaimed.
	466	* the buffer cache may have references on the vnode, a directory
	467	* vnode may still have references due to the namei cache representing
	468	* underlying files, or the vnode may be in active use. It is not
	469	* desireable to reuse such vnodes. These conditions may cause the
	470	* number of vnodes to reach some minimum value regardless of what
	471	* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
	472	*/
	473	static int
	474	vlrureclaim(struct mount *mp)
	475	{
	476	struct vnode *vp;
	477	int done;
	478	int trigger;
	479	int usevnodes;
	480	int count;
	481	int gen;
	482
	483	/*
	484	* Calculate the trigger point, don't allow user
	485	* screwups to blow us up. This prevents us from
	486	* recycling vnodes with lots of resident pages. We
	487	* aren't trying to free memory, we are trying to
	488	* free vnodes.
	489	*/
	490	usevnodes = desiredvnodes;
	491	if (usevnodes <= 0)
	492	usevnodes = 1;
	493	trigger = vmstats.v_page_count * 2 / usevnodes;
	494
	495	done = 0;
	496	gen = lwkt_gettoken(&mntvnode_token);
	497	count = mp->mnt_nvnodelistsize / 10 + 1;
	498	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
	499	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	500	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	501
	502	if (vp->v_type != VNON &&
	503	vp->v_type != VBAD &&
	504	VMIGHTFREE(vp) && /* critical path opt */
	505	(vp->v_object == NULL \|\| vp->v_object->resident_page_count < trigger)
	506	) {
	507	lwkt_gettoken(&vp->v_interlock);
	508	if (lwkt_gentoken(&mntvnode_token, &gen) == 0) {
	509	if (VMIGHTFREE(vp)) {
	510	vgonel(vp, curthread);
	511	done++;
	512	} else {
	513	lwkt_reltoken(&vp->v_interlock);
	514	}
	515	} else {
	516	lwkt_reltoken(&vp->v_interlock);
	517	}
	518	}
	519	--count;
	520	}
	521	lwkt_reltoken(&mntvnode_token);
	522	return done;
	523	}
	524
	525	/*
	526	* Attempt to recycle vnodes in a context that is always safe to block.
	527	* Calling vlrurecycle() from the bowels of file system code has some
	528	* interesting deadlock problems.
	529	*/
	530	static struct thread *vnlruthread;
	531	static int vnlruproc_sig;
	532
	533	static void
	534	vnlru_proc(void)
	535	{
	536	struct mount mp, nmp;
	537	int s;
	538	int done;
	539	struct thread *td = curthread;
	540
	541	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	542	SHUTDOWN_PRI_FIRST);
	543
	544	s = splbio();
	545	for (;;) {
	546	kproc_suspend_loop();
	547	if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
	548	vnlruproc_sig = 0;
	549	wakeup(&vnlruproc_sig);
	550	tsleep(td, 0, "vlruwt", hz);
	551	continue;
	552	}
	553	done = 0;
	554	lwkt_gettoken(&mountlist_token);
	555	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
	556	if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td)) {
	557	nmp = TAILQ_NEXT(mp, mnt_list);
	558	continue;
	559	}
	560	done += vlrureclaim(mp);
	561	lwkt_gettoken(&mountlist_token);
	562	nmp = TAILQ_NEXT(mp, mnt_list);
	563	vfs_unbusy(mp, td);
	564	}
	565	lwkt_reltoken(&mountlist_token);
	566	if (done == 0) {
	567	vnlru_nowhere++;
	568	tsleep(td, 0, "vlrup", hz * 3);
	569	}
	570	}
	571	splx(s);
	572	}
	573
	574	static struct kproc_desc vnlru_kp = {
	575	"vnlru",
	576	vnlru_proc,
	577	&vnlruthread
	578	};
	579	SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
	580
	581	/*
	582	* Routines having to do with the management of the vnode table.
	583	*/
	584	extern vop_t **dead_vnodeop_p;
	585
	586	/*
	587	* Return the next vnode from the free list.
	588	*/
	589	int
	590	getnewvnode(tag, mp, vops, vpp)
	591	enum vtagtype tag;
	592	struct mount *mp;
	593	vop_t **vops;
	594	struct vnode **vpp;
	595	{
	596	int s;
	597	int gen;
	598	int vgen;
	599	struct thread td = curthread; / XXX */
	600	struct vnode *vp = NULL;
	601	vm_object_t object;
	602
	603	s = splbio();
	604
	605	/*
	606	* Try to reuse vnodes if we hit the max. This situation only
	607	* occurs in certain large-memory (2G+) situations. We cannot
	608	* attempt to directly reclaim vnodes due to nasty recursion
	609	* problems.
	610	*/
	611	while (numvnodes - freevnodes > desiredvnodes) {
	612	if (vnlruproc_sig == 0) {
	613	vnlruproc_sig = 1; /* avoid unnecessary wakeups */
	614	wakeup(vnlruthread);
	615	}
	616	tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
	617	}
	618
	619
	620	/*
	621	* Attempt to reuse a vnode already on the free list, allocating
	622	* a new vnode if we can't find one or if we have not reached a
	623	* good minimum for good LRU performance.
	624	*/
	625	gen = lwkt_gettoken(&vnode_free_list_token);
	626	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
	627	int count;
	628
	629	for (count = 0; count < freevnodes; count++) {
	630	vp = TAILQ_FIRST(&vnode_free_list);
	631	if (vp == NULL \|\| vp->v_usecount)
	632	panic("getnewvnode: free vnode isn't");
	633
	634	/*
	635	* Get the vnode's interlock, then re-obtain
	636	* vnode_free_list_token in case we lost it. If we
	637	* did lose it while getting the vnode interlock,
	638	* even if we got it back again, then retry.
	639	*/
	640	vgen = lwkt_gettoken(&vp->v_interlock);
	641	if (lwkt_gentoken(&vnode_free_list_token, &gen) != 0) {
	642	--count;
	643	lwkt_reltoken(&vp->v_interlock);
	644	vp = NULL;
	645	continue;
	646	}
	647
	648	/*
	649	* Whew! We have both tokens. Since we didn't lose
	650	* the free list VFREE had better still be set. But
	651	* we aren't out of the woods yet. We have to get
	652	* the object (may block). If the vnode is not
	653	* suitable then move it to the end of the list
	654	* if we can. If we can't move it to the end of the
	655	* list retry again.
	656	*/
	657	if ((VOP_GETVOBJECT(vp, &object) == 0 &&
	658	(object->resident_page_count \|\| object->ref_count))
	659	) {
	660	if (lwkt_gentoken(&vp->v_interlock, &vgen) == 0 &&
	661	lwkt_gentoken(&vnode_free_list_token, &gen) == 0
	662	) {
	663	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	664	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	665	} else {
	666	--count;
	667	}
	668	lwkt_reltoken(&vp->v_interlock);
	669	vp = NULL;
	670	continue;
	671	}
	672
	673	/*
	674	* Still not out of the woods. VOBJECT might have
	675	* blocked, if we did not retain our tokens we have
	676	* to retry.
	677	*/
	678	if (lwkt_gentoken(&vp->v_interlock, &vgen) != 0 \|\|
	679	lwkt_gentoken(&vnode_free_list_token, &gen) != 0) {
	680	--count;
	681	vp = NULL;
	682	continue;
	683	}
	684	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	685	KKASSERT(vp->v_flag & VFREE);
	686
	687	/*
	688	* If we have children in the namecache we cannot
	689	* reuse the vnode yet because it will break the
	690	* namecache chain (YYY use nc_refs for the check?)
	691	*/
	692	if (TAILQ_FIRST(&vp->v_namecache)) {
	693	if (cache_leaf_test(vp) < 0) {
	694	lwkt_reltoken(&vp->v_interlock);
	695	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	696	vp = NULL;
	697	continue;
	698	}
	699	}
	700	break;
	701	}
	702	}
	703
	704	if (vp) {
	705	vp->v_flag \|= VDOOMED;
	706	vp->v_flag &= ~VFREE;
	707	freevnodes--;
	708	lwkt_reltoken(&vnode_free_list_token);
	709	cache_purge(vp); /* YYY may block */
	710	vp->v_lease = NULL;
	711	if (vp->v_type != VBAD) {
	712	vgonel(vp, td);
	713	} else {
	714	lwkt_reltoken(&vp->v_interlock);
	715	}
	716
	717	#ifdef INVARIANTS
	718	{
	719	int s;
	720
	721	if (vp->v_data)
	722	panic("cleaned vnode isn't");
	723	s = splbio();
	724	if (vp->v_numoutput)
	725	panic("Clean vnode has pending I/O's");
	726	splx(s);
	727	}
	728	#endif
	729	vp->v_flag = 0;
	730	vp->v_lastw = 0;
	731	vp->v_lasta = 0;
	732	vp->v_cstart = 0;
	733	vp->v_clen = 0;
	734	vp->v_socket = 0;
	735	vp->v_writecount = 0; /* XXX */
	736	} else {
	737	lwkt_reltoken(&vnode_free_list_token);
	738	vp = (struct vnode *) zalloc(vnode_zone);
	739	bzero((char ) vp, sizeof vp);
	740	lwkt_inittoken(&vp->v_interlock);
	741	lwkt_inittoken(&vp->v_pollinfo.vpi_token);
	742	vp->v_dd = vp;
	743	cache_purge(vp);
	744	TAILQ_INIT(&vp->v_namecache);
	745	numvnodes++;
	746	}
	747
	748	TAILQ_INIT(&vp->v_cleanblkhd);
	749	TAILQ_INIT(&vp->v_dirtyblkhd);
	750	vp->v_type = VNON;
	751	vp->v_tag = tag;
	752	vp->v_op = vops;
	753	insmntque(vp, mp);
	754	*vpp = vp;
	755	vp->v_usecount = 1;
	756	vp->v_data = 0;
	757	splx(s);
	758
	759	vfs_object_create(vp, td);
	760	return (0);
	761	}
	762
	763	/*
	764	* Move a vnode from one mount queue to another.
	765	*/
	766	static void
	767	insmntque(vp, mp)
	768	struct vnode *vp;
	769	struct mount *mp;
	770	{
	771
	772	lwkt_gettoken(&mntvnode_token);
	773	/*
	774	* Delete from old mount point vnode list, if on one.
	775	*/
	776	if (vp->v_mount != NULL) {
	777	KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
	778	("bad mount point vnode list size"));
	779	TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
	780	vp->v_mount->mnt_nvnodelistsize--;
	781	}
	782	/*
	783	* Insert into list of vnodes for the new mount point, if available.
	784	*/
	785	if ((vp->v_mount = mp) == NULL) {
	786	lwkt_reltoken(&mntvnode_token);
	787	return;
	788	}
	789	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	790	mp->mnt_nvnodelistsize++;
	791	lwkt_reltoken(&mntvnode_token);
	792	}
	793
	794	/*
	795	* Update outstanding I/O count and do wakeup if requested.
	796	*/
	797	void
	798	vwakeup(bp)
	799	struct buf *bp;
	800	{
	801	struct vnode *vp;
	802
	803	bp->b_flags &= ~B_WRITEINPROG;
	804	if ((vp = bp->b_vp)) {
	805	vp->v_numoutput--;
	806	if (vp->v_numoutput < 0)
	807	panic("vwakeup: neg numoutput");
	808	if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
	809	vp->v_flag &= ~VBWAIT;
	810	wakeup((caddr_t) &vp->v_numoutput);
	811	}
	812	}
	813	}
	814
	815	/*
	816	* Flush out and invalidate all buffers associated with a vnode.
	817	* Called with the underlying object locked.
	818	*/
	819	int
	820	vinvalbuf(struct vnode vp, int flags, struct thread td,
	821	int slpflag, int slptimeo)
	822	{
	823	struct buf *bp;
	824	struct buf nbp, blist;
	825	int s, error;
	826	vm_object_t object;
	827
	828	if (flags & V_SAVE) {
	829	s = splbio();
	830	while (vp->v_numoutput) {
	831	vp->v_flag \|= VBWAIT;
	832	error = tsleep((caddr_t)&vp->v_numoutput,
	833	slpflag, "vinvlbuf", slptimeo);
	834	if (error) {
	835	splx(s);
	836	return (error);
	837	}
	838	}
	839	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	840	splx(s);
	841	if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
	842	return (error);
	843	s = splbio();
	844	if (vp->v_numoutput > 0 \|\|
	845	!TAILQ_EMPTY(&vp->v_dirtyblkhd))
	846	panic("vinvalbuf: dirty bufs");
	847	}
	848	splx(s);
	849	}
	850	s = splbio();
	851	for (;;) {
	852	blist = TAILQ_FIRST(&vp->v_cleanblkhd);
	853	if (!blist)
	854	blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
	855	if (!blist)
	856	break;
	857
	858	for (bp = blist; bp; bp = nbp) {
	859	nbp = TAILQ_NEXT(bp, b_vnbufs);
	860	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	861	error = BUF_TIMELOCK(bp,
	862	LK_EXCLUSIVE \| LK_SLEEPFAIL,
	863	"vinvalbuf", slpflag, slptimeo);
	864	if (error == ENOLCK)
	865	break;
	866	splx(s);
	867	return (error);
	868	}
	869	/*
	870	* XXX Since there are no node locks for NFS, I
	871	* believe there is a slight chance that a delayed
	872	* write will occur while sleeping just above, so
	873	* check for it. Note that vfs_bio_awrite expects
	874	* buffers to reside on a queue, while VOP_BWRITE and
	875	* brelse do not.
	876	*/
	877	if (((bp->b_flags & (B_DELWRI \| B_INVAL)) == B_DELWRI) &&
	878	(flags & V_SAVE)) {
	879
	880	if (bp->b_vp == vp) {
	881	if (bp->b_flags & B_CLUSTEROK) {
	882	BUF_UNLOCK(bp);
	883	vfs_bio_awrite(bp);
	884	} else {
	885	bremfree(bp);
	886	bp->b_flags \|= B_ASYNC;
	887	VOP_BWRITE(bp->b_vp, bp);
	888	}
	889	} else {
	890	bremfree(bp);
	891	(void) VOP_BWRITE(bp->b_vp, bp);
	892	}
	893	break;
	894	}
	895	bremfree(bp);
	896	bp->b_flags \|= (B_INVAL \| B_NOCACHE \| B_RELBUF);
	897	bp->b_flags &= ~B_ASYNC;
	898	brelse(bp);
	899	}
	900	}
	901
	902	/*
	903	* Wait for I/O to complete. XXX needs cleaning up. The vnode can
	904	* have write I/O in-progress but if there is a VM object then the
	905	* VM object can also have read-I/O in-progress.
	906	*/
	907	do {
	908	while (vp->v_numoutput > 0) {
	909	vp->v_flag \|= VBWAIT;
	910	tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
	911	}
	912	if (VOP_GETVOBJECT(vp, &object) == 0) {
	913	while (object->paging_in_progress)
	914	vm_object_pip_sleep(object, "vnvlbx");
	915	}
	916	} while (vp->v_numoutput > 0);
	917
	918	splx(s);
	919
	920	/*
	921	* Destroy the copy in the VM cache, too.
	922	*/
	923	lwkt_gettoken(&vp->v_interlock);
	924	if (VOP_GETVOBJECT(vp, &object) == 0) {
	925	vm_object_page_remove(object, 0, 0,
	926	(flags & V_SAVE) ? TRUE : FALSE);
	927	}
	928	lwkt_reltoken(&vp->v_interlock);
	929
	930	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) \|\| !TAILQ_EMPTY(&vp->v_cleanblkhd))
	931	panic("vinvalbuf: flush failed");
	932	return (0);
	933	}
	934
	935	/*
	936	* Truncate a file's buffer and pages to a specified length. This
	937	* is in lieu of the old vinvalbuf mechanism, which performed unneeded
	938	* sync activity.
	939	*/
	940	int
	941	vtruncbuf(struct vnode vp, struct thread td, off_t length, int blksize)
	942	{
	943	struct buf *bp;
	944	struct buf *nbp;
	945	int s, anyfreed;
	946	int trunclbn;
	947
	948	/*
	949	* Round up to the next lbn.
	950	*/
	951	trunclbn = (length + blksize - 1) / blksize;
	952
	953	s = splbio();
	954	restart:
	955	anyfreed = 1;
	956	for (;anyfreed;) {
	957	anyfreed = 0;
	958	for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
	959	nbp = TAILQ_NEXT(bp, b_vnbufs);
	960	if (bp->b_lblkno >= trunclbn) {
	961	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	962	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	963	goto restart;
	964	} else {
	965	bremfree(bp);
	966	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	967	bp->b_flags &= ~B_ASYNC;
	968	brelse(bp);
	969	anyfreed = 1;
	970	}
	971	if (nbp &&
	972	(((nbp->b_xflags & BX_VNCLEAN) == 0) \|\|
	973	(nbp->b_vp != vp) \|\|
	974	(nbp->b_flags & B_DELWRI))) {
	975	goto restart;
	976	}
	977	}
	978	}
	979
	980	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	981	nbp = TAILQ_NEXT(bp, b_vnbufs);
	982	if (bp->b_lblkno >= trunclbn) {
	983	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	984	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	985	goto restart;
	986	} else {
	987	bremfree(bp);
	988	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	989	bp->b_flags &= ~B_ASYNC;
	990	brelse(bp);
	991	anyfreed = 1;
	992	}
	993	if (nbp &&
	994	(((nbp->b_xflags & BX_VNDIRTY) == 0) \|\|
	995	(nbp->b_vp != vp) \|\|
	996	(nbp->b_flags & B_DELWRI) == 0)) {
	997	goto restart;
	998	}
	999	}
	1000	}
	1001	}
	1002
	1003	if (length > 0) {
	1004	restartsync:
	1005	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	1006	nbp = TAILQ_NEXT(bp, b_vnbufs);
	1007	if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
	1008	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	1009	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	1010	goto restart;
	1011	} else {
	1012	bremfree(bp);
	1013	if (bp->b_vp == vp) {
	1014	bp->b_flags \|= B_ASYNC;
	1015	} else {
	1016	bp->b_flags &= ~B_ASYNC;
	1017	}
	1018	VOP_BWRITE(bp->b_vp, bp);
	1019	}
	1020	goto restartsync;
	1021	}
	1022
	1023	}
	1024	}
	1025
	1026	while (vp->v_numoutput > 0) {
	1027	vp->v_flag \|= VBWAIT;
	1028	tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	1029	}
	1030
	1031	splx(s);
	1032
	1033	vnode_pager_setsize(vp, length);
	1034
	1035	return (0);
	1036	}
	1037
	1038	/*
	1039	* Associate a buffer with a vnode.
	1040	*/
	1041	void
	1042	bgetvp(vp, bp)
	1043	struct vnode *vp;
	1044	struct buf *bp;
	1045	{
	1046	int s;
	1047
	1048	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	1049
	1050	vhold(vp);
	1051	bp->b_vp = vp;
	1052	bp->b_dev = vn_todev(vp);
	1053	/*
	1054	* Insert onto list for new vnode.
	1055	*/
	1056	s = splbio();
	1057	bp->b_xflags \|= BX_VNCLEAN;
	1058	bp->b_xflags &= ~BX_VNDIRTY;
	1059	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	1060	splx(s);
	1061	}
	1062
	1063	/*
	1064	* Disassociate a buffer from a vnode.
	1065	*/
	1066	void
	1067	brelvp(bp)
	1068	struct buf *bp;
	1069	{
	1070	struct vnode *vp;
	1071	struct buflists *listheadp;
	1072	int s;
	1073
	1074	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
	1075
	1076	/*
	1077	* Delete from old vnode list, if on one.
	1078	*/
	1079	vp = bp->b_vp;
	1080	s = splbio();
	1081	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	1082	if (bp->b_xflags & BX_VNDIRTY)
	1083	listheadp = &vp->v_dirtyblkhd;
	1084	else
	1085	listheadp = &vp->v_cleanblkhd;
	1086	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	1087	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	1088	}
	1089	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	1090	vp->v_flag &= ~VONWORKLST;
	1091	LIST_REMOVE(vp, v_synclist);
	1092	}
	1093	splx(s);
	1094	bp->b_vp = (struct vnode *) 0;
	1095	vdrop(vp);
	1096	}
	1097
	1098	/*
	1099	* The workitem queue.
	1100	*
	1101	* It is useful to delay writes of file data and filesystem metadata
	1102	* for tens of seconds so that quickly created and deleted files need
	1103	* not waste disk bandwidth being created and removed. To realize this,
	1104	* we append vnodes to a "workitem" queue. When running with a soft
	1105	* updates implementation, most pending metadata dependencies should
	1106	* not wait for more than a few seconds. Thus, block devices that have a
	1107	* filesystem mounted on them are delayed only about half the time that file data is delayed.
	1108	* Similarly, directory updates are more critical, so are only delayed
	1109	* about a third the time that file data is delayed. Thus, there are
	1110	* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
	1111	* one each second (driven off the filesystem syncer process). The
	1112	* syncer_delayno variable indicates the next queue that is to be processed.
	1113	* Items that need to be processed soon are placed in this queue:
	1114	*
	1115	* syncer_workitem_pending[syncer_delayno]
	1116	*
	1117	* A delay of fifteen seconds is done by placing the request fifteen
	1118	* entries later in the queue:
	1119	*
	1120	* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
	1121	*
	1122	*/
	1123
	1124	/*
	1125	* Add an item to the syncer work queue.
	1126	*/
	1127	static void
	1128	vn_syncer_add_to_worklist(struct vnode *vp, int delay)
	1129	{
	1130	int s, slot;
	1131
	1132	s = splbio();
	1133
	1134	if (vp->v_flag & VONWORKLST) {
	1135	LIST_REMOVE(vp, v_synclist);
	1136	}
	1137
	1138	if (delay > syncer_maxdelay - 2)
	1139	delay = syncer_maxdelay - 2;
	1140	slot = (syncer_delayno + delay) & syncer_mask;
	1141
	1142	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	1143	vp->v_flag \|= VONWORKLST;
	1144	splx(s);
	1145	}
	1146
	1147	struct thread *updatethread;
	1148	static void sched_sync (void);
	1149	static struct kproc_desc up_kp = {
	1150	"syncer",
	1151	sched_sync,
	1152	&updatethread
	1153	};
	1154	SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
	1155
	1156	/*
	1157	* System filesystem synchronizer daemon.
	1158	*/
	1159	void
	1160	sched_sync(void)
	1161	{
	1162	struct synclist *slp;
	1163	struct vnode *vp;
	1164	long starttime;
	1165	int s;
	1166	struct thread *td = curthread;
	1167
	1168	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	1169	SHUTDOWN_PRI_LAST);
	1170
	1171	for (;;) {
	1172	kproc_suspend_loop();
	1173
	1174	starttime = time_second;
	1175
	1176	/*
	1177	* Push files whose dirty time has expired. Be careful
	1178	* of interrupt race on slp queue.
	1179	*/
	1180	s = splbio();
	1181	slp = &syncer_workitem_pending[syncer_delayno];
	1182	syncer_delayno += 1;
	1183	if (syncer_delayno == syncer_maxdelay)
	1184	syncer_delayno = 0;
	1185	splx(s);
	1186
	1187	while ((vp = LIST_FIRST(slp)) != NULL) {
	1188	if (VOP_ISLOCKED(vp, NULL) == 0) {
	1189	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY, td);
	1190	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	1191	VOP_UNLOCK(vp, 0, td);
	1192	}
	1193	s = splbio();
	1194	if (LIST_FIRST(slp) == vp) {
	1195	/*
	1196	* Note: v_tag VT_VFS vps can remain on the
	1197	* worklist too with no dirty blocks, but
	1198	* since sync_fsync() moves it to a different
	1199	* slot we are safe.
	1200	*/
	1201	if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
	1202	!vn_isdisk(vp, NULL))
	1203	panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
	1204	/*
	1205	* Put us back on the worklist. The worklist
	1206	* routine will remove us from our current
	1207	* position and then add us back in at a later
	1208	* position.
	1209	*/
	1210	vn_syncer_add_to_worklist(vp, syncdelay);
	1211	}
	1212	splx(s);
	1213	}
	1214
	1215	/*
	1216	* Do soft update processing.
	1217	*/
	1218	if (bioops.io_sync)
	1219	(*bioops.io_sync)(NULL);
	1220
	1221	/*
	1222	* The variable rushjob allows the kernel to speed up the
	1223	* processing of the filesystem syncer process. A rushjob
	1224	* value of N tells the filesystem syncer to process the next
	1225	* N seconds worth of work on its queue ASAP. Currently rushjob
	1226	* is used by the soft update code to speed up the filesystem
	1227	* syncer process when the incore state is getting so far
	1228	* ahead of the disk that the kernel memory pool is being
	1229	* threatened with exhaustion.
	1230	*/
	1231	if (rushjob > 0) {
	1232	rushjob -= 1;
	1233	continue;
	1234	}
	1235	/*
	1236	* If it has taken us less than a second to process the
	1237	* current work, then wait. Otherwise start right over
	1238	* again. We can still lose time if any single round
	1239	* takes more than two seconds, but it does not really
	1240	* matter as we are just trying to generally pace the
	1241	* filesystem activity.
	1242	*/
	1243	if (time_second == starttime)
	1244	tsleep(&lbolt, 0, "syncer", 0);
	1245	}
	1246	}
	1247
	1248	/*
	1249	* Request the syncer daemon to speed up its work.
	1250	* We never push it to speed up more than half of its
	1251	* normal turn time, otherwise it could take over the cpu.
	1252	*
	1253	* YYY wchan field protected by the BGL.
	1254	*/
	1255	int
	1256	speedup_syncer()
	1257	{
	1258	crit_enter();
	1259	if (updatethread->td_wchan == &lbolt) { /* YYY */
	1260	unsleep(updatethread);
	1261	lwkt_schedule(updatethread);
	1262	}
	1263	crit_exit();
	1264	if (rushjob < syncdelay / 2) {
	1265	rushjob += 1;
	1266	stat_rush_requests += 1;
	1267	return (1);
	1268	}
	1269	return(0);
	1270	}
	1271
	1272	/*
	1273	* Associate a p-buffer with a vnode.
	1274	*
	1275	* Also sets B_PAGING flag to indicate that vnode is not fully associated
	1276	* with the buffer. i.e. the bp has not been linked into the vnode or
	1277	* ref-counted.
	1278	*/
	1279	void
	1280	pbgetvp(vp, bp)
	1281	struct vnode *vp;
	1282	struct buf *bp;
	1283	{
	1284
	1285	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
	1286
	1287	bp->b_vp = vp;
	1288	bp->b_flags \|= B_PAGING;
	1289	bp->b_dev = vn_todev(vp);
	1290	}
	1291
	1292	/*
	1293	* Disassociate a p-buffer from a vnode.
	1294	*/
	1295	void
	1296	pbrelvp(bp)
	1297	struct buf *bp;
	1298	{
	1299
	1300	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
	1301
	1302	/* XXX REMOVE ME */
	1303	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
	1304	panic(
	1305	"relpbuf(): b_vp was probably reassignbuf()d %p %x",
	1306	bp,
	1307	(int)bp->b_flags
	1308	);
	1309	}
	1310	bp->b_vp = (struct vnode *) 0;
	1311	bp->b_flags &= ~B_PAGING;
	1312	}
	1313
	1314	void
	1315	pbreassignbuf(bp, newvp)
	1316	struct buf *bp;
	1317	struct vnode *newvp;
	1318	{
	1319	if ((bp->b_flags & B_PAGING) == 0) {
	1320	panic(
	1321	"pbreassignbuf() on non phys bp %p",
	1322	bp
	1323	);
	1324	}
	1325	bp->b_vp = newvp;
	1326	}
	1327
	1328	/*
	1329	* Reassign a buffer from one vnode to another.
	1330	* Used to assign file specific control information
	1331	* (indirect blocks) to the vnode to which they belong.
	1332	*/
	1333	void
	1334	reassignbuf(bp, newvp)
	1335	struct buf *bp;
	1336	struct vnode *newvp;
	1337	{
	1338	struct buflists *listheadp;
	1339	int delay;
	1340	int s;
	1341
	1342	if (newvp == NULL) {
	1343	printf("reassignbuf: NULL");
	1344	return;
	1345	}
	1346	++reassignbufcalls;
	1347
	1348	/*
	1349	* B_PAGING flagged buffers cannot be reassigned because their vp
	1350	* is not fully linked in.
	1351	*/
	1352	if (bp->b_flags & B_PAGING)
	1353	panic("cannot reassign paging buffer");
	1354
	1355	s = splbio();
	1356	/*
	1357	* Delete from old vnode list, if on one.
	1358	*/
	1359	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	1360	if (bp->b_xflags & BX_VNDIRTY)
	1361	listheadp = &bp->b_vp->v_dirtyblkhd;
	1362	else
	1363	listheadp = &bp->b_vp->v_cleanblkhd;
	1364	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	1365	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	1366	if (bp->b_vp != newvp) {
	1367	vdrop(bp->b_vp);
	1368	bp->b_vp = NULL; /* for clarification */
	1369	}
	1370	}
	1371	/*
	1372	* If dirty, put on list of dirty buffers; otherwise insert onto list
	1373	* of clean buffers.
	1374	*/
	1375	if (bp->b_flags & B_DELWRI) {
	1376	struct buf *tbp;
	1377
	1378	listheadp = &newvp->v_dirtyblkhd;
	1379	if ((newvp->v_flag & VONWORKLST) == 0) {
	1380	switch (newvp->v_type) {
	1381	case VDIR:
	1382	delay = dirdelay;
	1383	break;
	1384	case VCHR:
	1385	case VBLK:
	1386	if (newvp->v_specmountpoint != NULL) {
	1387	delay = metadelay;
	1388	break;
	1389	}
	1390	/* fall through */
	1391	default:
	1392	delay = filedelay;
	1393	}
	1394	vn_syncer_add_to_worklist(newvp, delay);
	1395	}
	1396	bp->b_xflags \|= BX_VNDIRTY;
	1397	tbp = TAILQ_FIRST(listheadp);
	1398	if (tbp == NULL \|\|
	1399	bp->b_lblkno == 0 \|\|
	1400	(bp->b_lblkno > 0 && tbp->b_lblkno < 0) \|\|
	1401	(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
	1402	TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
	1403	++reassignbufsortgood;
	1404	} else if (bp->b_lblkno < 0) {
	1405	TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
	1406	++reassignbufsortgood;
	1407	} else if (reassignbufmethod == 1) {
	1408	/*
	1409	* New sorting algorithm, only handle sequential case,
	1410	* otherwise append to end (but before metadata)
	1411	*/
	1412	if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
	1413	(tbp->b_xflags & BX_VNDIRTY)) {
	1414	/*
	1415	* Found the best place to insert the buffer
	1416	*/
	1417	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1418	++reassignbufsortgood;
	1419	} else {
	1420	/*
	1421	* Missed, append to end, but before meta-data.
	1422	* We know that the head buffer in the list is
	1423	* not meta-data due to prior conditionals.
	1424	*
	1425	* Indirect effects: NFS second stage write
	1426	* tends to wind up here, giving maximum
	1427	* distance between the unstable write and the
	1428	* commit rpc.
	1429	*/
	1430	tbp = TAILQ_LAST(listheadp, buflists);
	1431	while (tbp && tbp->b_lblkno < 0)
	1432	tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
	1433	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1434	++reassignbufsortbad;
	1435	}
	1436	} else {
	1437	/*
	1438	* Old sorting algorithm, scan queue and insert
	1439	*/
	1440	struct buf *ttbp;
	1441	while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
	1442	(ttbp->b_lblkno < bp->b_lblkno)) {
	1443	++reassignbufloops;
	1444	tbp = ttbp;
	1445	}
	1446	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1447	}
	1448	} else {
	1449	bp->b_xflags \|= BX_VNCLEAN;
	1450	TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
	1451	if ((newvp->v_flag & VONWORKLST) &&
	1452	TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
	1453	newvp->v_flag &= ~VONWORKLST;
	1454	LIST_REMOVE(newvp, v_synclist);
	1455	}
	1456	}
	1457	if (bp->b_vp != newvp) {
	1458	bp->b_vp = newvp;
	1459	vhold(bp->b_vp);
	1460	}
	1461	splx(s);
	1462	}
	1463
	1464	/*
	1465	* Create a vnode for a block device.
	1466	* Used for mounting the root file system.
	1467	*/
	1468	int
	1469	bdevvp(dev, vpp)
	1470	dev_t dev;
	1471	struct vnode **vpp;
	1472	{
	1473	struct vnode *vp;
	1474	struct vnode *nvp;
	1475	int error;
	1476
	1477	if (dev == NODEV) {
	1478	*vpp = NULLVP;
	1479	return (ENXIO);
	1480	}
	1481	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	1482	if (error) {
	1483	*vpp = NULLVP;
	1484	return (error);
	1485	}
	1486	vp = nvp;
	1487	vp->v_type = VBLK;
	1488	addalias(vp, dev);
	1489	*vpp = vp;
	1490	return (0);
	1491	}
	1492
	1493	/*
	1494	* Add a vnode to the alias list hung off the dev_t.
	1495	*
	1496	* The reason for this gunk is that multiple vnodes can reference
	1497	* the same physical device, so checking vp->v_usecount to see
	1498	* how many users there are is inadequate; the v_usecount for
	1499	* the vnodes need to be accumulated. vcount() does that.
	1500	*/
	1501	void
	1502	addaliasu(struct vnode *nvp, udev_t nvp_rdev)
	1503	{
	1504	dev_t dev;
	1505
	1506	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	1507	panic("addaliasu on non-special vnode");
	1508	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
	1509	if (dev != NODEV) {
	1510	nvp->v_rdev = dev;
	1511	addalias(nvp, dev);
	1512	} else
	1513	nvp->v_rdev = NULL;
	1514	}
	1515
	1516	void
	1517	addalias(struct vnode *nvp, dev_t dev)
	1518	{
	1519
	1520	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	1521	panic("addalias on non-special vnode");
	1522
	1523	nvp->v_rdev = dev;
	1524	lwkt_gettoken(&spechash_token);
	1525	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	1526	lwkt_reltoken(&spechash_token);
	1527	}
	1528
	1529	/*
	1530	* Grab a particular vnode from the free list, increment its
	1531	* reference count and lock it. The vnode lock bit is set if the
	1532	* vnode is being eliminated in vgone. The process is awakened
	1533	* when the transition is completed, and an error returned to
	1534	* indicate that the vnode is no longer usable (possibly having
	1535	* been changed to a new file system type).
	1536	*/
	1537	int
	1538	vget(vp, flags, td)
	1539	struct vnode *vp;
	1540	int flags;
	1541	struct thread *td;
	1542	{
	1543	int error;
	1544
	1545	/*
	1546	* If the vnode is in the process of being cleaned out for
	1547	* another use, we wait for the cleaning to finish and then
	1548	* return failure. Cleaning is determined by checking that
	1549	* the VXLOCK flag is set.
	1550	*/
	1551	if (vp->v_flag & VXLOCK) {
	1552	if (vp->v_vxproc == curproc) {
	1553	#if 0
	1554	/* this can now occur in normal operation */
	1555	log(LOG_INFO, "VXLOCK interlock avoided\n");
	1556	#endif
	1557	} else {
	1558	vp->v_flag \|= VXWANT;
	1559	tsleep((caddr_t)vp, 0, "vget", 0);
	1560	return (ENOENT);
	1561	}
	1562	}
	1563
	1564	/*
	1565	* Bump v_usecount to prevent the vnode from being cleaned. If the
	1566	* vnode gets cleaned unexpectedly we could wind up calling lockmgr
	1567	* on a lock embedded in an inode which is then ripped out from
	1568	* it.
	1569	*/
	1570	vp->v_usecount++; /* XXX MP */
	1571
	1572	if ((flags & LK_INTERLOCK) == 0) {
	1573	lwkt_gettoken(&vp->v_interlock);
	1574	}
	1575
	1576	if (VSHOULDBUSY(vp))
	1577	vbusy(vp);
	1578	if (flags & LK_TYPE_MASK) {
	1579	if ((error = vn_lock(vp, flags \| LK_INTERLOCK, td)) != 0) {
	1580	/*
	1581	* must expand vrele here because we do not want
	1582	* to call VOP_INACTIVE if the reference count
	1583	* drops back to zero since it was never really
	1584	* active. We must remove it from the free list
	1585	* before sleeping so that multiple processes do
	1586	* not try to recycle it.
	1587	*/
	1588	lwkt_gettoken(&vp->v_interlock);
	1589	vp->v_usecount--;
	1590	if (VSHOULDFREE(vp))
	1591	vfree(vp);
	1592	else
	1593	vlruvp(vp);
	1594	lwkt_reltoken(&vp->v_interlock);
	1595	}
	1596	return (error);
	1597	}
	1598	lwkt_reltoken(&vp->v_interlock);
	1599	return (0);
	1600	}
	1601
	1602	void
	1603	vref(struct vnode *vp)
	1604	{
	1605	vp->v_usecount++; /* XXX MP */
	1606	}
	1607
	1608	/*
	1609	* Vnode put/release.
	1610	* If count drops to zero, call inactive routine and return to freelist.
	1611	*/
	1612	void
	1613	vrele(struct vnode *vp)
	1614	{
	1615	struct thread td = curthread; / XXX */
	1616
	1617	KASSERT(vp != NULL, ("vrele: null vp"));
	1618
	1619	lwkt_gettoken(&vp->v_interlock);
	1620
	1621	if (vp->v_usecount > 1) {
	1622
	1623	vp->v_usecount--;
	1624	lwkt_reltoken(&vp->v_interlock);
	1625
	1626	return;
	1627	}
	1628
	1629	if (vp->v_usecount == 1) {
	1630	vp->v_usecount--;
	1631	/*
	1632	* We must call VOP_INACTIVE with the node locked.
	1633	* If we are doing a vpu, the node is already locked,
	1634	* but, in the case of vrele, we must explicitly lock
	1635	* the vnode before calling VOP_INACTIVE
	1636	*/
	1637
	1638	if (vn_lock(vp, LK_EXCLUSIVE \| LK_INTERLOCK, td) == 0)
	1639	VOP_INACTIVE(vp, td);
	1640	if (VSHOULDFREE(vp))
	1641	vfree(vp);
	1642	else
	1643	vlruvp(vp);
	1644	} else {
	1645	#ifdef DIAGNOSTIC
	1646	vprint("vrele: negative ref count", vp);
	1647	lwkt_reltoken(&vp->v_interlock);
	1648	#endif
	1649	panic("vrele: negative ref cnt");
	1650	}
	1651	}
	1652
	1653	void
	1654	vput(struct vnode *vp)
	1655	{
	1656	struct thread td = curthread; / XXX */
	1657
	1658	KASSERT(vp != NULL, ("vput: null vp"));
	1659
	1660	lwkt_gettoken(&vp->v_interlock);
	1661
	1662	if (vp->v_usecount > 1) {
	1663	vp->v_usecount--;
	1664	VOP_UNLOCK(vp, LK_INTERLOCK, td);
	1665	return;
	1666	}
	1667
	1668	if (vp->v_usecount == 1) {
	1669	vp->v_usecount--;
	1670	/*
	1671	* We must call VOP_INACTIVE with the node locked.
	1672	* If we are doing a vpu, the node is already locked,
	1673	* so we just need to release the vnode mutex.
	1674	*/
	1675	lwkt_reltoken(&vp->v_interlock);
	1676	VOP_INACTIVE(vp, td);
	1677	if (VSHOULDFREE(vp))
	1678	vfree(vp);
	1679	else
	1680	vlruvp(vp);
	1681	} else {
	1682	#ifdef DIAGNOSTIC
	1683	vprint("vput: negative ref count", vp);
	1684	#endif
	1685	panic("vput: negative ref cnt");
	1686	}
	1687	}
	1688
	1689	/*
	1690	* Somebody doesn't want the vnode recycled.
	1691	*/
	1692	void
	1693	vhold(vp)
	1694	struct vnode *vp;
	1695	{
	1696	int s;
	1697
	1698	s = splbio();
	1699	vp->v_holdcnt++;
	1700	if (VSHOULDBUSY(vp))
	1701	vbusy(vp);
	1702	splx(s);
	1703	}
	1704
	1705	/*
	1706	* One less who cares about this vnode.
	1707	*/
	1708	void
	1709	vdrop(vp)
	1710	struct vnode *vp;
	1711	{
	1712	int s;
	1713
	1714	s = splbio();
	1715	if (vp->v_holdcnt <= 0)
	1716	panic("vdrop: holdcnt");
	1717	vp->v_holdcnt--;
	1718	if (VSHOULDFREE(vp))
	1719	vfree(vp);
	1720	splx(s);
	1721	}
	1722
	1723	/*
	1724	* Remove any vnodes in the vnode table belonging to mount point mp.
	1725	*
	1726	* If FORCECLOSE is not specified, there should not be any active ones,
	1727	* return error if any are found (nb: this is a user error, not a
	1728	* system error). If FORCECLOSE is specified, detach any active vnodes
	1729	* that are found.
	1730	*
	1731	* If WRITECLOSE is set, only flush out regular file vnodes open for
	1732	* writing.
	1733	*
	1734	* SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
	1735	*
	1736	* `rootrefs' specifies the base reference count for the root vnode
	1737	* of this filesystem. The root vnode is considered busy if its
	1738	* v_usecount exceeds this value. On a successful return, vflush()
	1739	* will call vrele() on the root vnode exactly rootrefs times.
	1740	* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
	1741	* be zero.
	1742	*/
	1743	#ifdef DIAGNOSTIC
	1744	static int busyprt = 0; /* print out busy vnodes */
	1745	SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
	1746	#endif
	1747
	1748	int
	1749	vflush(mp, rootrefs, flags)
	1750	struct mount *mp;
	1751	int rootrefs;
	1752	int flags;
	1753	{
	1754	struct thread td = curthread; / XXX */
	1755	struct vnode vp, nvp, *rootvp = NULL;
	1756	struct vattr vattr;
	1757	int busy = 0, error;
	1758
	1759	if (rootrefs > 0) {
	1760	KASSERT((flags & (SKIPSYSTEM \| WRITECLOSE)) == 0,
	1761	("vflush: bad args"));
	1762	/*
	1763	* Get the filesystem root vnode. We can vput() it
	1764	* immediately, since with rootrefs > 0, it won't go away.
	1765	*/
	1766	if ((error = VFS_ROOT(mp, &rootvp)) != 0)
	1767	return (error);
	1768	vput(rootvp);
	1769	}
	1770	lwkt_gettoken(&mntvnode_token);
	1771	loop:
	1772	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
	1773	/*
	1774	* Make sure this vnode wasn't reclaimed in getnewvnode().
	1775	* Start over if it has (it won't be on the list anymore).
	1776	*/
	1777	if (vp->v_mount != mp)
	1778	goto loop;
	1779	nvp = TAILQ_NEXT(vp, v_nmntvnodes);
	1780
	1781	lwkt_gettoken(&vp->v_interlock);
	1782	/*
	1783	* Skip over a vnodes marked VSYSTEM.
	1784	*/
	1785	if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
	1786	lwkt_reltoken(&vp->v_interlock);
	1787	continue;
	1788	}
	1789	/*
	1790	* If WRITECLOSE is set, flush out unlinked but still open
	1791	* files (even if open only for reading) and regular file
	1792	* vnodes open for writing.
	1793	*/
	1794	if ((flags & WRITECLOSE) &&
	1795	(vp->v_type == VNON \|\|
	1796	(VOP_GETATTR(vp, &vattr, td) == 0 &&
	1797	vattr.va_nlink > 0)) &&
	1798	(vp->v_writecount == 0 \|\| vp->v_type != VREG)) {
	1799	lwkt_reltoken(&vp->v_interlock);
	1800	continue;
	1801	}
	1802
	1803	/*
	1804	* With v_usecount == 0, all we need to do is clear out the
	1805	* vnode data structures and we are done.
	1806	*/
	1807	if (vp->v_usecount == 0) {
	1808	lwkt_reltoken(&mntvnode_token);
	1809	vgonel(vp, td);
	1810	lwkt_gettoken(&mntvnode_token);
	1811	continue;
	1812	}
	1813
	1814	/*
	1815	* If FORCECLOSE is set, forcibly close the vnode. For block
	1816	* or character devices, revert to an anonymous device. For
	1817	* all other files, just kill them.
	1818	*/
	1819	if (flags & FORCECLOSE) {
	1820	lwkt_reltoken(&mntvnode_token);
	1821	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	1822	vgonel(vp, td);
	1823	} else {
	1824	vclean(vp, 0, td);
	1825	vp->v_op = spec_vnodeop_p;
	1826	insmntque(vp, (struct mount *) 0);
	1827	}
	1828	lwkt_gettoken(&mntvnode_token);
	1829	continue;
	1830	}
	1831	#ifdef DIAGNOSTIC
	1832	if (busyprt)
	1833	vprint("vflush: busy vnode", vp);
	1834	#endif
	1835	lwkt_reltoken(&vp->v_interlock);
	1836	busy++;
	1837	}
	1838	lwkt_reltoken(&mntvnode_token);
	1839	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
	1840	/*
	1841	* If just the root vnode is busy, and if its refcount
	1842	* is equal to `rootrefs', then go ahead and kill it.
	1843	*/
	1844	lwkt_gettoken(&rootvp->v_interlock);
	1845	KASSERT(busy > 0, ("vflush: not busy"));
	1846	KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
	1847	if (busy == 1 && rootvp->v_usecount == rootrefs) {
	1848	vgonel(rootvp, td);
	1849	busy = 0;
	1850	} else
	1851	lwkt_reltoken(&rootvp->v_interlock);
	1852	}
	1853	if (busy)
	1854	return (EBUSY);
	1855	for (; rootrefs > 0; rootrefs--)
	1856	vrele(rootvp);
	1857	return (0);
	1858	}
	1859
	/*
	 * We do not want to recycle the vnode too quickly.
	 *
	 * Currently a no-op: the body is compiled out.
	 *
	 * XXX we can't move vp's around the nvnodelist without really screwing
	 * up the efficiency of filesystem SYNC and friends.  This code is
	 * disabled until we fix the syncing code's scanning algorithm.
	 */
	static void
	vlruvp(struct vnode *vp)
	{
	#if 0
		struct mount *mp;

		/* Move vp to the tail of its mount's vnode list. */
		if ((mp = vp->v_mount) != NULL) {
			lwkt_gettoken(&mntvnode_token);
			TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			lwkt_reltoken(&mntvnode_token);
		}
	#endif
	}
	1881
	1882	/*
	1883	* Disassociate the underlying file system from a vnode.
	1884	*/
	1885	static void
	1886	vclean(struct vnode vp, int flags, struct thread td)
	1887	{
	1888	int active;
	1889
	1890	/*
	1891	* Check to see if the vnode is in use. If so we have to reference it
	1892	* before we clean it out so that its count cannot fall to zero and
	1893	* generate a race against ourselves to recycle it.
	1894	*/
	1895	if ((active = vp->v_usecount))
	1896	vp->v_usecount++;
	1897
	1898	/*
	1899	* Prevent the vnode from being recycled or brought into use while we
	1900	* clean it out.
	1901	*/
	1902	if (vp->v_flag & VXLOCK)
	1903	panic("vclean: deadlock");
	1904	vp->v_flag \|= VXLOCK;
	1905	vp->v_vxproc = curproc;
	1906	/*
	1907	* Even if the count is zero, the VOP_INACTIVE routine may still
	1908	* have the object locked while it cleans it out. The VOP_LOCK
	1909	* ensures that the VOP_INACTIVE routine is done with its work.
	1910	* For active vnodes, it ensures that no other activity can
	1911	* occur while the underlying object is being cleaned out.
	1912	*/
	1913	VOP_LOCK(vp, LK_DRAIN \| LK_INTERLOCK, td);
	1914
	1915	/*
	1916	* Clean out any buffers associated with the vnode.
	1917	*/
	1918	vinvalbuf(vp, V_SAVE, td, 0, 0);
	1919
	1920	VOP_DESTROYVOBJECT(vp);
	1921
	1922	/*
	1923	* If purging an active vnode, it must be closed and
	1924	* deactivated before being reclaimed. Note that the
	1925	* VOP_INACTIVE will unlock the vnode.
	1926	*/
	1927	if (active) {
	1928	if (flags & DOCLOSE)
	1929	VOP_CLOSE(vp, FNONBLOCK, td);
	1930	VOP_INACTIVE(vp, td);
	1931	} else {
	1932	/*
	1933	* Any other processes trying to obtain this lock must first
	1934	* wait for VXLOCK to clear, then call the new lock operation.
	1935	*/
	1936	VOP_UNLOCK(vp, 0, td);
	1937	}
	1938	/*
	1939	* Reclaim the vnode.
	1940	*/
	1941	if (VOP_RECLAIM(vp, td))
	1942	panic("vclean: cannot reclaim");
	1943
	1944	if (active) {
	1945	/*
	1946	* Inline copy of vrele() since VOP_INACTIVE
	1947	* has already been called.
	1948	*/
	1949	lwkt_gettoken(&vp->v_interlock);
	1950	if (--vp->v_usecount <= 0) {
	1951	#ifdef DIAGNOSTIC
	1952	if (vp->v_usecount < 0 \|\| vp->v_writecount != 0) {
	1953	vprint("vclean: bad ref count", vp);
	1954	panic("vclean: ref cnt");
	1955	}
	1956	#endif
	1957	vfree(vp);
	1958	}
	1959	lwkt_reltoken(&vp->v_interlock);
	1960	}
	1961
	1962	cache_purge(vp);
	1963	vp->v_vnlock = NULL;
	1964
	1965	if (VSHOULDFREE(vp))
	1966	vfree(vp);
	1967
	1968	/*
	1969	* Done with purge, notify sleepers of the grim news.
	1970	*/
	1971	vp->v_op = dead_vnodeop_p;
	1972	vn_pollgone(vp);
	1973	vp->v_tag = VT_NON;
	1974	vp->v_flag &= ~VXLOCK;
	1975	vp->v_vxproc = NULL;
	1976	if (vp->v_flag & VXWANT) {
	1977	vp->v_flag &= ~VXWANT;
	1978	wakeup((caddr_t) vp);
	1979	}
	1980	}
	1981
	1982	/*
	1983	* Eliminate all activity associated with the requested vnode
	1984	* and with all vnodes aliased to the requested vnode.
	1985	*/
	1986	int
	1987	vop_revoke(ap)
	1988	struct vop_revoke_args /* {
	1989	struct vnode *a_vp;
	1990	int a_flags;
	1991	} / ap;
	1992	{
	1993	struct vnode vp, vq;
	1994	dev_t dev;
	1995
	1996	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
	1997
	1998	vp = ap->a_vp;
	1999	/*
	2000	* If a vgone (or vclean) is already in progress,
	2001	* wait until it is done and return.
	2002	*/
	2003	if (vp->v_flag & VXLOCK) {
	2004	vp->v_flag \|= VXWANT;
	2005	lwkt_reltoken(&vp->v_interlock);
	2006	tsleep((caddr_t)vp, 0, "vop_revokeall", 0);
	2007	return (0);
	2008	}
	2009	dev = vp->v_rdev;
	2010	for (;;) {
	2011	lwkt_gettoken(&spechash_token);
	2012	vq = SLIST_FIRST(&dev->si_hlist);
	2013	lwkt_reltoken(&spechash_token);
	2014	if (!vq)
	2015	break;
	2016	vgone(vq);
	2017	}
	2018	return (0);
	2019	}
	2020
	2021	/*
	2022	* Recycle an unused vnode to the front of the free list.
	2023	* Release the passed interlock if the vnode will be recycled.
	2024	*/
	2025	int
	2026	vrecycle(struct vnode vp, struct lwkt_token inter_lkp, struct thread *td)
	2027	{
	2028	lwkt_gettoken(&vp->v_interlock);
	2029	if (vp->v_usecount == 0) {
	2030	if (inter_lkp) {
	2031	lwkt_reltoken(inter_lkp);
	2032	}
	2033	vgonel(vp, td);
	2034	return (1);
	2035	}
	2036	lwkt_reltoken(&vp->v_interlock);
	2037	return (0);
	2038	}
	2039
	2040	/*
	2041	* Eliminate all activity associated with a vnode
	2042	* in preparation for reuse.
	2043	*/
	2044	void
	2045	vgone(struct vnode *vp)
	2046	{
	2047	struct thread td = curthread; / XXX */
	2048
	2049	lwkt_gettoken(&vp->v_interlock);
	2050	vgonel(vp, td);
	2051	}
	2052
	2053	/*
	2054	* vgone, with the vp interlock held.
	2055	*/
	2056	void
	2057	vgonel(struct vnode vp, struct thread td)
	2058	{
	2059	int s;
	2060
	2061	/*
	2062	* If a vgone (or vclean) is already in progress,
	2063	* wait until it is done and return.
	2064	*/
	2065	if (vp->v_flag & VXLOCK) {
	2066	vp->v_flag \|= VXWANT;
	2067	lwkt_reltoken(&vp->v_interlock);
	2068	tsleep((caddr_t)vp, 0, "vgone", 0);
	2069	return;
	2070	}
	2071
	2072	/*
	2073	* Clean out the filesystem specific data.
	2074	*/
	2075	vclean(vp, DOCLOSE, td);
	2076	lwkt_gettoken(&vp->v_interlock);
	2077
	2078	/*
	2079	* Delete from old mount point vnode list, if on one.
	2080	*/
	2081	if (vp->v_mount != NULL)
	2082	insmntque(vp, (struct mount *)0);
	2083	/*
	2084	* If special device, remove it from special device alias list
	2085	* if it is on one.
	2086	*/
	2087	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_rdev != NULL) {
	2088	lwkt_gettoken(&spechash_token);
	2089	SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
	2090	freedev(vp->v_rdev);
	2091	lwkt_reltoken(&spechash_token);
	2092	vp->v_rdev = NULL;
	2093	}
	2094
	2095	/*
	2096	* If it is on the freelist and not already at the head,
	2097	* move it to the head of the list. The test of the
	2098	* VDOOMED flag and the reference count of zero is because
	2099	* it will be removed from the free list by getnewvnode,
	2100	* but will not have its reference count incremented until
	2101	* after calling vgone. If the reference count were
	2102	* incremented first, vgone would (incorrectly) try to
	2103	* close the previous instance of the underlying object.
	2104	*/
	2105	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
	2106	s = splbio();
	2107	lwkt_gettoken(&vnode_free_list_token);
	2108	if (vp->v_flag & VFREE)
	2109	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	2110	else
	2111	freevnodes++;
	2112	vp->v_flag \|= VFREE;
	2113	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	2114	lwkt_reltoken(&vnode_free_list_token);
	2115	splx(s);
	2116	}
	2117
	2118	vp->v_type = VBAD;
	2119	lwkt_reltoken(&vp->v_interlock);
	2120	}
	2121
	2122	/*
	2123	* Lookup a vnode by device number.
	2124	*/
	2125	int
	2126	vfinddev(dev, type, vpp)
	2127	dev_t dev;
	2128	enum vtype type;
	2129	struct vnode **vpp;
	2130	{
	2131	struct vnode *vp;
	2132
	2133	lwkt_gettoken(&spechash_token);
	2134	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	2135	if (type == vp->v_type) {
	2136	*vpp = vp;
	2137	lwkt_reltoken(&spechash_token);
	2138	return (1);
	2139	}
	2140	}
	2141	lwkt_reltoken(&spechash_token);
	2142	return (0);
	2143	}
	2144
	2145	/*
	2146	* Calculate the total number of references to a special device.
	2147	*/
	2148	int
	2149	vcount(vp)
	2150	struct vnode *vp;
	2151	{
	2152	struct vnode *vq;
	2153	int count;
	2154
	2155	count = 0;
	2156	lwkt_gettoken(&spechash_token);
	2157	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
	2158	count += vq->v_usecount;
	2159	lwkt_reltoken(&spechash_token);
	2160	return (count);
	2161	}
	2162
	2163	/*
	2164	* Same as above, but using the dev_t as argument
	2165	*/
	2166
	2167	int
	2168	count_dev(dev)
	2169	dev_t dev;
	2170	{
	2171	struct vnode *vp;
	2172
	2173	vp = SLIST_FIRST(&dev->si_hlist);
	2174	if (vp == NULL)
	2175	return (0);
	2176	return(vcount(vp));
	2177	}
	2178
	2179	/*
	2180	* Print out a description of a vnode.
	2181	*/
	2182	static char *typename[] =
	2183	{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
	2184
	2185	void
	2186	vprint(label, vp)
	2187	char *label;
	2188	struct vnode *vp;
	2189	{
	2190	char buf[96];
	2191
	2192	if (label != NULL)
	2193	printf("%s: %p: ", label, (void *)vp);
	2194	else
	2195	printf("%p: ", (void *)vp);
	2196	printf("type %s, usecount %d, writecount %d, refcount %d,",
	2197	typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	2198	vp->v_holdcnt);
	2199	buf[0] = '\0';
	2200	if (vp->v_flag & VROOT)
	2201	strcat(buf, "\|VROOT");
	2202	if (vp->v_flag & VTEXT)
	2203	strcat(buf, "\|VTEXT");
	2204	if (vp->v_flag & VSYSTEM)
	2205	strcat(buf, "\|VSYSTEM");
	2206	if (vp->v_flag & VXLOCK)
	2207	strcat(buf, "\|VXLOCK");
	2208	if (vp->v_flag & VXWANT)
	2209	strcat(buf, "\|VXWANT");
	2210	if (vp->v_flag & VBWAIT)
	2211	strcat(buf, "\|VBWAIT");
	2212	if (vp->v_flag & VDOOMED)
	2213	strcat(buf, "\|VDOOMED");
	2214	if (vp->v_flag & VFREE)
	2215	strcat(buf, "\|VFREE");
	2216	if (vp->v_flag & VOBJBUF)
	2217	strcat(buf, "\|VOBJBUF");
	2218	if (buf[0] != '\0')
	2219	printf(" flags (%s)", &buf[1]);
	2220	if (vp->v_data == NULL) {
	2221	printf("\n");
	2222	} else {
	2223	printf("\n\t");
	2224	VOP_PRINT(vp);
	2225	}
	2226	}
	2227
	#ifdef DDB
	#include <ddb/ddb.h>
	/*
	 * "show lockedvnodes": list all of the locked vnodes in the system.
	 * Called when debugging the kernel.
	 *
	 * Every mount on mountlist that can be busied without waiting is
	 * scanned, and each vnode for which VOP_ISLOCKED() is true is printed
	 * with vprint().  Mounts that cannot be busied are skipped.
	 */
	DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
	{
		struct thread *td = curthread;	/* XXX */
		struct mount *mp, *next;
		struct vnode *vp;

		printf("Locked vnodes\n");
		lwkt_gettoken(&mountlist_token);
		mp = TAILQ_FIRST(&mountlist);
		while (mp != NULL) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td) == 0) {
				TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
					if (VOP_ISLOCKED(vp, NULL))
						vprint((char *)0, vp);
				}
				/*
				 * NOTE(review): the token is taken again here,
				 * presumably because a successful vfs_busy()
				 * dropped it -- confirm against vfs_busy().
				 */
				lwkt_gettoken(&mountlist_token);
				next = TAILQ_NEXT(mp, mnt_list);
				vfs_unbusy(mp, td);
			} else {
				next = TAILQ_NEXT(mp, mnt_list);
			}
			mp = next;
		}
		lwkt_reltoken(&mountlist_token);
	}
	#endif
	2258
	2259	/*
	2260	* Top level filesystem related information gathering.
	2261	*/
	2262	static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
	2263
	2264	static int
	2265	vfs_sysctl(SYSCTL_HANDLER_ARGS)
	2266	{
	2267	int name = (int )arg1 - 1; /* XXX */
	2268	u_int namelen = arg2 + 1; /* XXX */
	2269	struct vfsconf *vfsp;
	2270
	2271	#if 1 \|\| defined(COMPAT_PRELITE2)
	2272	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	2273	if (namelen == 1)
	2274	return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
	2275	#endif
	2276
	2277	#ifdef notyet
	2278	/* all sysctl names at this level are at least name and field */
	2279	if (namelen < 2)
	2280	return (ENOTDIR); /* overloaded */
	2281	if (name[0] != VFS_GENERIC) {
	2282	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	2283	if (vfsp->vfc_typenum == name[0])
	2284	break;
	2285	if (vfsp == NULL)
	2286	return (EOPNOTSUPP);
	2287	return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
	2288	oldp, oldlenp, newp, newlen, p));
	2289	}
	2290	#endif
	2291	switch (name[1]) {
	2292	case VFS_MAXTYPENUM:
	2293	if (namelen != 2)
	2294	return (ENOTDIR);
	2295	return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	2296	case VFS_CONF:
	2297	if (namelen != 3)
	2298	return (ENOTDIR); /* overloaded */
	2299	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	2300	if (vfsp->vfc_typenum == name[2])
	2301	break;
	2302	if (vfsp == NULL)
	2303	return (EOPNOTSUPP);
	2304	return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	2305	}
	2306	return (EOPNOTSUPP);
	2307	}
	2308
	2309	SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	2310	"Generic filesystem");
	2311
	2312	#if 1 \|\| defined(COMPAT_PRELITE2)
	2313
	2314	static int
	2315	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
	2316	{
	2317	int error;
	2318	struct vfsconf *vfsp;
	2319	struct ovfsconf ovfs;
	2320
	2321	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	2322	ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
	2323	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	2324	ovfs.vfc_index = vfsp->vfc_typenum;
	2325	ovfs.vfc_refcount = vfsp->vfc_refcount;
	2326	ovfs.vfc_flags = vfsp->vfc_flags;
	2327	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	2328	if (error)
	2329	return error;
	2330	}
	2331	return 0;
	2332	}
	2333
	2334	#endif /* 1 \|\| COMPAT_PRELITE2 */
	2335
	#if 0
	#define KINFO_VNODESLOP	10
	/*
	 * Dump vnode list (via sysctl).
	 * Copyout address of vnode followed by vnode.
	 *
	 * When no output buffer is supplied only a size estimate is returned.
	 * Mounts that cannot be busied without waiting are skipped.
	 *
	 * NOTE: this code is compiled out (#if 0).
	 */
	/* ARGSUSED */
	static int
	sysctl_vnode(SYSCTL_HANDLER_ARGS)
	{
		struct proc *p = curproc;	/* XXX */
		struct mount *mp, *nmp;
		struct vnode *nvp, *vp;
		int error;

	#define VPTRSZ	sizeof (struct vnode *)
	#define VNODESZ	sizeof (struct vnode)

		req->lock = 0;
		if (!req->oldptr) /* Make an estimate */
			return (SYSCTL_OUT(req, 0,
			    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

		lwkt_gettoken(&mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, p)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
	again:
			lwkt_gettoken(&mntvnode_token);
			for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
			     vp != NULL;
			     vp = nvp) {
				/*
				 * Check that the vp is still associated with
				 * this filesystem.  RACE: could have been
				 * recycled onto the same filesystem.
				 */
				if (vp->v_mount != mp) {
					lwkt_reltoken(&mntvnode_token);
					goto again;
				}
				nvp = TAILQ_NEXT(vp, v_nmntvnodes);
				lwkt_reltoken(&mntvnode_token);
				if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
				    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
					/*
					 * Do not leave the mount busied when
					 * bailing out on a copyout error.
					 */
					vfs_unbusy(mp, p);
					return (error);
				}
				lwkt_gettoken(&mntvnode_token);
			}
			lwkt_reltoken(&mntvnode_token);
			lwkt_gettoken(&mountlist_token);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, p);
		}
		lwkt_reltoken(&mountlist_token);

		return (0);
	}
	#endif

	/*
	 * XXX
	 * Exporting the vnode list on large systems causes them to crash.
	 * Exporting the vnode list on medium systems causes sysctl to coredump.
	 */
	#if 0
	SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
		0, 0, sysctl_vnode, "S,vnode", "");
	#endif
	2406
	2407	/*
	2408	* Check to see if a filesystem is mounted on a block device.
	2409	*/
	2410	int
	2411	vfs_mountedon(vp)
	2412	struct vnode *vp;
	2413	{
	2414
	2415	if (vp->v_specmountpoint != NULL)
	2416	return (EBUSY);
	2417	return (0);
	2418	}
	2419
	2420	/*
	2421	* Unmount all filesystems. The list is traversed in reverse order
	2422	* of mounting to avoid dependencies.
	2423	*/
	2424	void
	2425	vfs_unmountall()
	2426	{
	2427	struct mount *mp;
	2428	struct thread *td = curthread;
	2429	int error;
	2430
	2431	if (td->td_proc == NULL)
	2432	td = initproc->p_thread; /* XXX XXX use proc0 instead? */
	2433
	2434	/*
	2435	* Since this only runs when rebooting, it is not interlocked.
	2436	*/
	2437	while(!TAILQ_EMPTY(&mountlist)) {
	2438	mp = TAILQ_LAST(&mountlist, mntlist);
	2439	error = dounmount(mp, MNT_FORCE, td);
	2440	if (error) {
	2441	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	2442	printf("unmount of %s failed (",
	2443	mp->mnt_stat.f_mntonname);
	2444	if (error == EBUSY)
	2445	printf("BUSY)\n");
	2446	else
	2447	printf("%d)\n", error);
	2448	} else {
	2449	/* The unmount has removed mp from the mountlist */
	2450	}
	2451	}
	2452	}
	2453
	2454	/*
	2455	* Build hash lists of net addresses and hang them off the mount point.
	2456	* Called by ufs_mount() to set up the lists of export addresses.
	2457	*/
	2458	static int
	2459	vfs_hang_addrlist(mp, nep, argp)
	2460	struct mount *mp;
	2461	struct netexport *nep;
	2462	struct export_args *argp;
	2463	{
	2464	struct netcred *np;
	2465	struct radix_node_head *rnh;
	2466	int i;
	2467	struct radix_node *rn;
	2468	struct sockaddr saddr, smask = 0;
	2469	struct domain *dom;
	2470	int error;
	2471
	2472	if (argp->ex_addrlen == 0) {
	2473	if (mp->mnt_flag & MNT_DEFEXPORTED)
	2474	return (EPERM);
	2475	np = &nep->ne_defexported;
	2476	np->netc_exflags = argp->ex_flags;
	2477	np->netc_anon = argp->ex_anon;
	2478	np->netc_anon.cr_ref = 1;
	2479	mp->mnt_flag \|= MNT_DEFEXPORTED;
	2480	return (0);
	2481	}
	2482
	2483	if (argp->ex_addrlen < 0 \|\| argp->ex_addrlen > MLEN)
	2484	return (EINVAL);
	2485	if (argp->ex_masklen < 0 \|\| argp->ex_masklen > MLEN)
	2486	return (EINVAL);
	2487
	2488	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	2489	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	2490	bzero((caddr_t) np, i);
	2491	saddr = (struct sockaddr *) (np + 1);
	2492	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
	2493	goto out;
	2494	if (saddr->sa_len > argp->ex_addrlen)
	2495	saddr->sa_len = argp->ex_addrlen;
	2496	if (argp->ex_masklen) {
	2497	smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
	2498	error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
	2499	if (error)
	2500	goto out;
	2501	if (smask->sa_len > argp->ex_masklen)
	2502	smask->sa_len = argp->ex_masklen;
	2503	}
	2504	i = saddr->sa_family;
	2505	if ((rnh = nep->ne_rtable[i]) == 0) {
	2506	/*
	2507	* Seems silly to initialize every AF when most are not used,
	2508	* do so on demand here
	2509	*/
	2510	for (dom = domains; dom; dom = dom->dom_next)
	2511	if (dom->dom_family == i && dom->dom_rtattach) {
	2512	dom->dom_rtattach((void **) &nep->ne_rtable[i],
	2513	dom->dom_rtoffset);
	2514	break;
	2515	}
	2516	if ((rnh = nep->ne_rtable[i]) == 0) {
	2517	error = ENOBUFS;
	2518	goto out;
	2519	}
	2520	}
	2521	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	2522	np->netc_rnodes);
	2523	if (rn == 0 \|\| np != (struct netcred ) rn) { / already exists */
	2524	error = EPERM;
	2525	goto out;
	2526	}
	2527	np->netc_exflags = argp->ex_flags;
	2528	np->netc_anon = argp->ex_anon;
	2529	np->netc_anon.cr_ref = 1;
	2530	return (0);
	2531	out:
	2532	free(np, M_NETADDR);
	2533	return (error);
	2534	}
	2535
	2536	/* ARGSUSED */
	2537	static int
	2538	vfs_free_netcred(rn, w)
	2539	struct radix_node *rn;
	2540	void *w;
	2541	{
	2542	struct radix_node_head rnh = (struct radix_node_head ) w;
	2543
	2544	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	2545	free((caddr_t) rn, M_NETADDR);
	2546	return (0);
	2547	}
	2548
	2549	/*
	2550	* Free the net address hash lists that are hanging off the mount points.
	2551	*/
	2552	static void
	2553	vfs_free_addrlist(nep)
	2554	struct netexport *nep;
	2555	{
	2556	int i;
	2557	struct radix_node_head *rnh;
	2558
	2559	for (i = 0; i <= AF_MAX; i++)
	2560	if ((rnh = nep->ne_rtable[i])) {
	2561	(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
	2562	(caddr_t) rnh);
	2563	free((caddr_t) rnh, M_RTABLE);
	2564	nep->ne_rtable[i] = 0;
	2565	}
	2566	}
	2567
	2568	int
	2569	vfs_export(mp, nep, argp)
	2570	struct mount *mp;
	2571	struct netexport *nep;
	2572	struct export_args *argp;
	2573	{
	2574	int error;
	2575
	2576	if (argp->ex_flags & MNT_DELEXPORT) {
	2577	if (mp->mnt_flag & MNT_EXPUBLIC) {
	2578	vfs_setpublicfs(NULL, NULL, NULL);
	2579	mp->mnt_flag &= ~MNT_EXPUBLIC;
	2580	}
	2581	vfs_free_addrlist(nep);
	2582	mp->mnt_flag &= ~(MNT_EXPORTED \| MNT_DEFEXPORTED);
	2583	}
	2584	if (argp->ex_flags & MNT_EXPORTED) {
	2585	if (argp->ex_flags & MNT_EXPUBLIC) {
	2586	if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
	2587	return (error);
	2588	mp->mnt_flag \|= MNT_EXPUBLIC;
	2589	}
	2590	if ((error = vfs_hang_addrlist(mp, nep, argp)))
	2591	return (error);
	2592	mp->mnt_flag \|= MNT_EXPORTED;
	2593	}
	2594	return (0);
	2595	}
	2596
	2597
	2598	/*
	2599	* Set the publicly exported filesystem (WebNFS). Currently, only
	2600	* one public filesystem is possible in the spec (RFC 2054 and 2055)
	2601	*/
	2602	int
	2603	vfs_setpublicfs(mp, nep, argp)
	2604	struct mount *mp;
	2605	struct netexport *nep;
	2606	struct export_args *argp;
	2607	{
	2608	int error;
	2609	struct vnode *rvp;
	2610	char *cp;
	2611
	2612	/*
	2613	* mp == NULL -> invalidate the current info, the FS is
	2614	* no longer exported. May be called from either vfs_export
	2615	* or unmount, so check if it hasn't already been done.
	2616	*/
	2617	if (mp == NULL) {
	2618	if (nfs_pub.np_valid) {
	2619	nfs_pub.np_valid = 0;
	2620	if (nfs_pub.np_index != NULL) {
	2621	FREE(nfs_pub.np_index, M_TEMP);
	2622	nfs_pub.np_index = NULL;
	2623	}
	2624	}
	2625	return (0);
	2626	}
	2627
	2628	/*
	2629	* Only one allowed at a time.
	2630	*/
	2631	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
	2632	return (EBUSY);
	2633
	2634	/*
	2635	* Get real filehandle for root of exported FS.
	2636	*/
	2637	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	2638	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
	2639
	2640	if ((error = VFS_ROOT(mp, &rvp)))
	2641	return (error);
	2642
	2643	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
	2644	return (error);
	2645
	2646	vput(rvp);
	2647
	2648	/*
	2649	* If an indexfile was specified, pull it in.
	2650	*/
	2651	if (argp->ex_indexfile != NULL) {
	2652	MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
	2653	M_WAITOK);
	2654	error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
	2655	MAXNAMLEN, (size_t *)0);
	2656	if (!error) {
	2657	/*
	2658	* Check for illegal filenames.
	2659	*/
	2660	for (cp = nfs_pub.np_index; *cp; cp++) {
	2661	if (*cp == '/') {
	2662	error = EINVAL;
	2663	break;
	2664	}
	2665	}
	2666	}
	2667	if (error) {
	2668	FREE(nfs_pub.np_index, M_TEMP);
	2669	return (error);
	2670	}
	2671	}
	2672
	2673	nfs_pub.np_mount = mp;
	2674	nfs_pub.np_valid = 1;
	2675	return (0);
	2676	}
	2677
	2678	struct netcred *
	2679	vfs_export_lookup(mp, nep, nam)
	2680	struct mount *mp;
	2681	struct netexport *nep;
	2682	struct sockaddr *nam;
	2683	{
	2684	struct netcred *np;
	2685	struct radix_node_head *rnh;
	2686	struct sockaddr *saddr;
	2687
	2688	np = NULL;
	2689	if (mp->mnt_flag & MNT_EXPORTED) {
	2690	/*
	2691	* Lookup in the export list first.
	2692	*/
	2693	if (nam != NULL) {
	2694	saddr = nam;
	2695	rnh = nep->ne_rtable[saddr->sa_family];
	2696	if (rnh != NULL) {
	2697	np = (struct netcred *)
	2698	(*rnh->rnh_matchaddr)((caddr_t)saddr,
	2699	rnh);
	2700	if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
	2701	np = NULL;
	2702	}
	2703	}
	2704	/*
	2705	* If no address match, use the default if it exists.
	2706	*/
	2707	if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
	2708	np = &nep->ne_defexported;
	2709	}
	2710	return (np);
	2711	}
	2712
	2713	/*
	2714	* perform msync on all vnodes under a mount point
	2715	* the mount point must be locked.
	2716	*/
	2717	void
	2718	vfs_msync(struct mount *mp, int flags)
	2719	{
	2720	struct thread td = curthread; / XXX */
	2721	struct vnode vp, nvp;
	2722	struct vm_object *obj;
	2723	int tries;
	2724
	2725	tries = 5;
	2726	lwkt_gettoken(&mntvnode_token);
	2727	loop:
	2728	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
	2729	if (vp->v_mount != mp) {
	2730	if (--tries > 0)
	2731	goto loop;
	2732	break;
	2733	}
	2734	nvp = TAILQ_NEXT(vp, v_nmntvnodes);
	2735
	2736	if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
	2737	continue;
	2738
	2739	/*
	2740	* There could be hundreds of thousands of vnodes, we cannot
	2741	* afford to do anything heavy-weight until we have a fairly
	2742	* good indication that there is something to do.
	2743	*/
	2744	if ((vp->v_flag & VOBJDIRTY) &&
	2745	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	2746	lwkt_reltoken(&mntvnode_token);
	2747	if (!vget(vp,
	2748	LK_EXCLUSIVE \| LK_RETRY \| LK_NOOBJ, td)) {
	2749	if (VOP_GETVOBJECT(vp, &obj) == 0) {
	2750	vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
	2751	}
	2752	vput(vp);
	2753	}
	2754	lwkt_gettoken(&mntvnode_token);
	2755	if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
	2756	if (--tries > 0)
	2757	goto loop;
	2758	break;
	2759	}
	2760	}
	2761	}
	2762	lwkt_reltoken(&mntvnode_token);
	2763	}
	2764
	/*
	 * Create the VM object needed for VMIO and mmap support.  This
	 * is done for all VREG files in the system.  Some filesystems might
	 * afford the additional metadata buffering capability of the
	 * VMIO code by making the device node be VMIO mode also.
	 *
	 * vp must be locked when vfs_object_create is called.
	 *
	 * The work is delegated to VOP_CREATEVOBJECT(), whose result is
	 * returned unchanged.
	 */
	int
	vfs_object_create(struct vnode *vp, struct thread *td)
	{
		int error;

		error = VOP_CREATEVOBJECT(vp, td);
		return (error);
	}
	2778
	2779	void
	2780	vfree(vp)
	2781	struct vnode *vp;
	2782	{
	2783	int s;
	2784
	2785	s = splbio();
	2786	lwkt_gettoken(&vnode_free_list_token);
	2787	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	2788	if (vp->v_flag & VAGE) {
	2789	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	2790	} else {
	2791	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	2792	}
	2793	freevnodes++;
	2794	lwkt_reltoken(&vnode_free_list_token);
	2795	vp->v_flag &= ~VAGE;
	2796	vp->v_flag \|= VFREE;
	2797	splx(s);
	2798	}
	2799
	2800	void
	2801	vbusy(vp)
	2802	struct vnode *vp;
	2803	{
	2804	int s;
	2805
	2806	s = splbio();
	2807	lwkt_gettoken(&vnode_free_list_token);
	2808	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	2809	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	2810	freevnodes--;
	2811	lwkt_reltoken(&vnode_free_list_token);
	2812	vp->v_flag &= ~(VFREE\|VAGE);
	2813	splx(s);
	2814	}
	2815
	2816	/*
	2817	* Record a process's interest in events which might happen to
	2818	* a vnode. Because poll uses the historic select-style interface
	2819	* internally, this routine serves as both the ``check for any
	2820	* pending events'' and the ``record my interest in future events''
	2821	* functions. (These are done together, while the lock is held,
	2822	* to avoid race conditions.)
	2823	*/
	2824	int
	2825	vn_pollrecord(struct vnode vp, struct thread td, int events)
	2826	{
	2827	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2828	if (vp->v_pollinfo.vpi_revents & events) {
	2829	/*
	2830	* This leaves events we are not interested
	2831	* in available for the other process which
	2832	* which presumably had requested them
	2833	* (otherwise they would never have been
	2834	* recorded).
	2835	*/
	2836	events &= vp->v_pollinfo.vpi_revents;
	2837	vp->v_pollinfo.vpi_revents &= ~events;
	2838
	2839	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2840	return events;
	2841	}
	2842	vp->v_pollinfo.vpi_events \|= events;
	2843	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	2844	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2845	return 0;
	2846	}
	2847
	2848	/*
	2849	* Note the occurrence of an event. If the VN_POLLEVENT macro is used,
	2850	* it is possible for us to miss an event due to race conditions, but
	2851	* that condition is expected to be rare, so for the moment it is the
	2852	* preferred interface.
	2853	*/
	2854	void
	2855	vn_pollevent(vp, events)
	2856	struct vnode *vp;
	2857	short events;
	2858	{
	2859	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2860	if (vp->v_pollinfo.vpi_events & events) {
	2861	/*
	2862	* We clear vpi_events so that we don't
	2863	* call selwakeup() twice if two events are
	2864	* posted before the polling process(es) is
	2865	* awakened. This also ensures that we take at
	2866	* most one selwakeup() if the polling process
	2867	* is no longer interested. However, it does
	2868	* mean that only one event can be noticed at
	2869	* a time. (Perhaps we should only clear those
	2870	* event bits which we note?) XXX
	2871	*/
	2872	vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
	2873	vp->v_pollinfo.vpi_revents \|= events;
	2874	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	2875	}
	2876	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2877	}
	2878
	2879	/*
	2880	* Wake up anyone polling on vp because it is being revoked.
	2881	* This depends on dead_poll() returning POLLHUP for correct
	2882	* behavior.
	2883	*/
	2884	void
	2885	vn_pollgone(vp)
	2886	struct vnode *vp;
	2887	{
	2888	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2889	if (vp->v_pollinfo.vpi_events) {
	2890	vp->v_pollinfo.vpi_events = 0;
	2891	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	2892	}
	2893	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2894	}
	2895
	2896
	2897
	2898	/*
	2899	* Routine to create and manage a filesystem syncer vnode.
	2900	*/
	2901	#define sync_close ((int () (struct vop_close_args ))nullop)
	2902	static int sync_fsync (struct vop_fsync_args *);
	2903	static int sync_inactive (struct vop_inactive_args *);
	2904	static int sync_reclaim (struct vop_reclaim_args *);
	2905	#define sync_lock ((int () (struct vop_lock_args ))vop_nolock)
	2906	#define sync_unlock ((int () (struct vop_unlock_args ))vop_nounlock)
	2907	static int sync_print (struct vop_print_args *);
	2908	#define sync_islocked ((int() (struct vop_islocked_args ))vop_noislocked)
	2909
	2910	static vop_t **sync_vnodeop_p;
	2911	static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	2912	{ &vop_default_desc, (vop_t *) vop_eopnotsupp },
	2913	{ &vop_close_desc, (vop_t ) sync_close }, / close */
	2914	{ &vop_fsync_desc, (vop_t ) sync_fsync }, / fsync */
	2915	{ &vop_inactive_desc, (vop_t ) sync_inactive }, / inactive */
	2916	{ &vop_reclaim_desc, (vop_t ) sync_reclaim }, / reclaim */
	2917	{ &vop_lock_desc, (vop_t ) sync_lock }, / lock */
	2918	{ &vop_unlock_desc, (vop_t ) sync_unlock }, / unlock */
	2919	{ &vop_print_desc, (vop_t ) sync_print }, / print */
	2920	{ &vop_islocked_desc, (vop_t ) sync_islocked }, / islocked */
	2921	{ NULL, NULL }
	2922	};
	2923	static struct vnodeopv_desc sync_vnodeop_opv_desc =
	2924	{ &sync_vnodeop_p, sync_vnodeop_entries };
	2925
	2926	VNODEOP_SET(sync_vnodeop_opv_desc);
	2927
	2928	/*
	2929	* Create a new filesystem syncer vnode for the specified mount point.
	2930	*/
	2931	int
	2932	vfs_allocate_syncvnode(mp)
	2933	struct mount *mp;
	2934	{
	2935	struct vnode *vp;
	2936	static long start, incr, next;
	2937	int error;
	2938
	2939	/* Allocate a new vnode */
	2940	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
	2941	mp->mnt_syncer = NULL;
	2942	return (error);
	2943	}
	2944	vp->v_type = VNON;
	2945	/*
	2946	* Place the vnode onto the syncer worklist. We attempt to
	2947	* scatter them about on the list so that they will go off
	2948	* at evenly distributed times even if all the filesystems
	2949	* are mounted at once.
	2950	*/
	2951	next += incr;
	2952	if (next == 0 \|\| next > syncer_maxdelay) {
	2953	start /= 2;
	2954	incr /= 2;
	2955	if (start == 0) {
	2956	start = syncer_maxdelay / 2;
	2957	incr = syncer_maxdelay;
	2958	}
	2959	next = start;
	2960	}
	2961	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	2962	mp->mnt_syncer = vp;
	2963	return (0);
	2964	}
	2965
	2966	/*
	2967	* Do a lazy sync of the filesystem.
	2968	*/
	2969	static int
	2970	sync_fsync(ap)
	2971	struct vop_fsync_args /* {
	2972	struct vnode *a_vp;
	2973	struct ucred *a_cred;
	2974	int a_waitfor;
	2975	struct thread *a_td;
	2976	} / ap;
	2977	{
	2978	struct vnode *syncvp = ap->a_vp;
	2979	struct mount *mp = syncvp->v_mount;
	2980	struct thread *td = ap->a_td;
	2981	int asyncflag;
	2982
	2983	/*
	2984	* We only need to do something if this is a lazy evaluation.
	2985	*/
	2986	if (ap->a_waitfor != MNT_LAZY)
	2987	return (0);
	2988
	2989	/*
	2990	* Move ourselves to the back of the sync list.
	2991	*/
	2992	vn_syncer_add_to_worklist(syncvp, syncdelay);
	2993
	2994	/*
	2995	* Walk the list of vnodes pushing all that are dirty and
	2996	* not already on the sync list.
	2997	*/
	2998	lwkt_gettoken(&mountlist_token);
	2999	if (vfs_busy(mp, LK_EXCLUSIVE \| LK_NOWAIT, &mountlist_token, td) != 0) {
	3000	lwkt_reltoken(&mountlist_token);
	3001	return (0);
	3002	}
	3003	asyncflag = mp->mnt_flag & MNT_ASYNC;
	3004	mp->mnt_flag &= ~MNT_ASYNC;
	3005	vfs_msync(mp, MNT_NOWAIT);
	3006	VFS_SYNC(mp, MNT_LAZY, td);
	3007	if (asyncflag)
	3008	mp->mnt_flag \|= MNT_ASYNC;
	3009	vfs_unbusy(mp, td);
	3010	return (0);
	3011	}
	3012
	3013	/*
	3014	* The syncer vnode is no referenced.
	3015	*/
	3016	static int
	3017	sync_inactive(ap)
	3018	struct vop_inactive_args /* {
	3019	struct vnode *a_vp;
	3020	struct proc *a_p;
	3021	} / ap;
	3022	{
	3023
	3024	vgone(ap->a_vp);
	3025	return (0);
	3026	}
	3027
	3028	/*
	3029	* The syncer vnode is no longer needed and is being decommissioned.
	3030	*
	3031	* Modifications to the worklist must be protected at splbio().
	3032	*/
	3033	static int
	3034	sync_reclaim(ap)
	3035	struct vop_reclaim_args /* {
	3036	struct vnode *a_vp;
	3037	} / ap;
	3038	{
	3039	struct vnode *vp = ap->a_vp;
	3040	int s;
	3041
	3042	s = splbio();
	3043	vp->v_mount->mnt_syncer = NULL;
	3044	if (vp->v_flag & VONWORKLST) {
	3045	LIST_REMOVE(vp, v_synclist);
	3046	vp->v_flag &= ~VONWORKLST;
	3047	}
	3048	splx(s);
	3049
	3050	return (0);
	3051	}
	3052
	3053	/*
	3054	* Print out a syncer vnode.
	3055	*/
	3056	static int
	3057	sync_print(ap)
	3058	struct vop_print_args /* {
	3059	struct vnode *a_vp;
	3060	} / ap;
	3061	{
	3062	struct vnode *vp = ap->a_vp;
	3063
	3064	printf("syncer vnode");
	3065	if (vp->v_vnlock != NULL)
	3066	lockmgr_printinfo(vp->v_vnlock);
	3067	printf("\n");
	3068	return (0);
	3069	}
	3070
	3071	/*
	3072	* extract the dev_t from a VBLK or VCHR
	3073	*/
	3074	dev_t
	3075	vn_todev(vp)
	3076	struct vnode *vp;
	3077	{
	3078	if (vp->v_type != VBLK && vp->v_type != VCHR)
	3079	return (NODEV);
	3080	return (vp->v_rdev);
	3081	}
	3082
	3083	/*
	3084	* Check if vnode represents a disk device
	3085	*/
	3086	int
	3087	vn_isdisk(vp, errp)
	3088	struct vnode *vp;
	3089	int *errp;
	3090	{
	3091	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	3092	if (errp != NULL)
	3093	*errp = ENOTBLK;
	3094	return (0);
	3095	}
	3096	if (vp->v_rdev == NULL) {
	3097	if (errp != NULL)
	3098	*errp = ENXIO;
	3099	return (0);
	3100	}
	3101	if (!dev_dport(vp->v_rdev)) {
	3102	if (errp != NULL)
	3103	*errp = ENXIO;
	3104	return (0);
	3105	}
	3106	if (!(dev_dflags(vp->v_rdev) & D_DISK)) {
	3107	if (errp != NULL)
	3108	*errp = ENOTBLK;
	3109	return (0);
	3110	}
	3111	if (errp != NULL)
	3112	*errp = 0;
	3113	return (1);
	3114	}
	3115
	3116	void
	3117	NDFREE(ndp, flags)
	3118	struct nameidata *ndp;
	3119	const uint flags;
	3120	{
	3121	if (!(flags & NDF_NO_FREE_PNBUF) &&
	3122	(ndp->ni_cnd.cn_flags & CNP_HASBUF)) {
	3123	zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
	3124	ndp->ni_cnd.cn_flags &= ~CNP_HASBUF;
	3125	}
	3126	if (!(flags & NDF_NO_DNCP_RELE) &&
	3127	(ndp->ni_cnd.cn_flags & CNP_WANTDNCP) &&
	3128	ndp->ni_dncp) {
	3129	cache_drop(ndp->ni_dncp);
	3130	ndp->ni_dncp = NULL;
	3131	}
	3132	if (!(flags & NDF_NO_NCP_RELE) &&
	3133	(ndp->ni_cnd.cn_flags & CNP_WANTNCP) &&
	3134	ndp->ni_ncp) {
	3135	cache_drop(ndp->ni_ncp);
	3136	ndp->ni_ncp = NULL;
	3137	}
	3138	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	3139	(ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) &&
	3140	ndp->ni_dvp != ndp->ni_vp) {
	3141	VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_td);
	3142	}
	3143	if (!(flags & NDF_NO_DVP_RELE) &&
	3144	(ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT\|CNP_WANTPARENT))) {
	3145	vrele(ndp->ni_dvp);
	3146	ndp->ni_dvp = NULL;
	3147	}
	3148	if (!(flags & NDF_NO_VP_UNLOCK) &&
	3149	(ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) {
	3150	VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_td);
	3151	}
	3152	if (!(flags & NDF_NO_VP_RELE) &&
	3153	ndp->ni_vp) {
	3154	vrele(ndp->ni_vp);
	3155	ndp->ni_vp = NULL;
	3156	}
	3157	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	3158	(ndp->ni_cnd.cn_flags & CNP_SAVESTART)) {
	3159	vrele(ndp->ni_startdir);
	3160	ndp->ni_startdir = NULL;
	3161	}
	3162	}
	3163