gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	39	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	40	* $DragonFly: src/sys/kern/vfs_subr.c,v 1.24 2004/01/27 23:56:48 dillon Exp $
	41	*/
	42
	43	/*
	44	* External virtual filesystem routines
	45	*/
	46	#include "opt_ddb.h"
	47
	48	#include <sys/param.h>
	49	#include <sys/systm.h>
	50	#include <sys/buf.h>
	51	#include <sys/conf.h>
	52	#include <sys/dirent.h>
	53	#include <sys/domain.h>
	54	#include <sys/eventhandler.h>
	55	#include <sys/fcntl.h>
	56	#include <sys/kernel.h>
	57	#include <sys/kthread.h>
	58	#include <sys/malloc.h>
	59	#include <sys/mbuf.h>
	60	#include <sys/mount.h>
	61	#include <sys/proc.h>
	62	#include <sys/namei.h>
	63	#include <sys/reboot.h>
	64	#include <sys/socket.h>
	65	#include <sys/stat.h>
	66	#include <sys/sysctl.h>
	67	#include <sys/syslog.h>
	68	#include <sys/vmmeter.h>
	69	#include <sys/vnode.h>
	70
	71	#include <machine/limits.h>
	72
	73	#include <vm/vm.h>
	74	#include <vm/vm_object.h>
	75	#include <vm/vm_extern.h>
	76	#include <vm/vm_kern.h>
	77	#include <vm/pmap.h>
	78	#include <vm/vm_map.h>
	79	#include <vm/vm_page.h>
	80	#include <vm/vm_pager.h>
	81	#include <vm/vnode_pager.h>
	82	#include <vm/vm_zone.h>
	83
	84	#include <sys/buf2.h>
	85	#include <sys/thread2.h>
	86
	87	static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
	88
	89	static void insmntque (struct vnode vp, struct mount mp);
	90	static void vclean (struct vnode vp, int flags, struct thread td);
	91	static unsigned long numvnodes;
	92	static void vlruvp(struct vnode *vp);
	93	SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
	94
	95	enum vtype iftovt_tab[16] = {
	96	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	97	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
	98	};
	99	int vttoif_tab[9] = {
	100	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	101	S_IFSOCK, S_IFIFO, S_IFMT,
	102	};
	103
	104	static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
	105
	106	static u_long wantfreevnodes = 25;
	107	SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
	108	static u_long freevnodes = 0;
	109	SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
	110
	111	static int reassignbufcalls;
	112	SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
	113	static int reassignbufloops;
	114	SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
	115	static int reassignbufsortgood;
	116	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
	117	static int reassignbufsortbad;
	118	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
	119	static int reassignbufmethod = 1;
	120	SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
	121
	122	#ifdef ENABLE_VFS_IOOPT
	123	int vfs_ioopt = 0;
	124	SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
	125	#endif
	126
	127	struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
	128	struct lwkt_token mountlist_token;
	129	struct lwkt_token mntvnode_token;
	130	int nfs_mount_type = -1;
	131	static struct lwkt_token mntid_token;
	132	static struct lwkt_token vnode_free_list_token;
	133	static struct lwkt_token spechash_token;
	134	struct nfs_public nfs_pub; /* publicly exported FS */
	135	static vm_zone_t vnode_zone;
	136
	137	/*
	138	* The workitem queue.
	139	*/
	140	#define SYNCER_MAXDELAY 32
	141	static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
	142	time_t syncdelay = 30; /* max time to delay syncing data */
	143	SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
	144	"VFS data synchronization delay");
	145	time_t filedelay = 30; /* time to delay syncing files */
	146	SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
	147	"File synchronization delay");
	148	time_t dirdelay = 29; /* time to delay syncing directories */
	149	SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
	150	"Directory synchronization delay");
	151	time_t metadelay = 28; /* time to delay syncing metadata */
	152	SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
	153	"VFS metadata synchronization delay");
	154	static int rushjob; /* number of slots to run ASAP */
	155	static int stat_rush_requests; /* number of times I/O speeded up */
	156	SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
	157
	158	static int syncer_delayno = 0;
	159	static long syncer_mask;
	160	LIST_HEAD(synclist, vnode);
	161	static struct synclist *syncer_workitem_pending;
	162
	163	int desiredvnodes;
	164	SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	165	&desiredvnodes, 0, "Maximum number of vnodes");
	166	static int minvnodes;
	167	SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	168	&minvnodes, 0, "Minimum number of vnodes");
	169	static int vnlru_nowhere = 0;
	170	SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
	171	"Number of times the vnlru process ran without success");
	172
	173	static void vfs_free_addrlist (struct netexport *nep);
	174	static int vfs_free_netcred (struct radix_node rn, void w);
	175	static int vfs_hang_addrlist (struct mount mp, struct netexport nep,
	176	struct export_args *argp);
	177
	178	/*
	179	* Initialize the vnode management data structures.
	180	*/
	181	void
	182	vntblinit()
	183	{
	184
	185	/*
	186	* Desired vnodes is a result of the physical page count
	187	* and the size of kernel's heap. It scales in proportion
	188	* to the amount of available physical memory. This can
	189	* cause trouble on 64-bit and large memory platforms.
	190	*/
	191	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	192	desiredvnodes =
	193	min(maxproc + vmstats.v_page_count /4,
	194	2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
	195	(5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	196
	197	minvnodes = desiredvnodes / 4;
	198	lwkt_inittoken(&mntvnode_token);
	199	lwkt_inittoken(&mntid_token);
	200	lwkt_inittoken(&spechash_token);
	201	TAILQ_INIT(&vnode_free_list);
	202	lwkt_inittoken(&vnode_free_list_token);
	203	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	204	/*
	205	* Initialize the filesystem syncer.
	206	*/
	207	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	208	&syncer_mask);
	209	syncer_maxdelay = syncer_mask + 1;
	210	}
	211
	212	/*
	213	* Mark a mount point as busy. Used to synchronize access and to delay
	214	* unmounting. Interlock is not released on failure.
	215	*/
	216	int
	217	vfs_busy(struct mount mp, int flags, struct lwkt_token interlkp,
	218	struct thread *td)
	219	{
	220	int lkflags;
	221
	222	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
	223	if (flags & LK_NOWAIT)
	224	return (ENOENT);
	225	mp->mnt_kern_flag \|= MNTK_MWAIT;
	226	if (interlkp) {
	227	lwkt_reltoken(interlkp);
	228	}
	229	/*
	230	* Since all busy locks are shared except the exclusive
	231	* lock granted when unmounting, the only place that a
	232	* wakeup needs to be done is at the release of the
	233	* exclusive lock at the end of dounmount.
	234	*/
	235	tsleep((caddr_t)mp, 0, "vfs_busy", 0);
	236	if (interlkp) {
	237	lwkt_gettoken(interlkp);
	238	}
	239	return (ENOENT);
	240	}
	241	lkflags = LK_SHARED \| LK_NOPAUSE;
	242	if (interlkp)
	243	lkflags \|= LK_INTERLOCK;
	244	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
	245	panic("vfs_busy: unexpected lock failure");
	246	return (0);
	247	}
	248
	249	/*
	250	* Free a busy filesystem.
	251	*/
	252	void
	253	vfs_unbusy(struct mount mp, struct thread td)
	254	{
	255	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
	256	}
	257
	258	/*
	259	* Lookup a filesystem type, and if found allocate and initialize
	260	* a mount structure for it.
	261	*
	262	* Devname is usually updated by mount(8) after booting.
	263	*/
	264	int
	265	vfs_rootmountalloc(char fstypename, char devname, struct mount **mpp)
	266	{
	267	struct thread td = curthread; / XXX */
	268	struct vfsconf *vfsp;
	269	struct mount *mp;
	270
	271	if (fstypename == NULL)
	272	return (ENODEV);
	273	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	274	if (!strcmp(vfsp->vfc_name, fstypename))
	275	break;
	276	if (vfsp == NULL)
	277	return (ENODEV);
	278	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	279	bzero((char *)mp, (u_long)sizeof(struct mount));
	280	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	281	(void)vfs_busy(mp, LK_NOWAIT, 0, td);
	282	TAILQ_INIT(&mp->mnt_nvnodelist);
	283	TAILQ_INIT(&mp->mnt_reservedvnlist);
	284	mp->mnt_nvnodelistsize = 0;
	285	mp->mnt_vfc = vfsp;
	286	mp->mnt_op = vfsp->vfc_vfsops;
	287	mp->mnt_flag = MNT_RDONLY;
	288	mp->mnt_vnodecovered = NULLVP;
	289	vfsp->vfc_refcount++;
	290	mp->mnt_iosize_max = DFLTPHYS;
	291	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	292	mp->mnt_flag \|= vfsp->vfc_flags & MNT_VISFLAGMASK;
	293	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	294	mp->mnt_stat.f_mntonname[0] = '/';
	295	mp->mnt_stat.f_mntonname[1] = 0;
	296	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	297	*mpp = mp;
	298	return (0);
	299	}
	300
	301	/*
	302	* Find an appropriate filesystem to use for the root. If a filesystem
	303	* has not been preselected, walk through the list of known filesystems
	304	* trying those that have mountroot routines, and try them until one
	305	* works or we have tried them all.
	306	*/
	307	#ifdef notdef /* XXX JH */
	308	int
	309	lite2_vfs_mountroot()
	310	{
	311	struct vfsconf *vfsp;
	312	extern int (*lite2_mountroot) (void);
	313	int error;
	314
	315	if (lite2_mountroot != NULL)
	316	return ((*lite2_mountroot)());
	317	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	318	if (vfsp->vfc_mountroot == NULL)
	319	continue;
	320	if ((error = (*vfsp->vfc_mountroot)()) == 0)
	321	return (0);
	322	printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	323	}
	324	return (ENODEV);
	325	}
	326	#endif
	327
	328	/*
	329	* Lookup a mount point by filesystem identifier.
	330	*/
	331	struct mount *
	332	vfs_getvfs(fsid)
	333	fsid_t *fsid;
	334	{
	335	struct mount *mp;
	336
	337	lwkt_gettoken(&mountlist_token);
	338	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	339	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	340	mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
	341	lwkt_reltoken(&mountlist_token);
	342	return (mp);
	343	}
	344	}
	345	lwkt_reltoken(&mountlist_token);
	346	return ((struct mount *) 0);
	347	}
	348
	349	/*
	350	* Get a new unique fsid. Try to make its val[0] unique, since this value
	351	* will be used to create fake device numbers for stat(). Also try (but
	352	* not so hard) make its val[0] unique mod 2^16, since some emulators only
	353	* support 16-bit device numbers. We end up with unique val[0]'s for the
	354	* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
	355	*
	356	* Keep in mind that several mounts may be running in parallel. Starting
	357	* the search one past where the previous search terminated is both a
	358	* micro-optimization and a defense against returning the same fsid to
	359	* different mounts.
	360	*/
	361	void
	362	vfs_getnewfsid(mp)
	363	struct mount *mp;
	364	{
	365	static u_int16_t mntid_base;
	366	fsid_t tfsid;
	367	int mtype;
	368
	369	lwkt_gettoken(&mntid_token);
	370	mtype = mp->mnt_vfc->vfc_typenum;
	371	tfsid.val[1] = mtype;
	372	mtype = (mtype & 0xFF) << 24;
	373	for (;;) {
	374	tfsid.val[0] = makeudev(255,
	375	mtype \| ((mntid_base & 0xFF00) << 8) \| (mntid_base & 0xFF));
	376	mntid_base++;
	377	if (vfs_getvfs(&tfsid) == NULL)
	378	break;
	379	}
	380	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	381	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	382	lwkt_reltoken(&mntid_token);
	383	}
	384
	385	/*
	386	* Knob to control the precision of file timestamps:
	387	*
	388	* 0 = seconds only; nanoseconds zeroed.
	389	* 1 = seconds and nanoseconds, accurate within 1/HZ.
	390	* 2 = seconds and nanoseconds, truncated to microseconds.
	391	* >=3 = seconds and nanoseconds, maximum precision.
	392	*/
	393	enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
	394
	395	static int timestamp_precision = TSP_SEC;
	396	SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	397	&timestamp_precision, 0, "");
	398
	399	/*
	400	* Get a current timestamp.
	401	*/
	402	void
	403	vfs_timestamp(tsp)
	404	struct timespec *tsp;
	405	{
	406	struct timeval tv;
	407
	408	switch (timestamp_precision) {
	409	case TSP_SEC:
	410	tsp->tv_sec = time_second;
	411	tsp->tv_nsec = 0;
	412	break;
	413	case TSP_HZ:
	414	getnanotime(tsp);
	415	break;
	416	case TSP_USEC:
	417	microtime(&tv);
	418	TIMEVAL_TO_TIMESPEC(&tv, tsp);
	419	break;
	420	case TSP_NSEC:
	421	default:
	422	nanotime(tsp);
	423	break;
	424	}
	425	}
	426
	427	/*
	428	* Set vnode attributes to VNOVAL
	429	*/
	430	void
	431	vattr_null(vap)
	432	struct vattr *vap;
	433	{
	434
	435	vap->va_type = VNON;
	436	vap->va_size = VNOVAL;
	437	vap->va_bytes = VNOVAL;
	438	vap->va_mode = VNOVAL;
	439	vap->va_nlink = VNOVAL;
	440	vap->va_uid = VNOVAL;
	441	vap->va_gid = VNOVAL;
	442	vap->va_fsid = VNOVAL;
	443	vap->va_fileid = VNOVAL;
	444	vap->va_blocksize = VNOVAL;
	445	vap->va_rdev = VNOVAL;
	446	vap->va_atime.tv_sec = VNOVAL;
	447	vap->va_atime.tv_nsec = VNOVAL;
	448	vap->va_mtime.tv_sec = VNOVAL;
	449	vap->va_mtime.tv_nsec = VNOVAL;
	450	vap->va_ctime.tv_sec = VNOVAL;
	451	vap->va_ctime.tv_nsec = VNOVAL;
	452	vap->va_flags = VNOVAL;
	453	vap->va_gen = VNOVAL;
	454	vap->va_vaflags = 0;
	455	}
	456
	457	/*
	458	* This routine is called when we have too many vnodes. It attempts
	459	* to free <count> vnodes and will potentially free vnodes that still
	460	* have VM backing store (VM backing store is typically the cause
	461	* of a vnode blowout so we want to do this). Therefore, this operation
	462	* is not considered cheap.
	463	*
	464	* A number of conditions may prevent a vnode from being reclaimed.
	465	* the buffer cache may have references on the vnode, a directory
	466	* vnode may still have references due to the namei cache representing
	467	* underlying files, or the vnode may be in active use. It is not
	468	* desireable to reuse such vnodes. These conditions may cause the
	469	* number of vnodes to reach some minimum value regardless of what
	470	* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
	471	*/
	472	static int
	473	vlrureclaim(struct mount *mp)
	474	{
	475	struct vnode *vp;
	476	int done;
	477	int trigger;
	478	int usevnodes;
	479	int count;
	480	int gen;
	481
	482	/*
	483	* Calculate the trigger point, don't allow user
	484	* screwups to blow us up. This prevents us from
	485	* recycling vnodes with lots of resident pages. We
	486	* aren't trying to free memory, we are trying to
	487	* free vnodes.
	488	*/
	489	usevnodes = desiredvnodes;
	490	if (usevnodes <= 0)
	491	usevnodes = 1;
	492	trigger = vmstats.v_page_count * 2 / usevnodes;
	493
	494	done = 0;
	495	gen = lwkt_gettoken(&mntvnode_token);
	496	count = mp->mnt_nvnodelistsize / 10 + 1;
	497	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
	498	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	499	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	500
	501	if (vp->v_type != VNON &&
	502	vp->v_type != VBAD &&
	503	VMIGHTFREE(vp) && /* critical path opt */
	504	(vp->v_object == NULL \|\| vp->v_object->resident_page_count < trigger)
	505	) {
	506	lwkt_gettoken(&vp->v_interlock);
	507	if (lwkt_gentoken(&mntvnode_token, &gen) == 0) {
	508	if (VMIGHTFREE(vp)) {
	509	vgonel(vp, curthread);
	510	done++;
	511	} else {
	512	lwkt_reltoken(&vp->v_interlock);
	513	}
	514	} else {
	515	lwkt_reltoken(&vp->v_interlock);
	516	}
	517	}
	518	--count;
	519	}
	520	lwkt_reltoken(&mntvnode_token);
	521	return done;
	522	}
	523
	524	/*
	525	* Attempt to recycle vnodes in a context that is always safe to block.
	526	* Calling vlrurecycle() from the bowels of file system code has some
	527	* interesting deadlock problems.
	528	*/
	529	static struct thread *vnlruthread;
	530	static int vnlruproc_sig;
	531
	532	static void
	533	vnlru_proc(void)
	534	{
	535	struct mount mp, nmp;
	536	int s;
	537	int done;
	538	struct thread *td = curthread;
	539
	540	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	541	SHUTDOWN_PRI_FIRST);
	542
	543	s = splbio();
	544	for (;;) {
	545	kproc_suspend_loop();
	546	if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
	547	vnlruproc_sig = 0;
	548	wakeup(&vnlruproc_sig);
	549	tsleep(td, 0, "vlruwt", hz);
	550	continue;
	551	}
	552	done = 0;
	553	lwkt_gettoken(&mountlist_token);
	554	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
	555	if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td)) {
	556	nmp = TAILQ_NEXT(mp, mnt_list);
	557	continue;
	558	}
	559	done += vlrureclaim(mp);
	560	lwkt_gettoken(&mountlist_token);
	561	nmp = TAILQ_NEXT(mp, mnt_list);
	562	vfs_unbusy(mp, td);
	563	}
	564	lwkt_reltoken(&mountlist_token);
	565	if (done == 0) {
	566	vnlru_nowhere++;
	567	tsleep(td, 0, "vlrup", hz * 3);
	568	}
	569	}
	570	splx(s);
	571	}
	572
	573	static struct kproc_desc vnlru_kp = {
	574	"vnlru",
	575	vnlru_proc,
	576	&vnlruthread
	577	};
	578	SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
	579
	580	/*
	581	* Routines having to do with the management of the vnode table.
	582	*/
	583	extern vop_t **dead_vnodeop_p;
	584
	585	/*
	586	* Return the next vnode from the free list.
	587	*/
	588	int
	589	getnewvnode(tag, mp, vops, vpp)
	590	enum vtagtype tag;
	591	struct mount *mp;
	592	vop_t **vops;
	593	struct vnode **vpp;
	594	{
	595	int s;
	596	int gen;
	597	int vgen;
	598	struct thread td = curthread; / XXX */
	599	struct vnode *vp = NULL;
	600	vm_object_t object;
	601
	602	s = splbio();
	603
	604	/*
	605	* Try to reuse vnodes if we hit the max. This situation only
	606	* occurs in certain large-memory (2G+) situations. We cannot
	607	* attempt to directly reclaim vnodes due to nasty recursion
	608	* problems.
	609	*/
	610	while (numvnodes - freevnodes > desiredvnodes) {
	611	if (vnlruproc_sig == 0) {
	612	vnlruproc_sig = 1; /* avoid unnecessary wakeups */
	613	wakeup(vnlruthread);
	614	}
	615	tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
	616	}
	617
	618
	619	/*
	620	* Attempt to reuse a vnode already on the free list, allocating
	621	* a new vnode if we can't find one or if we have not reached a
	622	* good minimum for good LRU performance.
	623	*/
	624	gen = lwkt_gettoken(&vnode_free_list_token);
	625	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
	626	int count;
	627
	628	for (count = 0; count < freevnodes; count++) {
	629	vp = TAILQ_FIRST(&vnode_free_list);
	630	if (vp == NULL \|\| vp->v_usecount)
	631	panic("getnewvnode: free vnode isn't");
	632
	633	/*
	634	* Get the vnode's interlock, then re-obtain
	635	* vnode_free_list_token in case we lost it. If we
	636	* did lose it while getting the vnode interlock,
	637	* even if we got it back again, then retry.
	638	*/
	639	vgen = lwkt_gettoken(&vp->v_interlock);
	640	if (lwkt_gentoken(&vnode_free_list_token, &gen) != 0) {
	641	--count;
	642	lwkt_reltoken(&vp->v_interlock);
	643	vp = NULL;
	644	continue;
	645	}
	646
	647	/*
	648	* Whew! We have both tokens. Since we didn't lose
	649	* the free list VFREE had better still be set. But
	650	* we aren't out of the woods yet. We have to get
	651	* the object (may block). If the vnode is not
	652	* suitable then move it to the end of the list
	653	* if we can. If we can't move it to the end of the
	654	* list retry again.
	655	*/
	656	if ((VOP_GETVOBJECT(vp, &object) == 0 &&
	657	(object->resident_page_count \|\| object->ref_count))
	658	) {
	659	if (lwkt_gentoken(&vp->v_interlock, &vgen) == 0 &&
	660	lwkt_gentoken(&vnode_free_list_token, &gen) == 0
	661	) {
	662	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	663	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	664	} else {
	665	--count;
	666	}
	667	lwkt_reltoken(&vp->v_interlock);
	668	vp = NULL;
	669	continue;
	670	}
	671
	672	/*
	673	* Still not out of the woods. VOBJECT might have
	674	* blocked, if we did not retain our tokens we have
	675	* to retry.
	676	*/
	677	if (lwkt_gentoken(&vp->v_interlock, &vgen) != 0 \|\|
	678	lwkt_gentoken(&vnode_free_list_token, &gen) != 0) {
	679	--count;
	680	vp = NULL;
	681	continue;
	682	}
	683	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	684	KKASSERT(vp->v_flag & VFREE);
	685
	686	/*
	687	* If we have children in the namecache we cannot
	688	* reuse the vnode yet because it will break the
	689	* namecache chain (YYY use nc_refs for the check?)
	690	*/
	691	if (TAILQ_FIRST(&vp->v_namecache)) {
	692	if (cache_leaf_test(vp) < 0) {
	693	lwkt_reltoken(&vp->v_interlock);
	694	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	695	vp = NULL;
	696	continue;
	697	}
	698	}
	699	break;
	700	}
	701	}
	702
	703	if (vp) {
	704	vp->v_flag \|= VDOOMED;
	705	vp->v_flag &= ~VFREE;
	706	freevnodes--;
	707	lwkt_reltoken(&vnode_free_list_token);
	708	cache_purge(vp); /* YYY may block */
	709	vp->v_lease = NULL;
	710	if (vp->v_type != VBAD) {
	711	vgonel(vp, td);
	712	} else {
	713	lwkt_reltoken(&vp->v_interlock);
	714	}
	715
	716	#ifdef INVARIANTS
	717	{
	718	int s;
	719
	720	if (vp->v_data)
	721	panic("cleaned vnode isn't");
	722	s = splbio();
	723	if (vp->v_numoutput)
	724	panic("Clean vnode has pending I/O's");
	725	splx(s);
	726	}
	727	#endif
	728	vp->v_flag = 0;
	729	vp->v_lastw = 0;
	730	vp->v_lasta = 0;
	731	vp->v_cstart = 0;
	732	vp->v_clen = 0;
	733	vp->v_socket = 0;
	734	vp->v_writecount = 0; /* XXX */
	735	} else {
	736	lwkt_reltoken(&vnode_free_list_token);
	737	vp = (struct vnode *) zalloc(vnode_zone);
	738	bzero((char ) vp, sizeof vp);
	739	lwkt_inittoken(&vp->v_interlock);
	740	vp->v_dd = vp;
	741	cache_purge(vp);
	742	TAILQ_INIT(&vp->v_namecache);
	743	numvnodes++;
	744	}
	745
	746	TAILQ_INIT(&vp->v_cleanblkhd);
	747	TAILQ_INIT(&vp->v_dirtyblkhd);
	748	vp->v_type = VNON;
	749	vp->v_tag = tag;
	750	vp->v_op = vops;
	751	insmntque(vp, mp);
	752	*vpp = vp;
	753	vp->v_usecount = 1;
	754	vp->v_data = 0;
	755	splx(s);
	756
	757	vfs_object_create(vp, td);
	758	return (0);
	759	}
	760
	761	/*
	762	* Move a vnode from one mount queue to another.
	763	*/
	764	static void
	765	insmntque(vp, mp)
	766	struct vnode *vp;
	767	struct mount *mp;
	768	{
	769
	770	lwkt_gettoken(&mntvnode_token);
	771	/*
	772	* Delete from old mount point vnode list, if on one.
	773	*/
	774	if (vp->v_mount != NULL) {
	775	KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
	776	("bad mount point vnode list size"));
	777	TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
	778	vp->v_mount->mnt_nvnodelistsize--;
	779	}
	780	/*
	781	* Insert into list of vnodes for the new mount point, if available.
	782	*/
	783	if ((vp->v_mount = mp) == NULL) {
	784	lwkt_reltoken(&mntvnode_token);
	785	return;
	786	}
	787	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	788	mp->mnt_nvnodelistsize++;
	789	lwkt_reltoken(&mntvnode_token);
	790	}
	791
	792	/*
	793	* Update outstanding I/O count and do wakeup if requested.
	794	*/
	795	void
	796	vwakeup(bp)
	797	struct buf *bp;
	798	{
	799	struct vnode *vp;
	800
	801	bp->b_flags &= ~B_WRITEINPROG;
	802	if ((vp = bp->b_vp)) {
	803	vp->v_numoutput--;
	804	if (vp->v_numoutput < 0)
	805	panic("vwakeup: neg numoutput");
	806	if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
	807	vp->v_flag &= ~VBWAIT;
	808	wakeup((caddr_t) &vp->v_numoutput);
	809	}
	810	}
	811	}
	812
	813	/*
	814	* Flush out and invalidate all buffers associated with a vnode.
	815	* Called with the underlying object locked.
	816	*/
	817	int
	818	vinvalbuf(struct vnode vp, int flags, struct thread td,
	819	int slpflag, int slptimeo)
	820	{
	821	struct buf *bp;
	822	struct buf nbp, blist;
	823	int s, error;
	824	vm_object_t object;
	825
	826	if (flags & V_SAVE) {
	827	s = splbio();
	828	while (vp->v_numoutput) {
	829	vp->v_flag \|= VBWAIT;
	830	error = tsleep((caddr_t)&vp->v_numoutput,
	831	slpflag, "vinvlbuf", slptimeo);
	832	if (error) {
	833	splx(s);
	834	return (error);
	835	}
	836	}
	837	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	838	splx(s);
	839	if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
	840	return (error);
	841	s = splbio();
	842	if (vp->v_numoutput > 0 \|\|
	843	!TAILQ_EMPTY(&vp->v_dirtyblkhd))
	844	panic("vinvalbuf: dirty bufs");
	845	}
	846	splx(s);
	847	}
	848	s = splbio();
	849	for (;;) {
	850	blist = TAILQ_FIRST(&vp->v_cleanblkhd);
	851	if (!blist)
	852	blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
	853	if (!blist)
	854	break;
	855
	856	for (bp = blist; bp; bp = nbp) {
	857	nbp = TAILQ_NEXT(bp, b_vnbufs);
	858	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	859	error = BUF_TIMELOCK(bp,
	860	LK_EXCLUSIVE \| LK_SLEEPFAIL,
	861	"vinvalbuf", slpflag, slptimeo);
	862	if (error == ENOLCK)
	863	break;
	864	splx(s);
	865	return (error);
	866	}
	867	/*
	868	* XXX Since there are no node locks for NFS, I
	869	* believe there is a slight chance that a delayed
	870	* write will occur while sleeping just above, so
	871	* check for it. Note that vfs_bio_awrite expects
	872	* buffers to reside on a queue, while VOP_BWRITE and
	873	* brelse do not.
	874	*/
	875	if (((bp->b_flags & (B_DELWRI \| B_INVAL)) == B_DELWRI) &&
	876	(flags & V_SAVE)) {
	877
	878	if (bp->b_vp == vp) {
	879	if (bp->b_flags & B_CLUSTEROK) {
	880	BUF_UNLOCK(bp);
	881	vfs_bio_awrite(bp);
	882	} else {
	883	bremfree(bp);
	884	bp->b_flags \|= B_ASYNC;
	885	VOP_BWRITE(bp->b_vp, bp);
	886	}
	887	} else {
	888	bremfree(bp);
	889	(void) VOP_BWRITE(bp->b_vp, bp);
	890	}
	891	break;
	892	}
	893	bremfree(bp);
	894	bp->b_flags \|= (B_INVAL \| B_NOCACHE \| B_RELBUF);
	895	bp->b_flags &= ~B_ASYNC;
	896	brelse(bp);
	897	}
	898	}
	899
	900	/*
	901	* Wait for I/O to complete. XXX needs cleaning up. The vnode can
	902	* have write I/O in-progress but if there is a VM object then the
	903	* VM object can also have read-I/O in-progress.
	904	*/
	905	do {
	906	while (vp->v_numoutput > 0) {
	907	vp->v_flag \|= VBWAIT;
	908	tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
	909	}
	910	if (VOP_GETVOBJECT(vp, &object) == 0) {
	911	while (object->paging_in_progress)
	912	vm_object_pip_sleep(object, "vnvlbx");
	913	}
	914	} while (vp->v_numoutput > 0);
	915
	916	splx(s);
	917
	918	/*
	919	* Destroy the copy in the VM cache, too.
	920	*/
	921	lwkt_gettoken(&vp->v_interlock);
	922	if (VOP_GETVOBJECT(vp, &object) == 0) {
	923	vm_object_page_remove(object, 0, 0,
	924	(flags & V_SAVE) ? TRUE : FALSE);
	925	}
	926	lwkt_reltoken(&vp->v_interlock);
	927
	928	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) \|\| !TAILQ_EMPTY(&vp->v_cleanblkhd))
	929	panic("vinvalbuf: flush failed");
	930	return (0);
	931	}
	932
	933	/*
	934	* Truncate a file's buffer and pages to a specified length. This
	935	* is in lieu of the old vinvalbuf mechanism, which performed unneeded
	936	* sync activity.
	937	*/
	938	int
	939	vtruncbuf(struct vnode vp, struct thread td, off_t length, int blksize)
	940	{
	941	struct buf *bp;
	942	struct buf *nbp;
	943	int s, anyfreed;
	944	int trunclbn;
	945
	946	/*
	947	* Round up to the next lbn.
	948	*/
	949	trunclbn = (length + blksize - 1) / blksize;
	950
	951	s = splbio();
	952	restart:
	953	anyfreed = 1;
	954	for (;anyfreed;) {
	955	anyfreed = 0;
	956	for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
	957	nbp = TAILQ_NEXT(bp, b_vnbufs);
	958	if (bp->b_lblkno >= trunclbn) {
	959	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	960	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	961	goto restart;
	962	} else {
	963	bremfree(bp);
	964	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	965	bp->b_flags &= ~B_ASYNC;
	966	brelse(bp);
	967	anyfreed = 1;
	968	}
	969	if (nbp &&
	970	(((nbp->b_xflags & BX_VNCLEAN) == 0) \|\|
	971	(nbp->b_vp != vp) \|\|
	972	(nbp->b_flags & B_DELWRI))) {
	973	goto restart;
	974	}
	975	}
	976	}
	977
	978	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	979	nbp = TAILQ_NEXT(bp, b_vnbufs);
	980	if (bp->b_lblkno >= trunclbn) {
	981	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	982	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	983	goto restart;
	984	} else {
	985	bremfree(bp);
	986	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	987	bp->b_flags &= ~B_ASYNC;
	988	brelse(bp);
	989	anyfreed = 1;
	990	}
	991	if (nbp &&
	992	(((nbp->b_xflags & BX_VNDIRTY) == 0) \|\|
	993	(nbp->b_vp != vp) \|\|
	994	(nbp->b_flags & B_DELWRI) == 0)) {
	995	goto restart;
	996	}
	997	}
	998	}
	999	}
	1000
	1001	if (length > 0) {
	1002	restartsync:
	1003	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	1004	nbp = TAILQ_NEXT(bp, b_vnbufs);
	1005	if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
	1006	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	1007	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	1008	goto restart;
	1009	} else {
	1010	bremfree(bp);
	1011	if (bp->b_vp == vp) {
	1012	bp->b_flags \|= B_ASYNC;
	1013	} else {
	1014	bp->b_flags &= ~B_ASYNC;
	1015	}
	1016	VOP_BWRITE(bp->b_vp, bp);
	1017	}
	1018	goto restartsync;
	1019	}
	1020
	1021	}
	1022	}
	1023
	1024	while (vp->v_numoutput > 0) {
	1025	vp->v_flag \|= VBWAIT;
	1026	tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	1027	}
	1028
	1029	splx(s);
	1030
	1031	vnode_pager_setsize(vp, length);
	1032
	1033	return (0);
	1034	}
	1035
	1036	/*
	1037	* Associate a buffer with a vnode.
	1038	*/
	1039	void
	1040	bgetvp(vp, bp)
	1041	struct vnode *vp;
	1042	struct buf *bp;
	1043	{
	1044	int s;
	1045
	1046	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	1047
	1048	vhold(vp);
	1049	bp->b_vp = vp;
	1050	bp->b_dev = vn_todev(vp);
	1051	/*
	1052	* Insert onto list for new vnode.
	1053	*/
	1054	s = splbio();
	1055	bp->b_xflags \|= BX_VNCLEAN;
	1056	bp->b_xflags &= ~BX_VNDIRTY;
	1057	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	1058	splx(s);
	1059	}
	1060
	1061	/*
	1062	* Disassociate a buffer from a vnode.
	1063	*/
	1064	void
	1065	brelvp(bp)
	1066	struct buf *bp;
	1067	{
	1068	struct vnode *vp;
	1069	struct buflists *listheadp;
	1070	int s;
	1071
	1072	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
	1073
	1074	/*
	1075	* Delete from old vnode list, if on one.
	1076	*/
	1077	vp = bp->b_vp;
	1078	s = splbio();
	1079	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	1080	if (bp->b_xflags & BX_VNDIRTY)
	1081	listheadp = &vp->v_dirtyblkhd;
	1082	else
	1083	listheadp = &vp->v_cleanblkhd;
	1084	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	1085	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	1086	}
	1087	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	1088	vp->v_flag &= ~VONWORKLST;
	1089	LIST_REMOVE(vp, v_synclist);
	1090	}
	1091	splx(s);
	1092	bp->b_vp = (struct vnode *) 0;
	1093	vdrop(vp);
	1094	}
	1095
	1096	/*
	1097	* The workitem queue.
	1098	*
	1099	* It is useful to delay writes of file data and filesystem metadata
	1100	* for tens of seconds so that quickly created and deleted files need
	1101	* not waste disk bandwidth being created and removed. To realize this,
	1102	* we append vnodes to a "workitem" queue. When running with a soft
	1103	* updates implementation, most pending metadata dependencies should
	1104	* not wait for more than a few seconds. Thus, dirty buffers of block devices
	1105	* with a mounted filesystem are delayed only about half as long as file data.
	1106	* Similarly, directory updates are more critical, so are only delayed
	1107	* about a third the time that file data is delayed. Thus, there are
	1108	* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
	1109	* one each second (driven off the filesystem syncer process). The
	1110	* syncer_delayno variable indicates the next queue that is to be processed.
	1111	* Items that need to be processed soon are placed in this queue:
	1112	*
	1113	* syncer_workitem_pending[syncer_delayno]
	1114	*
	1115	* A delay of fifteen seconds is done by placing the request fifteen
	1116	* entries later in the queue:
	1117	*
	1118	* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
	1119	*
	1120	*/
	1121
	1122	/*
	1123	* Add an item to the syncer work queue.
	1124	*/
	1125	static void
	1126	vn_syncer_add_to_worklist(struct vnode *vp, int delay)
	1127	{
	1128	int s, slot;
	1129
	1130	s = splbio();
	1131
	1132	if (vp->v_flag & VONWORKLST) {
	1133	LIST_REMOVE(vp, v_synclist);
	1134	}
	1135
	1136	if (delay > syncer_maxdelay - 2)
	1137	delay = syncer_maxdelay - 2;
	1138	slot = (syncer_delayno + delay) & syncer_mask;
	1139
	1140	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	1141	vp->v_flag \|= VONWORKLST;
	1142	splx(s);
	1143	}
	1144
	1145	struct thread *updatethread;
	1146	static void sched_sync (void);
	1147	static struct kproc_desc up_kp = {
	1148	"syncer",
	1149	sched_sync,
	1150	&updatethread
	1151	};
	1152	SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
	1153
	1154	/*
	1155	* System filesystem synchronizer daemon.
	1156	*/
	1157	void
	1158	sched_sync(void)
	1159	{
	1160	struct synclist *slp;
	1161	struct vnode *vp;
	1162	long starttime;
	1163	int s;
	1164	struct thread *td = curthread;
	1165
	1166	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	1167	SHUTDOWN_PRI_LAST);
	1168
	1169	for (;;) {
	1170	kproc_suspend_loop();
	1171
	1172	starttime = time_second;
	1173
	1174	/*
	1175	* Push files whose dirty time has expired. Be careful
	1176	* of interrupt race on slp queue.
	1177	*/
	1178	s = splbio();
	1179	slp = &syncer_workitem_pending[syncer_delayno];
	1180	syncer_delayno += 1;
	1181	if (syncer_delayno == syncer_maxdelay)
	1182	syncer_delayno = 0;
	1183	splx(s);
	1184
	1185	while ((vp = LIST_FIRST(slp)) != NULL) {
	1186	if (VOP_ISLOCKED(vp, NULL) == 0) {
	1187	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY, td);
	1188	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	1189	VOP_UNLOCK(vp, 0, td);
	1190	}
	1191	s = splbio();
	1192	if (LIST_FIRST(slp) == vp) {
	1193	/*
	1194	* Note: v_tag VT_VFS vps can remain on the
	1195	* worklist too with no dirty blocks, but
	1196	* since sync_fsync() moves it to a different
	1197	* slot we are safe.
	1198	*/
	1199	if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
	1200	!vn_isdisk(vp, NULL))
	1201	panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
	1202	/*
	1203	* Put us back on the worklist. The worklist
	1204	* routine will remove us from our current
	1205	* position and then add us back in at a later
	1206	* position.
	1207	*/
	1208	vn_syncer_add_to_worklist(vp, syncdelay);
	1209	}
	1210	splx(s);
	1211	}
	1212
	1213	/*
	1214	* Do soft update processing.
	1215	*/
	1216	if (bioops.io_sync)
	1217	(*bioops.io_sync)(NULL);
	1218
	1219	/*
	1220	* The variable rushjob allows the kernel to speed up the
	1221	* processing of the filesystem syncer process. A rushjob
	1222	* value of N tells the filesystem syncer to process the next
	1223	* N seconds worth of work on its queue ASAP. Currently rushjob
	1224	* is used by the soft update code to speed up the filesystem
	1225	* syncer process when the incore state is getting so far
	1226	* ahead of the disk that the kernel memory pool is being
	1227	* threatened with exhaustion.
	1228	*/
	1229	if (rushjob > 0) {
	1230	rushjob -= 1;
	1231	continue;
	1232	}
	1233	/*
	1234	* If it has taken us less than a second to process the
	1235	* current work, then wait. Otherwise start right over
	1236	* again. We can still lose time if any single round
	1237	* takes more than two seconds, but it does not really
	1238	* matter as we are just trying to generally pace the
	1239	* filesystem activity.
	1240	*/
	1241	if (time_second == starttime)
	1242	tsleep(&lbolt, 0, "syncer", 0);
	1243	}
	1244	}
	1245
	1246	/*
	1247	* Request the syncer daemon to speed up its work.
	1248	* We never push it to speed up more than half of its
	1249	* normal turn time, otherwise it could take over the cpu.
	1250	*
	1251	* YYY wchan field protected by the BGL.
	1252	*/
	1253	int
	1254	speedup_syncer()
	1255	{
	1256	crit_enter();
	1257	if (updatethread->td_wchan == &lbolt) { /* YYY */
	1258	unsleep(updatethread);
	1259	lwkt_schedule(updatethread);
	1260	}
	1261	crit_exit();
	1262	if (rushjob < syncdelay / 2) {
	1263	rushjob += 1;
	1264	stat_rush_requests += 1;
	1265	return (1);
	1266	}
	1267	return(0);
	1268	}
	1269
	1270	/*
	1271	* Associate a p-buffer with a vnode.
	1272	*
	1273	* Also sets B_PAGING flag to indicate that vnode is not fully associated
	1274	* with the buffer. i.e. the bp has not been linked into the vnode or
	1275	* ref-counted.
	1276	*/
	1277	void
	1278	pbgetvp(vp, bp)
	1279	struct vnode *vp;
	1280	struct buf *bp;
	1281	{
	1282
	1283	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
	1284
	1285	bp->b_vp = vp;
	1286	bp->b_flags \|= B_PAGING;
	1287	bp->b_dev = vn_todev(vp);
	1288	}
	1289
	1290	/*
	1291	* Disassociate a p-buffer from a vnode.
	1292	*/
	1293	void
	1294	pbrelvp(bp)
	1295	struct buf *bp;
	1296	{
	1297
	1298	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
	1299
	1300	/* XXX REMOVE ME */
	1301	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
	1302	panic(
	1303	"relpbuf(): b_vp was probably reassignbuf()d %p %x",
	1304	bp,
	1305	(int)bp->b_flags
	1306	);
	1307	}
	1308	bp->b_vp = (struct vnode *) 0;
	1309	bp->b_flags &= ~B_PAGING;
	1310	}
	1311
	1312	void
	1313	pbreassignbuf(bp, newvp)
	1314	struct buf *bp;
	1315	struct vnode *newvp;
	1316	{
	1317	if ((bp->b_flags & B_PAGING) == 0) {
	1318	panic(
	1319	"pbreassignbuf() on non phys bp %p",
	1320	bp
	1321	);
	1322	}
	1323	bp->b_vp = newvp;
	1324	}
	1325
	1326	/*
	1327	* Reassign a buffer from one vnode to another.
	1328	* Used to assign file specific control information
	1329	* (indirect blocks) to the vnode to which they belong.
	1330	*/
	1331	void
	1332	reassignbuf(bp, newvp)
	1333	struct buf *bp;
	1334	struct vnode *newvp;
	1335	{
	1336	struct buflists *listheadp;
	1337	int delay;
	1338	int s;
	1339
	1340	if (newvp == NULL) {
	1341	printf("reassignbuf: NULL");
	1342	return;
	1343	}
	1344	++reassignbufcalls;
	1345
	1346	/*
	1347	* B_PAGING flagged buffers cannot be reassigned because their vp
	1348	* is not fully linked in.
	1349	*/
	1350	if (bp->b_flags & B_PAGING)
	1351	panic("cannot reassign paging buffer");
	1352
	1353	s = splbio();
	1354	/*
	1355	* Delete from old vnode list, if on one.
	1356	*/
	1357	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	1358	if (bp->b_xflags & BX_VNDIRTY)
	1359	listheadp = &bp->b_vp->v_dirtyblkhd;
	1360	else
	1361	listheadp = &bp->b_vp->v_cleanblkhd;
	1362	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	1363	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	1364	if (bp->b_vp != newvp) {
	1365	vdrop(bp->b_vp);
	1366	bp->b_vp = NULL; /* for clarification */
	1367	}
	1368	}
	1369	/*
	1370	* If dirty, put on list of dirty buffers; otherwise insert onto list
	1371	* of clean buffers.
	1372	*/
	1373	if (bp->b_flags & B_DELWRI) {
	1374	struct buf *tbp;
	1375
	1376	listheadp = &newvp->v_dirtyblkhd;
	1377	if ((newvp->v_flag & VONWORKLST) == 0) {
	1378	switch (newvp->v_type) {
	1379	case VDIR:
	1380	delay = dirdelay;
	1381	break;
	1382	case VCHR:
	1383	case VBLK:
	1384	if (newvp->v_specmountpoint != NULL) {
	1385	delay = metadelay;
	1386	break;
	1387	}
	1388	/* fall through */
	1389	default:
	1390	delay = filedelay;
	1391	}
	1392	vn_syncer_add_to_worklist(newvp, delay);
	1393	}
	1394	bp->b_xflags \|= BX_VNDIRTY;
	1395	tbp = TAILQ_FIRST(listheadp);
	1396	if (tbp == NULL \|\|
	1397	bp->b_lblkno == 0 \|\|
	1398	(bp->b_lblkno > 0 && tbp->b_lblkno < 0) \|\|
	1399	(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
	1400	TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
	1401	++reassignbufsortgood;
	1402	} else if (bp->b_lblkno < 0) {
	1403	TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
	1404	++reassignbufsortgood;
	1405	} else if (reassignbufmethod == 1) {
	1406	/*
	1407	* New sorting algorithm, only handle sequential case,
	1408	* otherwise append to end (but before metadata)
	1409	*/
	1410	if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
	1411	(tbp->b_xflags & BX_VNDIRTY)) {
	1412	/*
	1413	* Found the best place to insert the buffer
	1414	*/
	1415	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1416	++reassignbufsortgood;
	1417	} else {
	1418	/*
	1419	* Missed, append to end, but before meta-data.
	1420	* We know that the head buffer in the list is
	1421	* not meta-data due to prior conditionals.
	1422	*
	1423	* Indirect effects: NFS second stage write
	1424	* tends to wind up here, giving maximum
	1425	* distance between the unstable write and the
	1426	* commit rpc.
	1427	*/
	1428	tbp = TAILQ_LAST(listheadp, buflists);
	1429	while (tbp && tbp->b_lblkno < 0)
	1430	tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
	1431	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1432	++reassignbufsortbad;
	1433	}
	1434	} else {
	1435	/*
	1436	* Old sorting algorithm, scan queue and insert
	1437	*/
	1438	struct buf *ttbp;
	1439	while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
	1440	(ttbp->b_lblkno < bp->b_lblkno)) {
	1441	++reassignbufloops;
	1442	tbp = ttbp;
	1443	}
	1444	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	1445	}
	1446	} else {
	1447	bp->b_xflags \|= BX_VNCLEAN;
	1448	TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
	1449	if ((newvp->v_flag & VONWORKLST) &&
	1450	TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
	1451	newvp->v_flag &= ~VONWORKLST;
	1452	LIST_REMOVE(newvp, v_synclist);
	1453	}
	1454	}
	1455	if (bp->b_vp != newvp) {
	1456	bp->b_vp = newvp;
	1457	vhold(bp->b_vp);
	1458	}
	1459	splx(s);
	1460	}
	1461
	1462	/*
	1463	* Create a vnode for a block device.
	1464	* Used for mounting the root file system.
	1465	*/
	1466	int
	1467	bdevvp(dev, vpp)
	1468	dev_t dev;
	1469	struct vnode **vpp;
	1470	{
	1471	struct vnode *vp;
	1472	struct vnode *nvp;
	1473	int error;
	1474
	1475	if (dev == NODEV) {
	1476	*vpp = NULLVP;
	1477	return (ENXIO);
	1478	}
	1479	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	1480	if (error) {
	1481	*vpp = NULLVP;
	1482	return (error);
	1483	}
	1484	vp = nvp;
	1485	vp->v_type = VBLK;
	1486	addalias(vp, dev);
	1487	*vpp = vp;
	1488	return (0);
	1489	}
	1490
	1491	/*
	1492	* Add a vnode to the alias list hung off the dev_t.
	1493	*
	1494	* The reason for this gunk is that multiple vnodes can reference
	1495	* the same physical device, so checking vp->v_usecount to see
	1496	* how many users there are is inadequate; the v_usecount for
	1497	* the vnodes need to be accumulated. vcount() does that.
	1498	*/
	1499	void
	1500	addaliasu(struct vnode *nvp, udev_t nvp_rdev)
	1501	{
	1502	dev_t dev;
	1503
	1504	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	1505	panic("addaliasu on non-special vnode");
	1506	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
	1507	if (dev != NODEV) {
	1508	nvp->v_rdev = dev;
	1509	addalias(nvp, dev);
	1510	} else
	1511	nvp->v_rdev = NULL;
	1512	}
	1513
	1514	void
	1515	addalias(struct vnode *nvp, dev_t dev)
	1516	{
	1517
	1518	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	1519	panic("addalias on non-special vnode");
	1520
	1521	nvp->v_rdev = dev;
	1522	lwkt_gettoken(&spechash_token);
	1523	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	1524	lwkt_reltoken(&spechash_token);
	1525	}
	1526
	1527	/*
	1528	* Grab a particular vnode from the free list, increment its
	1529	* reference count and lock it. The vnode lock bit is set if the
	1530	* vnode is being eliminated in vgone. The process is awakened
	1531	* when the transition is completed, and an error returned to
	1532	* indicate that the vnode is no longer usable (possibly having
	1533	* been changed to a new file system type).
	1534	*/
	1535	int
	1536	vget(vp, flags, td)
	1537	struct vnode *vp;
	1538	int flags;
	1539	struct thread *td;
	1540	{
	1541	int error;
	1542
	1543	/*
	1544	* If the vnode is in the process of being cleaned out for
	1545	* another use, we wait for the cleaning to finish and then
	1546	* return failure. Cleaning is determined by checking that
	1547	* the VXLOCK flag is set.
	1548	*/
	1549	if (vp->v_flag & VXLOCK) {
	1550	if (vp->v_vxproc == curproc) {
	1551	#if 0
	1552	/* this can now occur in normal operation */
	1553	log(LOG_INFO, "VXLOCK interlock avoided\n");
	1554	#endif
	1555	} else {
	1556	vp->v_flag \|= VXWANT;
	1557	tsleep((caddr_t)vp, 0, "vget", 0);
	1558	return (ENOENT);
	1559	}
	1560	}
	1561
	1562	/*
	1563	* Bump v_usecount to prevent the vnode from being cleaned. If the
	1564	* vnode gets cleaned unexpectedly we could wind up calling lockmgr
	1565	* on a lock embedded in an inode which is then ripped out from
	1566	* it.
	1567	*/
	1568	vp->v_usecount++; /* XXX MP */
	1569
	1570	if ((flags & LK_INTERLOCK) == 0) {
	1571	lwkt_gettoken(&vp->v_interlock);
	1572	}
	1573
	1574	if (VSHOULDBUSY(vp))
	1575	vbusy(vp);
	1576	if (flags & LK_TYPE_MASK) {
	1577	if ((error = vn_lock(vp, flags \| LK_INTERLOCK, td)) != 0) {
	1578	/*
	1579	* must expand vrele here because we do not want
	1580	* to call VOP_INACTIVE if the reference count
	1581	* drops back to zero since it was never really
	1582	* active. We must remove it from the free list
	1583	* before sleeping so that multiple processes do
	1584	* not try to recycle it.
	1585	*/
	1586	lwkt_gettoken(&vp->v_interlock);
	1587	vp->v_usecount--;
	1588	if (VSHOULDFREE(vp))
	1589	vfree(vp);
	1590	else
	1591	vlruvp(vp);
	1592	lwkt_reltoken(&vp->v_interlock);
	1593	}
	1594	return (error);
	1595	}
	1596	lwkt_reltoken(&vp->v_interlock);
	1597	return (0);
	1598	}
	1599
	1600	void
	1601	vref(struct vnode *vp)
	1602	{
	1603	vp->v_usecount++; /* XXX MP */
	1604	}
	1605
	1606	/*
	1607	* Vnode put/release.
	1608	* If count drops to zero, call inactive routine and return to freelist.
	1609	*/
	1610	void
	1611	vrele(struct vnode *vp)
	1612	{
	1613	struct thread td = curthread; / XXX */
	1614
	1615	KASSERT(vp != NULL, ("vrele: null vp"));
	1616
	1617	lwkt_gettoken(&vp->v_interlock);
	1618
	1619	if (vp->v_usecount > 1) {
	1620
	1621	vp->v_usecount--;
	1622	lwkt_reltoken(&vp->v_interlock);
	1623
	1624	return;
	1625	}
	1626
	1627	if (vp->v_usecount == 1) {
	1628	vp->v_usecount--;
	1629	/*
	1630	* We must call VOP_INACTIVE with the node locked.
	1631	* If we are doing a vpu, the node is already locked,
	1632	* but, in the case of vrele, we must explicitly lock
	1633	* the vnode before calling VOP_INACTIVE
	1634	*/
	1635
	1636	if (vn_lock(vp, LK_EXCLUSIVE \| LK_INTERLOCK, td) == 0)
	1637	VOP_INACTIVE(vp, td);
	1638	if (VSHOULDFREE(vp))
	1639	vfree(vp);
	1640	else
	1641	vlruvp(vp);
	1642	} else {
	1643	#ifdef DIAGNOSTIC
	1644	vprint("vrele: negative ref count", vp);
	1645	lwkt_reltoken(&vp->v_interlock);
	1646	#endif
	1647	panic("vrele: negative ref cnt");
	1648	}
	1649	}
	1650
	1651	void
	1652	vput(struct vnode *vp)
	1653	{
	1654	struct thread td = curthread; / XXX */
	1655
	1656	KASSERT(vp != NULL, ("vput: null vp"));
	1657
	1658	lwkt_gettoken(&vp->v_interlock);
	1659
	1660	if (vp->v_usecount > 1) {
	1661	vp->v_usecount--;
	1662	VOP_UNLOCK(vp, LK_INTERLOCK, td);
	1663	return;
	1664	}
	1665
	1666	if (vp->v_usecount == 1) {
	1667	vp->v_usecount--;
	1668	/*
	1669	* We must call VOP_INACTIVE with the node locked.
	1670	* If we are doing a vpu, the node is already locked,
	1671	* so we just need to release the vnode mutex.
	1672	*/
	1673	lwkt_reltoken(&vp->v_interlock);
	1674	VOP_INACTIVE(vp, td);
	1675	if (VSHOULDFREE(vp))
	1676	vfree(vp);
	1677	else
	1678	vlruvp(vp);
	1679	} else {
	1680	#ifdef DIAGNOSTIC
	1681	vprint("vput: negative ref count", vp);
	1682	#endif
	1683	panic("vput: negative ref cnt");
	1684	}
	1685	}
	1686
	1687	/*
	1688	* Somebody doesn't want the vnode recycled.
	1689	*/
	1690	void
	1691	vhold(vp)
	1692	struct vnode *vp;
	1693	{
	1694	int s;
	1695
	1696	s = splbio();
	1697	vp->v_holdcnt++;
	1698	if (VSHOULDBUSY(vp))
	1699	vbusy(vp);
	1700	splx(s);
	1701	}
	1702
	1703	/*
	1704	* One less who cares about this vnode.
	1705	*/
	1706	void
	1707	vdrop(vp)
	1708	struct vnode *vp;
	1709	{
	1710	int s;
	1711
	1712	s = splbio();
	1713	if (vp->v_holdcnt <= 0)
	1714	panic("vdrop: holdcnt");
	1715	vp->v_holdcnt--;
	1716	if (VSHOULDFREE(vp))
	1717	vfree(vp);
	1718	splx(s);
	1719	}
	1720
	1721	/*
	1722	* Remove any vnodes in the vnode table belonging to mount point mp.
	1723	*
	1724	* If FORCECLOSE is not specified, there should not be any active ones,
	1725	* return error if any are found (nb: this is a user error, not a
	1726	* system error). If FORCECLOSE is specified, detach any active vnodes
	1727	* that are found.
	1728	*
	1729	* If WRITECLOSE is set, only flush out regular file vnodes open for
	1730	* writing.
	1731	*
	1732	* SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
	1733	*
	1734	* `rootrefs' specifies the base reference count for the root vnode
	1735	* of this filesystem. The root vnode is considered busy if its
	1736	* v_usecount exceeds this value. On a successful return, vflush()
	1737	* will call vrele() on the root vnode exactly rootrefs times.
	1738	* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
	1739	* be zero.
	1740	*/
	1741	#ifdef DIAGNOSTIC
	1742	static int busyprt = 0; /* print out busy vnodes */
	1743	SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
	1744	#endif
	1745
	1746	int
	1747	vflush(mp, rootrefs, flags)
	1748	struct mount *mp;
	1749	int rootrefs;
	1750	int flags;
	1751	{
	1752	struct thread td = curthread; / XXX */
	1753	struct vnode vp, nvp, *rootvp = NULL;
	1754	struct vattr vattr;
	1755	int busy = 0, error;
	1756
	1757	if (rootrefs > 0) {
	1758	KASSERT((flags & (SKIPSYSTEM \| WRITECLOSE)) == 0,
	1759	("vflush: bad args"));
	1760	/*
	1761	* Get the filesystem root vnode. We can vput() it
	1762	* immediately, since with rootrefs > 0, it won't go away.
	1763	*/
	1764	if ((error = VFS_ROOT(mp, &rootvp)) != 0)
	1765	return (error);
	1766	vput(rootvp);
	1767	}
	1768	lwkt_gettoken(&mntvnode_token);
	1769	loop:
	1770	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
	1771	/*
	1772	* Make sure this vnode wasn't reclaimed in getnewvnode().
	1773	* Start over if it has (it won't be on the list anymore).
	1774	*/
	1775	if (vp->v_mount != mp)
	1776	goto loop;
	1777	nvp = TAILQ_NEXT(vp, v_nmntvnodes);
	1778
	1779	lwkt_gettoken(&vp->v_interlock);
	1780	/*
	1781	* Skip over a vnodes marked VSYSTEM.
	1782	*/
	1783	if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
	1784	lwkt_reltoken(&vp->v_interlock);
	1785	continue;
	1786	}
	1787	/*
	1788	* If WRITECLOSE is set, flush out unlinked but still open
	1789	* files (even if open only for reading) and regular file
	1790	* vnodes open for writing.
	1791	*/
	1792	if ((flags & WRITECLOSE) &&
	1793	(vp->v_type == VNON \|\|
	1794	(VOP_GETATTR(vp, &vattr, td) == 0 &&
	1795	vattr.va_nlink > 0)) &&
	1796	(vp->v_writecount == 0 \|\| vp->v_type != VREG)) {
	1797	lwkt_reltoken(&vp->v_interlock);
	1798	continue;
	1799	}
	1800
	1801	/*
	1802	* With v_usecount == 0, all we need to do is clear out the
	1803	* vnode data structures and we are done.
	1804	*/
	1805	if (vp->v_usecount == 0) {
	1806	lwkt_reltoken(&mntvnode_token);
	1807	vgonel(vp, td);
	1808	lwkt_gettoken(&mntvnode_token);
	1809	continue;
	1810	}
	1811
	1812	/*
	1813	* If FORCECLOSE is set, forcibly close the vnode. For block
	1814	* or character devices, revert to an anonymous device. For
	1815	* all other files, just kill them.
	1816	*/
	1817	if (flags & FORCECLOSE) {
	1818	lwkt_reltoken(&mntvnode_token);
	1819	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	1820	vgonel(vp, td);
	1821	} else {
	1822	vclean(vp, 0, td);
	1823	vp->v_op = spec_vnodeop_p;
	1824	insmntque(vp, (struct mount *) 0);
	1825	}
	1826	lwkt_gettoken(&mntvnode_token);
	1827	continue;
	1828	}
	1829	#ifdef DIAGNOSTIC
	1830	if (busyprt)
	1831	vprint("vflush: busy vnode", vp);
	1832	#endif
	1833	lwkt_reltoken(&vp->v_interlock);
	1834	busy++;
	1835	}
	1836	lwkt_reltoken(&mntvnode_token);
	1837	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
	1838	/*
	1839	* If just the root vnode is busy, and if its refcount
	1840	* is equal to `rootrefs', then go ahead and kill it.
	1841	*/
	1842	lwkt_gettoken(&rootvp->v_interlock);
	1843	KASSERT(busy > 0, ("vflush: not busy"));
	1844	KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
	1845	if (busy == 1 && rootvp->v_usecount == rootrefs) {
	1846	vgonel(rootvp, td);
	1847	busy = 0;
	1848	} else
	1849	lwkt_reltoken(&rootvp->v_interlock);
	1850	}
	1851	if (busy)
	1852	return (EBUSY);
	1853	for (; rootrefs > 0; rootrefs--)
	1854	vrele(rootvp);
	1855	return (0);
	1856	}
	1857
	1858	/*
	1859	* We do not want to recycle the vnode too quickly.
	1860	*
	1861	* XXX we can't move vp's around the nvnodelist without really screwing
	1862	* up the efficiency of filesystem SYNC and friends. This code is
	1863	* disabled until we fix the syncing code's scanning algorithm.
	1864	*/
	1865	static void
	1866	vlruvp(struct vnode *vp)
	1867	{
	1868	#if 0
	1869	struct mount *mp;
	1870
	1871	if ((mp = vp->v_mount) != NULL) {
	1872	lwkt_gettoken(&mntvnode_token);
	1873	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	1874	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	1875	lwkt_reltoken(&mntvnode_token);
	1876	}
	1877	#endif
	1878	}
	1879
	1880	/*
	1881	* Disassociate the underlying file system from a vnode.
	1882	*/
	1883	static void
	1884	vclean(struct vnode vp, int flags, struct thread td)
	1885	{
	1886	int active;
	1887
	1888	/*
	1889	* Check to see if the vnode is in use. If so we have to reference it
	1890	* before we clean it out so that its count cannot fall to zero and
	1891	* generate a race against ourselves to recycle it.
	1892	*/
	1893	if ((active = vp->v_usecount))
	1894	vp->v_usecount++;
	1895
	1896	/*
	1897	* Prevent the vnode from being recycled or brought into use while we
	1898	* clean it out.
	1899	*/
	1900	if (vp->v_flag & VXLOCK)
	1901	panic("vclean: deadlock");
	1902	vp->v_flag \|= VXLOCK;
	1903	vp->v_vxproc = curproc;
	1904	/*
	1905	* Even if the count is zero, the VOP_INACTIVE routine may still
	1906	* have the object locked while it cleans it out. The VOP_LOCK
	1907	* ensures that the VOP_INACTIVE routine is done with its work.
	1908	* For active vnodes, it ensures that no other activity can
	1909	* occur while the underlying object is being cleaned out.
	1910	*/
	1911	VOP_LOCK(vp, LK_DRAIN \| LK_INTERLOCK, td);
	1912
	1913	/*
	1914	* Clean out any buffers associated with the vnode.
	1915	*/
	1916	vinvalbuf(vp, V_SAVE, td, 0, 0);
	1917
	1918	VOP_DESTROYVOBJECT(vp);
	1919
	1920	/*
	1921	* If purging an active vnode, it must be closed and
	1922	* deactivated before being reclaimed. Note that the
	1923	* VOP_INACTIVE will unlock the vnode.
	1924	*/
	1925	if (active) {
	1926	if (flags & DOCLOSE)
	1927	VOP_CLOSE(vp, FNONBLOCK, td);
	1928	VOP_INACTIVE(vp, td);
	1929	} else {
	1930	/*
	1931	* Any other processes trying to obtain this lock must first
	1932	* wait for VXLOCK to clear, then call the new lock operation.
	1933	*/
	1934	VOP_UNLOCK(vp, 0, td);
	1935	}
	1936	/*
	1937	* Reclaim the vnode.
	1938	*/
	1939	if (VOP_RECLAIM(vp, td))
	1940	panic("vclean: cannot reclaim");
	1941
	1942	if (active) {
	1943	/*
	1944	* Inline copy of vrele() since VOP_INACTIVE
	1945	* has already been called.
	1946	*/
	1947	lwkt_gettoken(&vp->v_interlock);
	1948	if (--vp->v_usecount <= 0) {
	1949	#ifdef DIAGNOSTIC
	1950	if (vp->v_usecount < 0 \|\| vp->v_writecount != 0) {
	1951	vprint("vclean: bad ref count", vp);
	1952	panic("vclean: ref cnt");
	1953	}
	1954	#endif
	1955	vfree(vp);
	1956	}
	1957	lwkt_reltoken(&vp->v_interlock);
	1958	}
	1959
	1960	cache_purge(vp);
	1961	vp->v_vnlock = NULL;
	1962
	1963	if (VSHOULDFREE(vp))
	1964	vfree(vp);
	1965
	1966	/*
	1967	* Done with purge, notify sleepers of the grim news.
	1968	*/
	1969	vp->v_op = dead_vnodeop_p;
	1970	vn_pollgone(vp);
	1971	vp->v_tag = VT_NON;
	1972	vp->v_flag &= ~VXLOCK;
	1973	vp->v_vxproc = NULL;
	1974	if (vp->v_flag & VXWANT) {
	1975	vp->v_flag &= ~VXWANT;
	1976	wakeup((caddr_t) vp);
	1977	}
	1978	}
	1979
	1980	/*
	1981	* Eliminate all activity associated with the requested vnode
	1982	* and with all vnodes aliased to the requested vnode.
	1983	*/
	1984	int
	1985	vop_revoke(ap)
	1986	struct vop_revoke_args /* {
	1987	struct vnode *a_vp;
	1988	int a_flags;
	1989	} / ap;
	1990	{
	1991	struct vnode vp, vq;
	1992	dev_t dev;
	1993
	1994	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
	1995
	1996	vp = ap->a_vp;
	1997	/*
	1998	* If a vgone (or vclean) is already in progress,
	1999	* wait until it is done and return.
	2000	*/
	2001	if (vp->v_flag & VXLOCK) {
	2002	vp->v_flag \|= VXWANT;
	2003	lwkt_reltoken(&vp->v_interlock);
	2004	tsleep((caddr_t)vp, 0, "vop_revokeall", 0);
	2005	return (0);
	2006	}
	2007	dev = vp->v_rdev;
	2008	for (;;) {
	2009	lwkt_gettoken(&spechash_token);
	2010	vq = SLIST_FIRST(&dev->si_hlist);
	2011	lwkt_reltoken(&spechash_token);
	2012	if (!vq)
	2013	break;
	2014	vgone(vq);
	2015	}
	2016	return (0);
	2017	}
	2018
	2019	/*
	2020	* Recycle an unused vnode to the front of the free list.
	2021	* Release the passed interlock if the vnode will be recycled.
	2022	*/
	2023	int
	2024	vrecycle(struct vnode vp, struct lwkt_token inter_lkp, struct thread *td)
	2025	{
	2026	lwkt_gettoken(&vp->v_interlock);
	2027	if (vp->v_usecount == 0) {
	2028	if (inter_lkp) {
	2029	lwkt_reltoken(inter_lkp);
	2030	}
	2031	vgonel(vp, td);
	2032	return (1);
	2033	}
	2034	lwkt_reltoken(&vp->v_interlock);
	2035	return (0);
	2036	}
	2037
	2038	/*
	2039	* Eliminate all activity associated with a vnode
	2040	* in preparation for reuse.
	2041	*/
	2042	void
	2043	vgone(struct vnode *vp)
	2044	{
	2045	struct thread td = curthread; / XXX */
	2046
	2047	lwkt_gettoken(&vp->v_interlock);
	2048	vgonel(vp, td);
	2049	}
	2050
	2051	/*
	2052	* vgone, with the vp interlock held.
	2053	*/
	2054	void
	2055	vgonel(struct vnode vp, struct thread td)
	2056	{
	2057	int s;
	2058
	2059	/*
	2060	* If a vgone (or vclean) is already in progress,
	2061	* wait until it is done and return.
	2062	*/
	2063	if (vp->v_flag & VXLOCK) {
	2064	vp->v_flag \|= VXWANT;
	2065	lwkt_reltoken(&vp->v_interlock);
	2066	tsleep((caddr_t)vp, 0, "vgone", 0);
	2067	return;
	2068	}
	2069
	2070	/*
	2071	* Clean out the filesystem specific data.
	2072	*/
	2073	vclean(vp, DOCLOSE, td);
	2074	lwkt_gettoken(&vp->v_interlock);
	2075
	2076	/*
	2077	* Delete from old mount point vnode list, if on one.
	2078	*/
	2079	if (vp->v_mount != NULL)
	2080	insmntque(vp, (struct mount *)0);
	2081	/*
	2082	* If special device, remove it from special device alias list
	2083	* if it is on one.
	2084	*/
	2085	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_rdev != NULL) {
	2086	lwkt_gettoken(&spechash_token);
	2087	SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
	2088	freedev(vp->v_rdev);
	2089	lwkt_reltoken(&spechash_token);
	2090	vp->v_rdev = NULL;
	2091	}
	2092
	2093	/*
	2094	* If it is on the freelist and not already at the head,
	2095	* move it to the head of the list. The test of the
	2096	* VDOOMED flag and the reference count of zero is because
	2097	* it will be removed from the free list by getnewvnode,
	2098	* but will not have its reference count incremented until
	2099	* after calling vgone. If the reference count were
	2100	* incremented first, vgone would (incorrectly) try to
	2101	* close the previous instance of the underlying object.
	2102	*/
	2103	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
	2104	s = splbio();
	2105	lwkt_gettoken(&vnode_free_list_token);
	2106	if (vp->v_flag & VFREE)
	2107	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	2108	else
	2109	freevnodes++;
	2110	vp->v_flag \|= VFREE;
	2111	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	2112	lwkt_reltoken(&vnode_free_list_token);
	2113	splx(s);
	2114	}
	2115
	2116	vp->v_type = VBAD;
	2117	lwkt_reltoken(&vp->v_interlock);
	2118	}
	2119
	2120	/*
	2121	* Lookup a vnode by device number.
	2122	*/
	2123	int
	2124	vfinddev(dev, type, vpp)
	2125	dev_t dev;
	2126	enum vtype type;
	2127	struct vnode **vpp;
	2128	{
	2129	struct vnode *vp;
	2130
	2131	lwkt_gettoken(&spechash_token);
	2132	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	2133	if (type == vp->v_type) {
	2134	*vpp = vp;
	2135	lwkt_reltoken(&spechash_token);
	2136	return (1);
	2137	}
	2138	}
	2139	lwkt_reltoken(&spechash_token);
	2140	return (0);
	2141	}
	2142
	2143	/*
	2144	* Calculate the total number of references to a special device.
	2145	*/
	2146	int
	2147	vcount(vp)
	2148	struct vnode *vp;
	2149	{
	2150	struct vnode *vq;
	2151	int count;
	2152
	2153	count = 0;
	2154	lwkt_gettoken(&spechash_token);
	2155	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
	2156	count += vq->v_usecount;
	2157	lwkt_reltoken(&spechash_token);
	2158	return (count);
	2159	}
	2160
	2161	/*
	2162	* Same as above, but using the dev_t as argument
	2163	*/
	2164
	2165	int
	2166	count_dev(dev)
	2167	dev_t dev;
	2168	{
	2169	struct vnode *vp;
	2170
	2171	vp = SLIST_FIRST(&dev->si_hlist);
	2172	if (vp == NULL)
	2173	return (0);
	2174	return(vcount(vp));
	2175	}
	2176
	2177	/*
	2178	* Print out a description of a vnode.
	2179	*/
	2180	static char *typename[] =
	2181	{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
	2182
	2183	void
	2184	vprint(label, vp)
	2185	char *label;
	2186	struct vnode *vp;
	2187	{
	2188	char buf[96];
	2189
	2190	if (label != NULL)
	2191	printf("%s: %p: ", label, (void *)vp);
	2192	else
	2193	printf("%p: ", (void *)vp);
	2194	printf("type %s, usecount %d, writecount %d, refcount %d,",
	2195	typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	2196	vp->v_holdcnt);
	2197	buf[0] = '\0';
	2198	if (vp->v_flag & VROOT)
	2199	strcat(buf, "\|VROOT");
	2200	if (vp->v_flag & VTEXT)
	2201	strcat(buf, "\|VTEXT");
	2202	if (vp->v_flag & VSYSTEM)
	2203	strcat(buf, "\|VSYSTEM");
	2204	if (vp->v_flag & VXLOCK)
	2205	strcat(buf, "\|VXLOCK");
	2206	if (vp->v_flag & VXWANT)
	2207	strcat(buf, "\|VXWANT");
	2208	if (vp->v_flag & VBWAIT)
	2209	strcat(buf, "\|VBWAIT");
	2210	if (vp->v_flag & VDOOMED)
	2211	strcat(buf, "\|VDOOMED");
	2212	if (vp->v_flag & VFREE)
	2213	strcat(buf, "\|VFREE");
	2214	if (vp->v_flag & VOBJBUF)
	2215	strcat(buf, "\|VOBJBUF");
	2216	if (buf[0] != '\0')
	2217	printf(" flags (%s)", &buf[1]);
	2218	if (vp->v_data == NULL) {
	2219	printf("\n");
	2220	} else {
	2221	printf("\n\t");
	2222	VOP_PRINT(vp);
	2223	}
	2224	}
	2225
	#ifdef DDB
	#include <ddb/ddb.h>
	/*
	 * List all of the locked vnodes in the system.
	 * Called when debugging the kernel.
	 *
	 * Walks the mount list, and for every mount that can be busied
	 * without waiting prints (via vprint) each vnode on its
	 * mnt_nvnodelist for which VOP_ISLOCKED() reports a lock.
	 */
	DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
	{
		struct thread *td = curthread;	/* XXX */
		struct mount *mp, *nmp;
		struct vnode *vp;

		printf("Locked vnodes\n");
		lwkt_gettoken(&mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			/* Mounts that cannot be busied right now are skipped. */
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
				if (VOP_ISLOCKED(vp, NULL))
					vprint((char *)0, vp);
			}
			/*
			 * NOTE(review): the token is re-acquired here, so
			 * vfs_busy() presumably drops it on success - confirm.
			 */
			lwkt_gettoken(&mountlist_token);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		lwkt_reltoken(&mountlist_token);
	}
	#endif
	2256
	2257	/*
	2258	* Top level filesystem related information gathering.
	2259	*/
	2260	static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
	2261
	2262	static int
	2263	vfs_sysctl(SYSCTL_HANDLER_ARGS)
	2264	{
	2265	int name = (int )arg1 - 1; /* XXX */
	2266	u_int namelen = arg2 + 1; /* XXX */
	2267	struct vfsconf *vfsp;
	2268
	2269	#if 1 \|\| defined(COMPAT_PRELITE2)
	2270	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	2271	if (namelen == 1)
	2272	return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
	2273	#endif
	2274
	2275	#ifdef notyet
	2276	/* all sysctl names at this level are at least name and field */
	2277	if (namelen < 2)
	2278	return (ENOTDIR); /* overloaded */
	2279	if (name[0] != VFS_GENERIC) {
	2280	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	2281	if (vfsp->vfc_typenum == name[0])
	2282	break;
	2283	if (vfsp == NULL)
	2284	return (EOPNOTSUPP);
	2285	return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
	2286	oldp, oldlenp, newp, newlen, p));
	2287	}
	2288	#endif
	2289	switch (name[1]) {
	2290	case VFS_MAXTYPENUM:
	2291	if (namelen != 2)
	2292	return (ENOTDIR);
	2293	return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	2294	case VFS_CONF:
	2295	if (namelen != 3)
	2296	return (ENOTDIR); /* overloaded */
	2297	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	2298	if (vfsp->vfc_typenum == name[2])
	2299	break;
	2300	if (vfsp == NULL)
	2301	return (EOPNOTSUPP);
	2302	return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	2303	}
	2304	return (EOPNOTSUPP);
	2305	}
	2306
	2307	SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	2308	"Generic filesystem");
	2309
	2310	#if 1 \|\| defined(COMPAT_PRELITE2)
	2311
	2312	static int
	2313	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
	2314	{
	2315	int error;
	2316	struct vfsconf *vfsp;
	2317	struct ovfsconf ovfs;
	2318
	2319	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	2320	ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
	2321	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	2322	ovfs.vfc_index = vfsp->vfc_typenum;
	2323	ovfs.vfc_refcount = vfsp->vfc_refcount;
	2324	ovfs.vfc_flags = vfsp->vfc_flags;
	2325	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	2326	if (error)
	2327	return error;
	2328	}
	2329	return 0;
	2330	}
	2331
	2332	#endif /* 1 \|\| COMPAT_PRELITE2 */
	2333
	#if 0
	#define KINFO_VNODESLOP	10
	/*
	 * Dump vnode list (via sysctl).
	 * Copyout address of vnode followed by vnode.
	 *
	 * This code is compiled out.  When no output buffer is supplied it
	 * only reports a size estimate of (numvnodes + KINFO_VNODESLOP)
	 * pointer+vnode records.
	 *
	 * NOTE(review): the error return inside the inner loop happens between
	 * vfs_busy() and vfs_unbusy(), so the mount would stay busied on that
	 * path if this code were ever re-enabled.
	 */
	/* ARGSUSED */
	static int
	sysctl_vnode(SYSCTL_HANDLER_ARGS)
	{
		struct proc *p = curproc;	/* XXX */
		struct mount *mp, *nmp;
		struct vnode *nvp, *vp;
		int error;

	#define VPTRSZ	sizeof (struct vnode *)
	#define VNODESZ	sizeof (struct vnode)

		req->lock = 0;
		if (!req->oldptr) /* Make an estimate */
			return (SYSCTL_OUT(req, 0,
				(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

		lwkt_gettoken(&mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_token, p)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
	again:
			lwkt_gettoken(&mntvnode_token);
			for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
			     vp != NULL;
			     vp = nvp) {
				/*
				 * Check that the vp is still associated with
				 * this filesystem.  RACE: could have been
				 * recycled onto the same filesystem.
				 */
				if (vp->v_mount != mp) {
					lwkt_reltoken(&mntvnode_token);
					goto again;
				}
				nvp = TAILQ_NEXT(vp, v_nmntvnodes);
				/* The token is dropped around the copyout. */
				lwkt_reltoken(&mntvnode_token);
				if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
				    (error = SYSCTL_OUT(req, vp, VNODESZ)))
					return (error);
				lwkt_gettoken(&mntvnode_token);
			}
			lwkt_reltoken(&mntvnode_token);
			lwkt_gettoken(&mountlist_token);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, p);
		}
		lwkt_reltoken(&mountlist_token);

		return (0);
	}
	#endif

	/*
	 * XXX
	 * Exporting the vnode list on large systems causes them to crash.
	 * Exporting the vnode list on medium systems causes sysctl to coredump.
	 */
	#if 0
	SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
		0, 0, sysctl_vnode, "S,vnode", "");
	#endif
	2404
	2405	/*
	2406	* Check to see if a filesystem is mounted on a block device.
	2407	*/
	2408	int
	2409	vfs_mountedon(vp)
	2410	struct vnode *vp;
	2411	{
	2412
	2413	if (vp->v_specmountpoint != NULL)
	2414	return (EBUSY);
	2415	return (0);
	2416	}
	2417
	2418	/*
	2419	* Unmount all filesystems. The list is traversed in reverse order
	2420	* of mounting to avoid dependencies.
	2421	*/
	2422	void
	2423	vfs_unmountall()
	2424	{
	2425	struct mount *mp;
	2426	struct thread *td = curthread;
	2427	int error;
	2428
	2429	if (td->td_proc == NULL)
	2430	td = initproc->p_thread; /* XXX XXX use proc0 instead? */
	2431
	2432	/*
	2433	* Since this only runs when rebooting, it is not interlocked.
	2434	*/
	2435	while(!TAILQ_EMPTY(&mountlist)) {
	2436	mp = TAILQ_LAST(&mountlist, mntlist);
	2437	error = dounmount(mp, MNT_FORCE, td);
	2438	if (error) {
	2439	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	2440	printf("unmount of %s failed (",
	2441	mp->mnt_stat.f_mntonname);
	2442	if (error == EBUSY)
	2443	printf("BUSY)\n");
	2444	else
	2445	printf("%d)\n", error);
	2446	} else {
	2447	/* The unmount has removed mp from the mountlist */
	2448	}
	2449	}
	2450	}
	2451
	2452	/*
	2453	* Build hash lists of net addresses and hang them off the mount point.
	2454	* Called by ufs_mount() to set up the lists of export addresses.
	2455	*/
	2456	static int
	2457	vfs_hang_addrlist(mp, nep, argp)
	2458	struct mount *mp;
	2459	struct netexport *nep;
	2460	struct export_args *argp;
	2461	{
	2462	struct netcred *np;
	2463	struct radix_node_head *rnh;
	2464	int i;
	2465	struct radix_node *rn;
	2466	struct sockaddr saddr, smask = 0;
	2467	struct domain *dom;
	2468	int error;
	2469
	2470	if (argp->ex_addrlen == 0) {
	2471	if (mp->mnt_flag & MNT_DEFEXPORTED)
	2472	return (EPERM);
	2473	np = &nep->ne_defexported;
	2474	np->netc_exflags = argp->ex_flags;
	2475	np->netc_anon = argp->ex_anon;
	2476	np->netc_anon.cr_ref = 1;
	2477	mp->mnt_flag \|= MNT_DEFEXPORTED;
	2478	return (0);
	2479	}
	2480
	2481	if (argp->ex_addrlen < 0 \|\| argp->ex_addrlen > MLEN)
	2482	return (EINVAL);
	2483	if (argp->ex_masklen < 0 \|\| argp->ex_masklen > MLEN)
	2484	return (EINVAL);
	2485
	2486	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	2487	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	2488	bzero((caddr_t) np, i);
	2489	saddr = (struct sockaddr *) (np + 1);
	2490	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
	2491	goto out;
	2492	if (saddr->sa_len > argp->ex_addrlen)
	2493	saddr->sa_len = argp->ex_addrlen;
	2494	if (argp->ex_masklen) {
	2495	smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
	2496	error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
	2497	if (error)
	2498	goto out;
	2499	if (smask->sa_len > argp->ex_masklen)
	2500	smask->sa_len = argp->ex_masklen;
	2501	}
	2502	i = saddr->sa_family;
	2503	if ((rnh = nep->ne_rtable[i]) == 0) {
	2504	/*
	2505	* Seems silly to initialize every AF when most are not used,
	2506	* do so on demand here
	2507	*/
	2508	for (dom = domains; dom; dom = dom->dom_next)
	2509	if (dom->dom_family == i && dom->dom_rtattach) {
	2510	dom->dom_rtattach((void **) &nep->ne_rtable[i],
	2511	dom->dom_rtoffset);
	2512	break;
	2513	}
	2514	if ((rnh = nep->ne_rtable[i]) == 0) {
	2515	error = ENOBUFS;
	2516	goto out;
	2517	}
	2518	}
	2519	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	2520	np->netc_rnodes);
	2521	if (rn == 0 \|\| np != (struct netcred ) rn) { / already exists */
	2522	error = EPERM;
	2523	goto out;
	2524	}
	2525	np->netc_exflags = argp->ex_flags;
	2526	np->netc_anon = argp->ex_anon;
	2527	np->netc_anon.cr_ref = 1;
	2528	return (0);
	2529	out:
	2530	free(np, M_NETADDR);
	2531	return (error);
	2532	}
	2533
	2534	/* ARGSUSED */
	2535	static int
	2536	vfs_free_netcred(rn, w)
	2537	struct radix_node *rn;
	2538	void *w;
	2539	{
	2540	struct radix_node_head rnh = (struct radix_node_head ) w;
	2541
	2542	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	2543	free((caddr_t) rn, M_NETADDR);
	2544	return (0);
	2545	}
	2546
	2547	/*
	2548	* Free the net address hash lists that are hanging off the mount points.
	2549	*/
	2550	static void
	2551	vfs_free_addrlist(nep)
	2552	struct netexport *nep;
	2553	{
	2554	int i;
	2555	struct radix_node_head *rnh;
	2556
	2557	for (i = 0; i <= AF_MAX; i++)
	2558	if ((rnh = nep->ne_rtable[i])) {
	2559	(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
	2560	(caddr_t) rnh);
	2561	free((caddr_t) rnh, M_RTABLE);
	2562	nep->ne_rtable[i] = 0;
	2563	}
	2564	}
	2565
	2566	int
	2567	vfs_export(mp, nep, argp)
	2568	struct mount *mp;
	2569	struct netexport *nep;
	2570	struct export_args *argp;
	2571	{
	2572	int error;
	2573
	2574	if (argp->ex_flags & MNT_DELEXPORT) {
	2575	if (mp->mnt_flag & MNT_EXPUBLIC) {
	2576	vfs_setpublicfs(NULL, NULL, NULL);
	2577	mp->mnt_flag &= ~MNT_EXPUBLIC;
	2578	}
	2579	vfs_free_addrlist(nep);
	2580	mp->mnt_flag &= ~(MNT_EXPORTED \| MNT_DEFEXPORTED);
	2581	}
	2582	if (argp->ex_flags & MNT_EXPORTED) {
	2583	if (argp->ex_flags & MNT_EXPUBLIC) {
	2584	if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
	2585	return (error);
	2586	mp->mnt_flag \|= MNT_EXPUBLIC;
	2587	}
	2588	if ((error = vfs_hang_addrlist(mp, nep, argp)))
	2589	return (error);
	2590	mp->mnt_flag \|= MNT_EXPORTED;
	2591	}
	2592	return (0);
	2593	}
	2594
	2595
	2596	/*
	2597	* Set the publicly exported filesystem (WebNFS). Currently, only
	2598	* one public filesystem is possible in the spec (RFC 2054 and 2055)
	2599	*/
	2600	int
	2601	vfs_setpublicfs(mp, nep, argp)
	2602	struct mount *mp;
	2603	struct netexport *nep;
	2604	struct export_args *argp;
	2605	{
	2606	int error;
	2607	struct vnode *rvp;
	2608	char *cp;
	2609
	2610	/*
	2611	* mp == NULL -> invalidate the current info, the FS is
	2612	* no longer exported. May be called from either vfs_export
	2613	* or unmount, so check if it hasn't already been done.
	2614	*/
	2615	if (mp == NULL) {
	2616	if (nfs_pub.np_valid) {
	2617	nfs_pub.np_valid = 0;
	2618	if (nfs_pub.np_index != NULL) {
	2619	FREE(nfs_pub.np_index, M_TEMP);
	2620	nfs_pub.np_index = NULL;
	2621	}
	2622	}
	2623	return (0);
	2624	}
	2625
	2626	/*
	2627	* Only one allowed at a time.
	2628	*/
	2629	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
	2630	return (EBUSY);
	2631
	2632	/*
	2633	* Get real filehandle for root of exported FS.
	2634	*/
	2635	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	2636	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
	2637
	2638	if ((error = VFS_ROOT(mp, &rvp)))
	2639	return (error);
	2640
	2641	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
	2642	return (error);
	2643
	2644	vput(rvp);
	2645
	2646	/*
	2647	* If an indexfile was specified, pull it in.
	2648	*/
	2649	if (argp->ex_indexfile != NULL) {
	2650	MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
	2651	M_WAITOK);
	2652	error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
	2653	MAXNAMLEN, (size_t *)0);
	2654	if (!error) {
	2655	/*
	2656	* Check for illegal filenames.
	2657	*/
	2658	for (cp = nfs_pub.np_index; *cp; cp++) {
	2659	if (*cp == '/') {
	2660	error = EINVAL;
	2661	break;
	2662	}
	2663	}
	2664	}
	2665	if (error) {
	2666	FREE(nfs_pub.np_index, M_TEMP);
	2667	return (error);
	2668	}
	2669	}
	2670
	2671	nfs_pub.np_mount = mp;
	2672	nfs_pub.np_valid = 1;
	2673	return (0);
	2674	}
	2675
	2676	struct netcred *
	2677	vfs_export_lookup(mp, nep, nam)
	2678	struct mount *mp;
	2679	struct netexport *nep;
	2680	struct sockaddr *nam;
	2681	{
	2682	struct netcred *np;
	2683	struct radix_node_head *rnh;
	2684	struct sockaddr *saddr;
	2685
	2686	np = NULL;
	2687	if (mp->mnt_flag & MNT_EXPORTED) {
	2688	/*
	2689	* Lookup in the export list first.
	2690	*/
	2691	if (nam != NULL) {
	2692	saddr = nam;
	2693	rnh = nep->ne_rtable[saddr->sa_family];
	2694	if (rnh != NULL) {
	2695	np = (struct netcred *)
	2696	(*rnh->rnh_matchaddr)((caddr_t)saddr,
	2697	rnh);
	2698	if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
	2699	np = NULL;
	2700	}
	2701	}
	2702	/*
	2703	* If no address match, use the default if it exists.
	2704	*/
	2705	if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
	2706	np = &nep->ne_defexported;
	2707	}
	2708	return (np);
	2709	}
	2710
	2711	/*
	2712	* perform msync on all vnodes under a mount point
	2713	* the mount point must be locked.
	2714	*/
	2715	void
	2716	vfs_msync(struct mount *mp, int flags)
	2717	{
	2718	struct thread td = curthread; / XXX */
	2719	struct vnode vp, nvp;
	2720	struct vm_object *obj;
	2721	int tries;
	2722
	2723	tries = 5;
	2724	lwkt_gettoken(&mntvnode_token);
	2725	loop:
	2726	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
	2727	if (vp->v_mount != mp) {
	2728	if (--tries > 0)
	2729	goto loop;
	2730	break;
	2731	}
	2732	nvp = TAILQ_NEXT(vp, v_nmntvnodes);
	2733
	2734	if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
	2735	continue;
	2736
	2737	/*
	2738	* There could be hundreds of thousands of vnodes, we cannot
	2739	* afford to do anything heavy-weight until we have a fairly
	2740	* good indication that there is something to do.
	2741	*/
	2742	if ((vp->v_flag & VOBJDIRTY) &&
	2743	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	2744	lwkt_reltoken(&mntvnode_token);
	2745	if (!vget(vp,
	2746	LK_EXCLUSIVE \| LK_RETRY \| LK_NOOBJ, td)) {
	2747	if (VOP_GETVOBJECT(vp, &obj) == 0) {
	2748	vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
	2749	}
	2750	vput(vp);
	2751	}
	2752	lwkt_gettoken(&mntvnode_token);
	2753	if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
	2754	if (--tries > 0)
	2755	goto loop;
	2756	break;
	2757	}
	2758	}
	2759	}
	2760	lwkt_reltoken(&mntvnode_token);
	2761	}
	2762
	/*
	 * Create the VM object needed for VMIO and mmap support.  This
	 * is done for all VREG files in the system.  Some filesystems might
	 * afford the additional metadata buffering capability of the
	 * VMIO code by making the device node be VMIO mode also.
	 *
	 * vp must be locked when vfs_object_create is called.
	 *
	 * Thin wrapper: returns whatever VOP_CREATEVOBJECT(vp, td) returns.
	 */
	int
	vfs_object_create(struct vnode *vp, struct thread *td)
	{
		return (VOP_CREATEVOBJECT(vp, td));
	}
	2776
	2777	void
	2778	vfree(vp)
	2779	struct vnode *vp;
	2780	{
	2781	int s;
	2782
	2783	s = splbio();
	2784	lwkt_gettoken(&vnode_free_list_token);
	2785	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	2786	if (vp->v_flag & VAGE) {
	2787	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	2788	} else {
	2789	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	2790	}
	2791	freevnodes++;
	2792	lwkt_reltoken(&vnode_free_list_token);
	2793	vp->v_flag &= ~VAGE;
	2794	vp->v_flag \|= VFREE;
	2795	splx(s);
	2796	}
	2797
	2798	void
	2799	vbusy(vp)
	2800	struct vnode *vp;
	2801	{
	2802	int s;
	2803
	2804	s = splbio();
	2805	lwkt_gettoken(&vnode_free_list_token);
	2806	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	2807	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	2808	freevnodes--;
	2809	lwkt_reltoken(&vnode_free_list_token);
	2810	vp->v_flag &= ~(VFREE\|VAGE);
	2811	splx(s);
	2812	}
	2813
	2814	/*
	2815	* Record a process's interest in events which might happen to
	2816	* a vnode. Because poll uses the historic select-style interface
	2817	* internally, this routine serves as both the ``check for any
	2818	* pending events'' and the ``record my interest in future events''
	2819	* functions. (These are done together, while the lock is held,
	2820	* to avoid race conditions.)
	2821	*/
	2822	int
	2823	vn_pollrecord(struct vnode vp, struct thread td, int events)
	2824	{
	2825	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2826	if (vp->v_pollinfo.vpi_revents & events) {
	2827	/*
	2828	* This leaves events we are not interested
	2829	* in available for the other process which
	2830	* which presumably had requested them
	2831	* (otherwise they would never have been
	2832	* recorded).
	2833	*/
	2834	events &= vp->v_pollinfo.vpi_revents;
	2835	vp->v_pollinfo.vpi_revents &= ~events;
	2836
	2837	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2838	return events;
	2839	}
	2840	vp->v_pollinfo.vpi_events \|= events;
	2841	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	2842	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2843	return 0;
	2844	}
	2845
	2846	/*
	2847	* Note the occurrence of an event. If the VN_POLLEVENT macro is used,
	2848	* it is possible for us to miss an event due to race conditions, but
	2849	* that condition is expected to be rare, so for the moment it is the
	2850	* preferred interface.
	2851	*/
	2852	void
	2853	vn_pollevent(vp, events)
	2854	struct vnode *vp;
	2855	short events;
	2856	{
	2857	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2858	if (vp->v_pollinfo.vpi_events & events) {
	2859	/*
	2860	* We clear vpi_events so that we don't
	2861	* call selwakeup() twice if two events are
	2862	* posted before the polling process(es) is
	2863	* awakened. This also ensures that we take at
	2864	* most one selwakeup() if the polling process
	2865	* is no longer interested. However, it does
	2866	* mean that only one event can be noticed at
	2867	* a time. (Perhaps we should only clear those
	2868	* event bits which we note?) XXX
	2869	*/
	2870	vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
	2871	vp->v_pollinfo.vpi_revents \|= events;
	2872	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	2873	}
	2874	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2875	}
	2876
	2877	/*
	2878	* Wake up anyone polling on vp because it is being revoked.
	2879	* This depends on dead_poll() returning POLLHUP for correct
	2880	* behavior.
	2881	*/
	2882	void
	2883	vn_pollgone(vp)
	2884	struct vnode *vp;
	2885	{
	2886	lwkt_gettoken(&vp->v_pollinfo.vpi_token);
	2887	if (vp->v_pollinfo.vpi_events) {
	2888	vp->v_pollinfo.vpi_events = 0;
	2889	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	2890	}
	2891	lwkt_reltoken(&vp->v_pollinfo.vpi_token);
	2892	}
	2893
	2894
	2895
	2896	/*
	2897	* Routine to create and manage a filesystem syncer vnode.
	2898	*/
	2899	#define sync_close ((int () (struct vop_close_args ))nullop)
	2900	static int sync_fsync (struct vop_fsync_args *);
	2901	static int sync_inactive (struct vop_inactive_args *);
	2902	static int sync_reclaim (struct vop_reclaim_args *);
	2903	#define sync_lock ((int () (struct vop_lock_args ))vop_nolock)
	2904	#define sync_unlock ((int () (struct vop_unlock_args ))vop_nounlock)
	2905	static int sync_print (struct vop_print_args *);
	2906	#define sync_islocked ((int() (struct vop_islocked_args ))vop_noislocked)
	2907
	2908	static vop_t **sync_vnodeop_p;
	2909	static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	2910	{ &vop_default_desc, (vop_t *) vop_eopnotsupp },
	2911	{ &vop_close_desc, (vop_t ) sync_close }, / close */
	2912	{ &vop_fsync_desc, (vop_t ) sync_fsync }, / fsync */
	2913	{ &vop_inactive_desc, (vop_t ) sync_inactive }, / inactive */
	2914	{ &vop_reclaim_desc, (vop_t ) sync_reclaim }, / reclaim */
	2915	{ &vop_lock_desc, (vop_t ) sync_lock }, / lock */
	2916	{ &vop_unlock_desc, (vop_t ) sync_unlock }, / unlock */
	2917	{ &vop_print_desc, (vop_t ) sync_print }, / print */
	2918	{ &vop_islocked_desc, (vop_t ) sync_islocked }, / islocked */
	2919	{ NULL, NULL }
	2920	};
	2921	static struct vnodeopv_desc sync_vnodeop_opv_desc =
	2922	{ &sync_vnodeop_p, sync_vnodeop_entries };
	2923
	2924	VNODEOP_SET(sync_vnodeop_opv_desc);
	2925
	2926	/*
	2927	* Create a new filesystem syncer vnode for the specified mount point.
	2928	*/
	2929	int
	2930	vfs_allocate_syncvnode(mp)
	2931	struct mount *mp;
	2932	{
	2933	struct vnode *vp;
	2934	static long start, incr, next;
	2935	int error;
	2936
	2937	/* Allocate a new vnode */
	2938	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
	2939	mp->mnt_syncer = NULL;
	2940	return (error);
	2941	}
	2942	vp->v_type = VNON;
	2943	/*
	2944	* Place the vnode onto the syncer worklist. We attempt to
	2945	* scatter them about on the list so that they will go off
	2946	* at evenly distributed times even if all the filesystems
	2947	* are mounted at once.
	2948	*/
	2949	next += incr;
	2950	if (next == 0 \|\| next > syncer_maxdelay) {
	2951	start /= 2;
	2952	incr /= 2;
	2953	if (start == 0) {
	2954	start = syncer_maxdelay / 2;
	2955	incr = syncer_maxdelay;
	2956	}
	2957	next = start;
	2958	}
	2959	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	2960	mp->mnt_syncer = vp;
	2961	return (0);
	2962	}
	2963
	2964	/*
	2965	* Do a lazy sync of the filesystem.
	2966	*/
	2967	static int
	2968	sync_fsync(ap)
	2969	struct vop_fsync_args /* {
	2970	struct vnode *a_vp;
	2971	struct ucred *a_cred;
	2972	int a_waitfor;
	2973	struct thread *a_td;
	2974	} / ap;
	2975	{
	2976	struct vnode *syncvp = ap->a_vp;
	2977	struct mount *mp = syncvp->v_mount;
	2978	struct thread *td = ap->a_td;
	2979	int asyncflag;
	2980
	2981	/*
	2982	* We only need to do something if this is a lazy evaluation.
	2983	*/
	2984	if (ap->a_waitfor != MNT_LAZY)
	2985	return (0);
	2986
	2987	/*
	2988	* Move ourselves to the back of the sync list.
	2989	*/
	2990	vn_syncer_add_to_worklist(syncvp, syncdelay);
	2991
	2992	/*
	2993	* Walk the list of vnodes pushing all that are dirty and
	2994	* not already on the sync list.
	2995	*/
	2996	lwkt_gettoken(&mountlist_token);
	2997	if (vfs_busy(mp, LK_EXCLUSIVE \| LK_NOWAIT, &mountlist_token, td) != 0) {
	2998	lwkt_reltoken(&mountlist_token);
	2999	return (0);
	3000	}
	3001	asyncflag = mp->mnt_flag & MNT_ASYNC;
	3002	mp->mnt_flag &= ~MNT_ASYNC;
	3003	vfs_msync(mp, MNT_NOWAIT);
	3004	VFS_SYNC(mp, MNT_LAZY, td);
	3005	if (asyncflag)
	3006	mp->mnt_flag \|= MNT_ASYNC;
	3007	vfs_unbusy(mp, td);
	3008	return (0);
	3009	}
	3010
	3011	/*
	3012	* The syncer vnode is no referenced.
	3013	*/
	3014	static int
	3015	sync_inactive(ap)
	3016	struct vop_inactive_args /* {
	3017	struct vnode *a_vp;
	3018	struct proc *a_p;
	3019	} / ap;
	3020	{
	3021
	3022	vgone(ap->a_vp);
	3023	return (0);
	3024	}
	3025
	3026	/*
	3027	* The syncer vnode is no longer needed and is being decommissioned.
	3028	*
	3029	* Modifications to the worklist must be protected at splbio().
	3030	*/
	3031	static int
	3032	sync_reclaim(ap)
	3033	struct vop_reclaim_args /* {
	3034	struct vnode *a_vp;
	3035	} / ap;
	3036	{
	3037	struct vnode *vp = ap->a_vp;
	3038	int s;
	3039
	3040	s = splbio();
	3041	vp->v_mount->mnt_syncer = NULL;
	3042	if (vp->v_flag & VONWORKLST) {
	3043	LIST_REMOVE(vp, v_synclist);
	3044	vp->v_flag &= ~VONWORKLST;
	3045	}
	3046	splx(s);
	3047
	3048	return (0);
	3049	}
	3050
	3051	/*
	3052	* Print out a syncer vnode.
	3053	*/
	3054	static int
	3055	sync_print(ap)
	3056	struct vop_print_args /* {
	3057	struct vnode *a_vp;
	3058	} / ap;
	3059	{
	3060	struct vnode *vp = ap->a_vp;
	3061
	3062	printf("syncer vnode");
	3063	if (vp->v_vnlock != NULL)
	3064	lockmgr_printinfo(vp->v_vnlock);
	3065	printf("\n");
	3066	return (0);
	3067	}
	3068
	3069	/*
	3070	* extract the dev_t from a VBLK or VCHR
	3071	*/
	3072	dev_t
	3073	vn_todev(vp)
	3074	struct vnode *vp;
	3075	{
	3076	if (vp->v_type != VBLK && vp->v_type != VCHR)
	3077	return (NODEV);
	3078	return (vp->v_rdev);
	3079	}
	3080
	3081	/*
	3082	* Check if vnode represents a disk device
	3083	*/
	3084	int
	3085	vn_isdisk(vp, errp)
	3086	struct vnode *vp;
	3087	int *errp;
	3088	{
	3089	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	3090	if (errp != NULL)
	3091	*errp = ENOTBLK;
	3092	return (0);
	3093	}
	3094	if (vp->v_rdev == NULL) {
	3095	if (errp != NULL)
	3096	*errp = ENXIO;
	3097	return (0);
	3098	}
	3099	if (!dev_dport(vp->v_rdev)) {
	3100	if (errp != NULL)
	3101	*errp = ENXIO;
	3102	return (0);
	3103	}
	3104	if (!(dev_dflags(vp->v_rdev) & D_DISK)) {
	3105	if (errp != NULL)
	3106	*errp = ENOTBLK;
	3107	return (0);
	3108	}
	3109	if (errp != NULL)
	3110	*errp = 0;
	3111	return (1);
	3112	}
	3113
	3114	void
	3115	NDFREE(ndp, flags)
	3116	struct nameidata *ndp;
	3117	const uint flags;
	3118	{
	3119	if (!(flags & NDF_NO_FREE_PNBUF) &&
	3120	(ndp->ni_cnd.cn_flags & CNP_HASBUF)) {
	3121	zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
	3122	ndp->ni_cnd.cn_flags &= ~CNP_HASBUF;
	3123	}
	3124	if (!(flags & NDF_NO_DNCP_RELE) &&
	3125	(ndp->ni_cnd.cn_flags & CNP_WANTDNCP) &&
	3126	ndp->ni_dncp) {
	3127	cache_drop(ndp->ni_dncp);
	3128	ndp->ni_dncp = NULL;
	3129	}
	3130	if (!(flags & NDF_NO_NCP_RELE) &&
	3131	(ndp->ni_cnd.cn_flags & CNP_WANTNCP) &&
	3132	ndp->ni_ncp) {
	3133	cache_drop(ndp->ni_ncp);
	3134	ndp->ni_ncp = NULL;
	3135	}
	3136	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	3137	(ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) &&
	3138	ndp->ni_dvp != ndp->ni_vp) {
	3139	VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_td);
	3140	}
	3141	if (!(flags & NDF_NO_DVP_RELE) &&
	3142	(ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT\|CNP_WANTPARENT))) {
	3143	vrele(ndp->ni_dvp);
	3144	ndp->ni_dvp = NULL;
	3145	}
	3146	if (!(flags & NDF_NO_VP_UNLOCK) &&
	3147	(ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) {
	3148	VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_td);
	3149	}
	3150	if (!(flags & NDF_NO_VP_RELE) &&
	3151	ndp->ni_vp) {
	3152	vrele(ndp->ni_vp);
	3153	ndp->ni_vp = NULL;
	3154	}
	3155	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	3156	(ndp->ni_cnd.cn_flags & CNP_SAVESTART)) {
	3157	vrele(ndp->ni_startdir);
	3158	ndp->ni_startdir = NULL;
	3159	}
	3160	}
	3161