Rename printf -> kprintf in sys/ and add some defines where necessary
[dragonfly.git] / sys / kern / vfs_mount.c
/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_mount.c,v 1.24 2006/12/23 00:35:04 swildner Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

struct mountscan_info {
	TAILQ_ENTRY(mountscan_info) msi_entry;
	int msi_how;
	struct mount *msi_node;
};

struct vmntvnodescan_info {
	TAILQ_ENTRY(vmntvnodescan_info) entry;
	struct vnode *vp;
};

static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RD,
	   &vnlru_nowhere, 0,
	   "Number of times the vnlru process ran without success");

static struct lwkt_token mntid_token;

static struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
static TAILQ_HEAD(,mountscan_info) mountscan_list;
static struct lwkt_token mountlist_token;
static TAILQ_HEAD(,vmntvnodescan_info) mntvnodescan_list;
struct lwkt_token mntvnode_token;

/*
 * Called from vfsinit()
 */
void
vfs_mount_init(void)
{
	lwkt_token_init(&mountlist_token);
	lwkt_token_init(&mntvnode_token);
	lwkt_token_init(&mntid_token);
	TAILQ_INIT(&mountscan_list);
	TAILQ_INIT(&mntvnodescan_list);
}
/*
 * Support function called with mntvnode_token held to remove a vnode
 * from the mountlist.  We must update any list scans which are in progress.
 */
static void
vremovevnodemnt(struct vnode *vp)
{
	struct vmntvnodescan_info *info;

	TAILQ_FOREACH(info, &mntvnodescan_list, entry) {
		if (info->vp == vp)
			info->vp = TAILQ_NEXT(vp, v_nmntvnodes);
	}
	TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
}

/*
 * Support function called with mntvnode_token held to move a vnode to
 * the end of the list.
 */
static void
vmovevnodetoend(struct mount *mp, struct vnode *vp)
{
	vremovevnodemnt(vp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
}

/*
 * Allocate a new vnode and associate it with a tag, mount point, and
 * operations vector.
 *
 * A VX locked and refd vnode is returned.  The caller should setup the
 * remaining fields and vx_put() or, if he wishes to leave a vref,
 * vx_unlock() the vnode.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp,
	    struct vnode **vpp, int lktimeout, int lkflags)
{
	struct vnode *vp;

	KKASSERT(mp != NULL);

	vp = allocvnode(lktimeout, lkflags);
	vp->v_tag = tag;
	vp->v_data = NULL;

	/*
	 * By default the vnode is assigned the mount point's normal
	 * operations vector.
	 */
	vp->v_ops = &mp->mnt_vn_use_ops;

	/*
	 * Placing the vnode on the mount point's queue makes it visible.
	 * VNON prevents it from being messed with, however.
	 */
	insmntque(vp, mp);

	/*
	 * A VX locked & refd vnode is returned.
	 */
	*vpp = vp;
	return (0);
}
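
/*
 * Illustrative, non-compiled sketch of the caller pattern described
 * above: the filesystem fills in the fields it owns and then either
 * publishes the vnode with vx_unlock() (keeping the ref) or drops it
 * entirely with vx_put().  The v_data value shown is hypothetical.
 */
#if 0
	struct vnode *vp;
	int error;

	error = getnewvnode(VT_UFS, mp, &vp, VLKTIMEOUT, 0);
	if (error == 0) {
		vp->v_type = VREG;
		vp->v_data = myfs_node;		/* hypothetical fs data */
		vx_unlock(vp);			/* leave the vref for caller */
	}
#endif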

/*
 * This function creates vnodes with special operations vectors.  The
 * mount point is optional.
 *
 * This routine is being phased out.
 */
int
getspecialvnode(enum vtagtype tag, struct mount *mp,
		struct vop_ops **ops,
		struct vnode **vpp, int lktimeout, int lkflags)
{
	struct vnode *vp;

	vp = allocvnode(lktimeout, lkflags);
	vp->v_tag = tag;
	vp->v_data = NULL;
	vp->v_ops = ops;

	/*
	 * Placing the vnode on the mount point's queue makes it visible.
	 * VNON prevents it from being messed with, however.
	 */
	insmntque(vp, mp);

	/*
	 * A VX locked & refd vnode is returned.
	 */
	*vpp = vp;
	return (0);
}

/*
 * Interlock against an unmount, return 0 on success, non-zero on failure.
 *
 * The passed flag may be 0 or LK_NOWAIT and is only used if an unmount
 * is in-progress.
 *
 * If no unmount is in-progress LK_NOWAIT is ignored.  No other flag bits
 * are used.  A shared lock will be obtained and the filesystem will not
 * be unmountable until the lock is released.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		/* XXX not MP safe */
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (lockmgr(&mp->mnt_lock, lkflags))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	lockmgr(&mp->mnt_lock, LK_RELEASE);
}
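
/*
 * Illustrative, non-compiled sketch of the interlock pattern described
 * above: with LK_NOWAIT the attempt fails immediately if an unmount is
 * already in progress, otherwise the mount cannot be unmounted until
 * vfs_unbusy() releases the shared lock.
 */
#if 0
	if (vfs_busy(mp, LK_NOWAIT) != 0)
		return (ENOENT);	/* being unmounted, skip it */
	/* ... safely operate on mp here ... */
	vfs_unbusy(mp);
#endif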

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	}
	if (vfsp == NULL)
		return (ENODEV);
	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, "vfslock", VLKTIMEOUT, 0);
	vfs_busy(mp, LK_NOWAIT);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	TAILQ_INIT(&mp->mnt_jlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			break;
		}
	}
	lwkt_reltoken(&ilock);
	return (mp);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_int16_t mntid_base;
	lwkt_tokref ilock;
	fsid_t tfsid;
	int mtype;

	lwkt_gettoken(&ilock, &mntid_token);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	lwkt_reltoken(&ilock);
}
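
/*
 * Note on the loop above: the synthesized minor number places the
 * filesystem type in bits 24-31, the high byte of mntid_base in bits
 * 16-23 and its low byte in bits 0-7.  Assuming the usual encoding where
 * the major (255 here) occupies bits 8-15 of the udev, only the low byte
 * of mntid_base lands in the low 16 bits of val[0], which is why val[0]
 * is unique mod 2^16 only for the first 2^8 calls.
 */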

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */

/*
 * This is a quick non-blocking check to determine if the vnode is a good
 * candidate for being (eventually) vgone()'d.  Returns 0 if the vnode is
 * not a good candidate, 1 if it is.
 *
 * Note that a vnode can be marked VFREE without really being free, so
 * we don't use the flag for any tests.
 */
static __inline int
vmightfree(struct vnode *vp, int page_count)
{
	if (vp->v_flag & VRECLAIMED)
		return (0);
#if 0
	if ((vp->v_flag & VFREE) && TAILQ_EMPTY(&vp->v_namecache))
		return (0);
#endif
	if (vp->v_usecount != 0)
		return (0);
	if (vp->v_object && vp->v_object->resident_page_count >= page_count)
		return (0);
	return (1);
}

/*
 * The vnode was found to be possibly vgone()able and the caller has locked it
 * (thus the usecount should be 1 now).  Determine if the vnode is actually
 * vgone()able, doing some cleanups in the process.  Returns 1 if the vnode
 * can be vgone()'d, 0 otherwise.
 *
 * Note that v_holdcnt may be non-zero because (A) this vnode is not a leaf
 * in the namecache topology and (B) this vnode has buffer cache bufs.
 * We cannot remove vnodes with non-leaf namecache associations.  We do a
 * tentative leaf check prior to attempting to flush out any buffers but the
 * 'real' test when all is said and done is that v_holdcnt must become 0 for
 * the vnode to be freeable.
 *
 * We could theoretically just unconditionally flush when v_holdcnt != 0,
 * but flushing data associated with non-leaf nodes (which are always
 * directories) just throws it away for no benefit.  It is the buffer
 * cache's responsibility to choose buffers to recycle from the cached
 * data point of view.
 */
static int
visleaf(struct vnode *vp)
{
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (!TAILQ_EMPTY(&ncp->nc_list))
			return(0);
	}
	return(1);
}

/*
 * Try to clean up the vnode to the point where it can be vgone()'d, returning
 * 0 if it cannot be vgone()'d (or already has been), 1 if it can.  Unlike
 * vmightfree() this routine may flush the vnode and block.  Vnodes marked
 * VFREE are still candidates for vgone()ing because they may hold namecache
 * resources and could be blocking the namecache directory hierarchy (and
 * related vnodes) from being freed.
 */
static int
vtrytomakegoneable(struct vnode *vp, int page_count)
{
	if (vp->v_flag & VRECLAIMED)
		return (0);
	if (vp->v_usecount != 1)
		return (0);
	if (vp->v_object && vp->v_object->resident_page_count >= page_count)
		return (0);
	if (vp->v_holdcnt && visleaf(vp)) {
		vinvalbuf(vp, V_SAVE, 0, 0);
#if 0	/* DEBUG */
		kprintf((vp->v_holdcnt ? "vrecycle: vp %p failed: %s\n" :
			"vrecycle: vp %p succeeded: %s\n"), vp,
			(TAILQ_FIRST(&vp->v_namecache) ?
			 TAILQ_FIRST(&vp->v_namecache)->nc_name : "?"));
#endif
	}
	return(vp->v_usecount == 1 && vp->v_holdcnt == 0);
}

/*
 * Reclaim up to 1/10 of the vnodes associated with a mount point.  Try
 * to avoid vnodes which have lots of resident pages (we are trying to free
 * vnodes, not memory).
 *
 * This routine is a callback from the mountlist scan.  The mount point
 * in question will be busied.
 */
static int
vlrureclaim(struct mount *mp, void *data)
{
	struct vnode *vp;
	lwkt_tokref ilock;
	int done;
	int trigger;
	int usevnodes;
	int count;
	int trigger_mult = vnlru_nowhere;

	/*
	 * Calculate the trigger point for the resident pages check.  The
	 * minimum trigger value is approximately the number of pages in
	 * the system divided by the number of vnodes.  However, due to
	 * various other system memory overheads unrelated to data caching
	 * it is a good idea to double the trigger (at least).
	 *
	 * trigger_mult starts at 0.  If the recycler is having problems
	 * finding enough freeable vnodes it will increase trigger_mult.
	 * This should not happen in normal operation, even on machines with
	 * low amounts of memory, but extraordinary memory use by the system
	 * versus the amount of cached data can trigger it.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = vmstats.v_page_count * (trigger_mult + 2) / usevnodes;
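
	/*
	 * Worked example with illustrative figures: with 256000 resident
	 * pages, desiredvnodes at 100000 and trigger_mult still 0, the
	 * trigger is 256000 * 2 / 100000 = 5, so vnodes holding 5 or more
	 * resident pages are passed over by the checks below.
	 */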

	done = 0;
	lwkt_gettoken(&ilock, &mntvnode_token);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		/*
		 * __VNODESCAN__
		 *
		 * The VP will stick around while we hold mntvnode_token,
		 * at least until we block, so we can safely do an initial
		 * check, and then must check again after we lock the vnode.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    !vmightfree(vp, trigger)	/* critical path opt */
		) {
			vmovevnodetoend(mp, vp);
			--count;
			continue;
		}

		/*
		 * VX get the candidate vnode.  If the VX get fails the
		 * vnode might still be on the mountlist.  Our loop depends
		 * on us at least cycling the vnode to the end of the
		 * mountlist.
		 */
		if (vx_get_nonblock(vp) != 0) {
			if (vp->v_mount == mp)
				vmovevnodetoend(mp, vp);
			--count;
			continue;
		}

		/*
		 * Since we blocked locking the vp, make sure it is still
		 * a candidate for reclamation.  That is, it has not already
		 * been reclaimed and only has our VX reference associated
		 * with it.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    (vp->v_flag & VRECLAIMED) ||
		    vp->v_mount != mp ||
		    !vtrytomakegoneable(vp, trigger)	/* critical path opt */
		) {
			if (vp->v_mount == mp)
				vmovevnodetoend(mp, vp);
			--count;
			vx_put(vp);
			continue;
		}

		/*
		 * All right, we are good, move the vp to the end of the
		 * mountlist and clean it out.  The vget will have returned
		 * an error if the vnode was destroyed (VRECLAIMED set), so we
		 * do not have to check again.  The vput() will move the
		 * vnode to the free list if the vgone() was successful.
		 */
		KKASSERT(vp->v_mount == mp);
		vmovevnodetoend(mp, vp);
		vgone(vp);
		vx_put(vp);
		++done;
		--count;
	}
	lwkt_reltoken(&ilock);
	return (done);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of file system code has some
 * interesting deadlock problems.
 */
static struct thread *vnlruthread;
static int vnlruproc_sig;

void
vnlru_proc_wait(void)
{
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
		wakeup(vnlruthread);
	}
	tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
}

static void
vnlru_proc(void)
{
	struct thread *td = curthread;
	int done;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
			      SHUTDOWN_PRI_FIRST);

	crit_enter();
	for (;;) {
		kproc_suspend_loop();
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(td, 0, "vlruwt", hz);
			continue;
		}
		cache_cleanneg(0);
		done = mountlist_scan(vlrureclaim, NULL, MNTSCAN_FORWARD);

		/*
		 * The vlrureclaim() call only processes 1/10 of the vnodes
		 * on each mount.  If we couldn't find any repeat the loop
		 * at least enough times to cover all available vnodes before
		 * we start sleeping.  Complain if the failure extends past
		 * 30 seconds, and every 30 seconds thereafter.
		 */
		if (done == 0) {
			++vnlru_nowhere;
			if (vnlru_nowhere % 10 == 0)
				tsleep(td, 0, "vlrup", hz * 3);
			if (vnlru_nowhere % 100 == 0)
				kprintf("vnlru_proc: vnode recycler stopped working!\n");
			if (vnlru_nowhere == 1000)
				vnlru_nowhere = 900;
		} else {
			vnlru_nowhere = 0;
		}
	}
	crit_exit();
}

/*
 * MOUNTLIST FUNCTIONS
 */

/*
 * mountlist_insert (MP SAFE)
 *
 * Add a new mount point to the mount list.
 */
void
mountlist_insert(struct mount *mp, int how)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	if (how == MNTINS_FIRST)
		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
	else
		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	lwkt_reltoken(&ilock);
}

/*
 * mountlist_interlock (MP SAFE)
 *
 * Execute the specified interlock function with the mountlist token
 * held.  The function will be called in a serialized fashion versus
 * other functions called through this mechanism.
 */
int
mountlist_interlock(int (*callback)(struct mount *), struct mount *mp)
{
	lwkt_tokref ilock;
	int error;

	lwkt_gettoken(&ilock, &mountlist_token);
	error = callback(mp);
	lwkt_reltoken(&ilock);
	return (error);
}

/*
 * mountlist_boot_getfirst (DURING BOOT ONLY)
 *
 * This function returns the first mount on the mountlist, which is
 * expected to be the root mount.  Since no interlocks are obtained
 * this function is only safe to use during booting.
 */
struct mount *
mountlist_boot_getfirst(void)
{
	return(TAILQ_FIRST(&mountlist));
}

/*
 * mountlist_remove (MP SAFE)
 *
 * Remove a node from the mountlist.  If this node is the next scan node
 * for any active mountlist scans, the active mountlist scan will be
 * adjusted to skip the node, thus allowing removals during mountlist
 * scans.
 */
void
mountlist_remove(struct mount *mp)
{
	struct mountscan_info *msi;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(msi, &mountscan_list, msi_entry) {
		if (msi->msi_node == mp) {
			if (msi->msi_how & MNTSCAN_FORWARD)
				msi->msi_node = TAILQ_NEXT(mp, mnt_list);
			else
				msi->msi_node = TAILQ_PREV(mp, mntlist, mnt_list);
		}
	}
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	lwkt_reltoken(&ilock);
}

/*
 * mountlist_scan (MP SAFE)
 *
 * Safely scan the mount points on the mount list.  Unless otherwise
 * specified each mount point will be busied prior to the callback and
 * unbusied afterwards.  The callback may safely remove any mount point
 * without interfering with the scan.  If the current callback
 * mount is removed the scanner will not attempt to unbusy it.
 *
 * If a mount node cannot be busied it is silently skipped.
 *
 * The callback return value is aggregated and a total is returned.  A return
 * value of < 0 is not aggregated and will terminate the scan.
 *
 * MNTSCAN_FORWARD	- the mountlist is scanned in the forward direction
 * MNTSCAN_REVERSE	- the mountlist is scanned in reverse
 * MNTSCAN_NOBUSY	- the scanner will make the callback without busying
 *			  the mount node.
 */
int
mountlist_scan(int (*callback)(struct mount *, void *), void *data, int how)
{
	struct mountscan_info info;
	lwkt_tokref ilock;
	struct mount *mp;
	thread_t td;
	int count;
	int res;

	lwkt_gettoken(&ilock, &mountlist_token);

	info.msi_how = how;
	info.msi_node = NULL;	/* paranoia */
	TAILQ_INSERT_TAIL(&mountscan_list, &info, msi_entry);

	res = 0;
	td = curthread;

	if (how & MNTSCAN_FORWARD) {
		info.msi_node = TAILQ_FIRST(&mountlist);
		while ((mp = info.msi_node) != NULL) {
			if (how & MNTSCAN_NOBUSY) {
				count = callback(mp, data);
			} else if (vfs_busy(mp, LK_NOWAIT) == 0) {
				count = callback(mp, data);
				if (mp == info.msi_node)
					vfs_unbusy(mp);
			} else {
				count = 0;
			}
			if (count < 0)
				break;
			res += count;
			if (mp == info.msi_node)
				info.msi_node = TAILQ_NEXT(mp, mnt_list);
		}
	} else if (how & MNTSCAN_REVERSE) {
		info.msi_node = TAILQ_LAST(&mountlist, mntlist);
		while ((mp = info.msi_node) != NULL) {
			if (how & MNTSCAN_NOBUSY) {
				count = callback(mp, data);
			} else if (vfs_busy(mp, LK_NOWAIT) == 0) {
				count = callback(mp, data);
				if (mp == info.msi_node)
					vfs_unbusy(mp);
			} else {
				count = 0;
			}
			if (count < 0)
				break;
			res += count;
			if (mp == info.msi_node)
				info.msi_node = TAILQ_PREV(mp, mntlist, mnt_list);
		}
	}
	TAILQ_REMOVE(&mountscan_list, &info, msi_entry);
	lwkt_reltoken(&ilock);
	return(res);
}
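
/*
 * Illustrative, non-compiled sketch of a mountlist_scan() caller: the
 * hypothetical callback below counts the mounts it is called on.  It
 * runs with the mount busied, and its return values are summed into
 * the scan's aggregate result.
 */
#if 0
static int
count_mounts_callback(struct mount *mp, void *data)
{
	int *countp = data;

	++*countp;
	return(0);	/* >= 0 is aggregated; < 0 aborts the scan */
}

/* ... in the caller ... */
	count = 0;
	mountlist_scan(count_mounts_callback, &count, MNTSCAN_FORWARD);
#endif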

/*
 * MOUNT RELATED VNODE FUNCTIONS
 */

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruthread
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(struct vnode *vp, struct mount *mp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mntvnode_token);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
			("bad mount point vnode list size"));
		vremovevnodemnt(vp);
		vp->v_mount->mnt_nvnodelistsize--;
	}
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		lwkt_reltoken(&ilock);
		return;
	}
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	lwkt_reltoken(&ilock);
}

/*
 * Scan the vnodes under a mount point and issue appropriate callbacks.
 *
 * The fastfunc() callback is called with just mntvnode_token held
 * (no vnode lock).  It may not block and the vnode may be undergoing
 * modifications while the caller is processing it.  The vnode will
 * not be entirely destroyed, however, due to the fact that the token
 * is held.  A return value < 0 skips to the next vnode without calling
 * the slowfunc(); a return value > 0 terminates the loop.
 *
 * The slowfunc() callback is called after the vnode has been successfully
 * locked based on passed flags.  The vnode is skipped if it gets rearranged
 * or destroyed while blocking on the lock.  A non-zero return value from
 * the slow function terminates the loop.  The slow function is allowed to
 * arbitrarily block.  The scanning code guarantees consistency of operation
 * even if the slow function deletes or moves the node, or blocks and some
 * other thread deletes or moves the node.
 */
int
vmntvnodescan(
    struct mount *mp,
    int flags,
    int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
    int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data),
    void *data
) {
	struct vmntvnodescan_info info;
	lwkt_tokref ilock;
	struct vnode *vp;
	int r = 0;
	int maxcount = 1000000;

	lwkt_gettoken(&ilock, &mntvnode_token);

	info.vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	TAILQ_INSERT_TAIL(&mntvnodescan_list, &info, entry);
	while ((vp = info.vp) != NULL) {
		if (--maxcount == 0)
			panic("maxcount reached during vmntvnodescan");

		if (vp->v_type == VNON)		/* visible but not ready */
			goto next;
		KKASSERT(vp->v_mount == mp);

		/*
		 * Quick test.  A negative return continues the loop without
		 * calling the slow test.  0 continues onto the slow test.
		 * A positive number aborts the loop.
		 */
		if (fastfunc) {
			if ((r = fastfunc(mp, vp, data)) < 0)
				goto next;
			if (r)
				break;
		}

		/*
		 * Get a vxlock on the vnode, retry if it has moved or isn't
		 * in the mountlist where we expect it.
		 */
		if (slowfunc) {
			int error;

			switch(flags) {
			case VMSC_GETVP:
				error = vget(vp, LK_EXCLUSIVE);
				break;
			case VMSC_GETVP|VMSC_NOWAIT:
				error = vget(vp, LK_EXCLUSIVE|LK_NOWAIT);
				break;
			case VMSC_GETVX:
				vx_get(vp);
				error = 0;
				break;
			default:
				error = 0;
				break;
			}
			if (error)
				goto next;
			/*
			 * Do not call the slow function if the vnode is
			 * invalid or if it was ripped out from under us
			 * while we (potentially) blocked.
			 */
			if (info.vp == vp && vp->v_type != VNON)
				r = slowfunc(mp, vp, data);

			/*
			 * Cleanup
			 */
			switch(flags) {
			case VMSC_GETVP:
			case VMSC_GETVP|VMSC_NOWAIT:
				vput(vp);
				break;
			case VMSC_GETVX:
				vx_put(vp);
				break;
			default:
				break;
			}
			if (r != 0)
				break;
		}

		/*
		 * Iterate.  If the vnode was ripped out from under us
		 * info.vp will already point to the next vnode, otherwise
		 * we have to obtain the next valid vnode ourselves.
		 */
next:
		if (info.vp == vp)
			info.vp = TAILQ_NEXT(vp, v_nmntvnodes);
	}
	TAILQ_REMOVE(&mntvnodescan_list, &info, entry);
	lwkt_reltoken(&ilock);
	return(r);
}
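
/*
 * vflush() below is an in-file example of the pattern: it passes
 * VMSC_GETVX with no fastfunc, so vflush_scan() is invoked on each
 * vnode only after the scanner has VX locked it.
 */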

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem.  The root vnode is considered busy if its
 * v_usecount exceeds this value.  On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

static int vflush_scan(struct mount *mp, struct vnode *vp, void *data);

struct vflush_info {
	int flags;
	int busy;
	thread_t td;
};

int
vflush(struct mount *mp, int rootrefs, int flags)
{
	struct thread *td = curthread;	/* XXX */
	struct vnode *rootvp = NULL;
	int error;
	struct vflush_info vflush_info;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
			("vflush: bad args"));
		/*
		 * Get the filesystem root vnode.  We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}

	vflush_info.busy = 0;
	vflush_info.flags = flags;
	vflush_info.td = td;
	vmntvnodescan(mp, VMSC_GETVX, NULL, vflush_scan, &vflush_info);

	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
			vx_lock(rootvp);
			vgone(rootvp);
			vx_unlock(rootvp);
			vflush_info.busy = 0;
		}
	}
	if (vflush_info.busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}

/*
 * The scan callback is made with a VX locked vnode.
 */
static int
vflush_scan(struct mount *mp, struct vnode *vp, void *data)
{
	struct vflush_info *info = data;
	struct vattr vattr;

	/*
	 * Skip over vnodes marked VSYSTEM.
	 */
	if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
		return(0);
	}

	/*
	 * If WRITECLOSE is set, flush out unlinked but still open
	 * files (even if open only for reading) and regular file
	 * vnodes open for writing.
	 */
	if ((info->flags & WRITECLOSE) &&
	    (vp->v_type == VNON ||
	    (VOP_GETATTR(vp, &vattr) == 0 &&
	     vattr.va_nlink > 0)) &&
	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
		return(0);
	}

	/*
	 * If v_usecount is 1 our VX reference is the only remaining
	 * reference, so all we need to do is clear out the vnode data
	 * structures and we are done.
	 */
	if (vp->v_usecount == 1) {
		vgone(vp);
		return(0);
	}

	/*
	 * If FORCECLOSE is set, forcibly close the vnode.  For block
	 * or character devices, revert to an anonymous device.  For
	 * all other files, just kill them.
	 */
	if (info->flags & FORCECLOSE) {
		if (vp->v_type != VBLK && vp->v_type != VCHR) {
			vgone(vp);
		} else {
			spin_lock_wr(&vp->v_spinlock);
			vclean_interlocked(vp, 0);
			/* spinlock unlocked */
			vp->v_ops = &spec_vnode_vops_p;
			insmntque(vp, NULL);
		}
		return(0);
	}
#ifdef DIAGNOSTIC
	if (busyprt)
		vprint("vflush: busy vnode", vp);
#endif
	++info->busy;
	return(0);
}