gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	39	* $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
	40	* $DragonFly: src/sys/kern/vfs_subr.c,v 1.43 2004/10/12 19:20:46 dillon Exp $
	41	*/
	42
	43	/*
	44	* External virtual filesystem routines
	45	*/
	46	#include "opt_ddb.h"
	47
	48	#include <sys/param.h>
	49	#include <sys/systm.h>
	50	#include <sys/buf.h>
	51	#include <sys/conf.h>
	52	#include <sys/dirent.h>
	53	#include <sys/domain.h>
	54	#include <sys/eventhandler.h>
	55	#include <sys/fcntl.h>
	56	#include <sys/kernel.h>
	57	#include <sys/kthread.h>
	58	#include <sys/malloc.h>
	59	#include <sys/mbuf.h>
	60	#include <sys/mount.h>
	61	#include <sys/proc.h>
	62	#include <sys/namei.h>
	63	#include <sys/reboot.h>
	64	#include <sys/socket.h>
	65	#include <sys/stat.h>
	66	#include <sys/sysctl.h>
	67	#include <sys/syslog.h>
	68	#include <sys/vmmeter.h>
	69	#include <sys/vnode.h>
	70
	71	#include <machine/limits.h>
	72
	73	#include <vm/vm.h>
	74	#include <vm/vm_object.h>
	75	#include <vm/vm_extern.h>
	76	#include <vm/vm_kern.h>
	77	#include <vm/pmap.h>
	78	#include <vm/vm_map.h>
	79	#include <vm/vm_page.h>
	80	#include <vm/vm_pager.h>
	81	#include <vm/vnode_pager.h>
	82	#include <vm/vm_zone.h>
	83
	84	#include <sys/buf2.h>
	85	#include <sys/thread2.h>
	86
	87	static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
	88
	89	int numvnodes;
	90	SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
	91
	92	enum vtype iftovt_tab[16] = {
	93	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	94	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
	95	};
	96	int vttoif_tab[9] = {
	97	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	98	S_IFSOCK, S_IFIFO, S_IFMT,
	99	};
	100
	101	static int reassignbufcalls;
	102	SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	103	&reassignbufcalls, 0, "");
	104	static int reassignbufloops;
	105	SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	106	&reassignbufloops, 0, "");
	107	static int reassignbufsortgood;
	108	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	109	&reassignbufsortgood, 0, "");
	110	static int reassignbufsortbad;
	111	SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	112	&reassignbufsortbad, 0, "");
	113	static int reassignbufmethod = 1;
	114	SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	115	&reassignbufmethod, 0, "");
	116
	117	#ifdef ENABLE_VFS_IOOPT
	118	int vfs_ioopt = 0;
	119	SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
	120	#endif
	121
	122	int nfs_mount_type = -1;
	123	static struct lwkt_token spechash_token;
	124	struct nfs_public nfs_pub; /* publicly exported FS */
	125
	126	int desiredvnodes;
	127	SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	128	&desiredvnodes, 0, "Maximum number of vnodes");
	129
	130	static void vfs_free_addrlist (struct netexport *nep);
	131	static int vfs_free_netcred (struct radix_node rn, void w);
	132	static int vfs_hang_addrlist (struct mount mp, struct netexport nep,
	133	struct export_args *argp);
	134
	135	extern int dev_ref_debug;
	136	extern struct vnodeopv_entry_desc spec_vnodeop_entries[];
	137
	138	/*
	139	* Return 0 if the vnode is already on the free list or cannot be placed
	140	* on the free list. Return 1 if the vnode can be placed on the free list.
	141	*/
	142	static __inline int
	143	vshouldfree(struct vnode *vp, int usecount)
	144	{
	145	if (vp->v_flag & VFREE)
	146	return (0); /* already free */
	147	if (vp->v_holdcnt != 0 \|\| vp->v_usecount != usecount)
	148	return (0); /* other holderse */
	149	if (vp->v_object &&
	150	(vp->v_object->ref_count \|\| vp->v_object->resident_page_count)) {
	151	return (0);
	152	}
	153	return (1);
	154	}
	155
	156	/*
	157	* Initialize the vnode management data structures.
	158	*
	159	* Called from vfsinit()
	160	*/
	161	void
	162	vfs_subr_init(void)
	163	{
	164	/*
	165	* Desired vnodes is a result of the physical page count
	166	* and the size of kernel's heap. It scales in proportion
	167	* to the amount of available physical memory. This can
	168	* cause trouble on 64-bit and large memory platforms.
	169	*/
	170	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	171	desiredvnodes =
	172	min(maxproc + vmstats.v_page_count /4,
	173	2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
	174	(5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	175
	176	lwkt_token_init(&spechash_token);
	177	}
	178
	179	/*
	180	* Knob to control the precision of file timestamps:
	181	*
	182	* 0 = seconds only; nanoseconds zeroed.
	183	* 1 = seconds and nanoseconds, accurate within 1/HZ.
	184	* 2 = seconds and nanoseconds, truncated to microseconds.
	185	* >=3 = seconds and nanoseconds, maximum precision.
	186	*/
	187	enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
	188
	189	static int timestamp_precision = TSP_SEC;
	190	SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	191	&timestamp_precision, 0, "");
	192
	193	/*
	194	* Get a current timestamp.
	195	*/
	196	void
	197	vfs_timestamp(struct timespec *tsp)
	198	{
	199	struct timeval tv;
	200
	201	switch (timestamp_precision) {
	202	case TSP_SEC:
	203	tsp->tv_sec = time_second;
	204	tsp->tv_nsec = 0;
	205	break;
	206	case TSP_HZ:
	207	getnanotime(tsp);
	208	break;
	209	case TSP_USEC:
	210	microtime(&tv);
	211	TIMEVAL_TO_TIMESPEC(&tv, tsp);
	212	break;
	213	case TSP_NSEC:
	214	default:
	215	nanotime(tsp);
	216	break;
	217	}
	218	}
	219
	220	/*
	221	* Set vnode attributes to VNOVAL
	222	*/
	223	void
	224	vattr_null(struct vattr *vap)
	225	{
	226	vap->va_type = VNON;
	227	vap->va_size = VNOVAL;
	228	vap->va_bytes = VNOVAL;
	229	vap->va_mode = VNOVAL;
	230	vap->va_nlink = VNOVAL;
	231	vap->va_uid = VNOVAL;
	232	vap->va_gid = VNOVAL;
	233	vap->va_fsid = VNOVAL;
	234	vap->va_fileid = VNOVAL;
	235	vap->va_blocksize = VNOVAL;
	236	vap->va_rdev = VNOVAL;
	237	vap->va_atime.tv_sec = VNOVAL;
	238	vap->va_atime.tv_nsec = VNOVAL;
	239	vap->va_mtime.tv_sec = VNOVAL;
	240	vap->va_mtime.tv_nsec = VNOVAL;
	241	vap->va_ctime.tv_sec = VNOVAL;
	242	vap->va_ctime.tv_nsec = VNOVAL;
	243	vap->va_flags = VNOVAL;
	244	vap->va_gen = VNOVAL;
	245	vap->va_vaflags = 0;
	246	}
	247
	248	/*
	249	* Update outstanding I/O count and do wakeup if requested.
	250	*/
	251	void
	252	vwakeup(struct buf *bp)
	253	{
	254	struct vnode *vp;
	255
	256	bp->b_flags &= ~B_WRITEINPROG;
	257	if ((vp = bp->b_vp)) {
	258	vp->v_numoutput--;
	259	if (vp->v_numoutput < 0)
	260	panic("vwakeup: neg numoutput");
	261	if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
	262	vp->v_flag &= ~VBWAIT;
	263	wakeup((caddr_t) &vp->v_numoutput);
	264	}
	265	}
	266	}
	267
	268	/*
	269	* Flush out and invalidate all buffers associated with a vnode.
	270	*
	271	* vp must be locked.
	272	*/
	273	int
	274	vinvalbuf(struct vnode vp, int flags, struct thread td,
	275	int slpflag, int slptimeo)
	276	{
	277	struct buf *bp;
	278	struct buf nbp, blist;
	279	int s, error;
	280	vm_object_t object;
	281
	282	if (flags & V_SAVE) {
	283	s = splbio();
	284	while (vp->v_numoutput) {
	285	vp->v_flag \|= VBWAIT;
	286	error = tsleep((caddr_t)&vp->v_numoutput,
	287	slpflag, "vinvlbuf", slptimeo);
	288	if (error) {
	289	splx(s);
	290	return (error);
	291	}
	292	}
	293	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	294	splx(s);
	295	if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
	296	return (error);
	297	s = splbio();
	298	if (vp->v_numoutput > 0 \|\|
	299	!TAILQ_EMPTY(&vp->v_dirtyblkhd))
	300	panic("vinvalbuf: dirty bufs");
	301	}
	302	splx(s);
	303	}
	304	s = splbio();
	305	for (;;) {
	306	blist = TAILQ_FIRST(&vp->v_cleanblkhd);
	307	if (!blist)
	308	blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
	309	if (!blist)
	310	break;
	311
	312	for (bp = blist; bp; bp = nbp) {
	313	nbp = TAILQ_NEXT(bp, b_vnbufs);
	314	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	315	error = BUF_TIMELOCK(bp,
	316	LK_EXCLUSIVE \| LK_SLEEPFAIL,
	317	"vinvalbuf", slpflag, slptimeo);
	318	if (error == ENOLCK)
	319	break;
	320	splx(s);
	321	return (error);
	322	}
	323	/*
	324	* XXX Since there are no node locks for NFS, I
	325	* believe there is a slight chance that a delayed
	326	* write will occur while sleeping just above, so
	327	* check for it. Note that vfs_bio_awrite expects
	328	* buffers to reside on a queue, while VOP_BWRITE and
	329	* brelse do not.
	330	*/
	331	if (((bp->b_flags & (B_DELWRI \| B_INVAL)) == B_DELWRI) &&
	332	(flags & V_SAVE)) {
	333
	334	if (bp->b_vp == vp) {
	335	if (bp->b_flags & B_CLUSTEROK) {
	336	BUF_UNLOCK(bp);
	337	vfs_bio_awrite(bp);
	338	} else {
	339	bremfree(bp);
	340	bp->b_flags \|= B_ASYNC;
	341	VOP_BWRITE(bp->b_vp, bp);
	342	}
	343	} else {
	344	bremfree(bp);
	345	(void) VOP_BWRITE(bp->b_vp, bp);
	346	}
	347	break;
	348	}
	349	bremfree(bp);
	350	bp->b_flags \|= (B_INVAL \| B_NOCACHE \| B_RELBUF);
	351	bp->b_flags &= ~B_ASYNC;
	352	brelse(bp);
	353	}
	354	}
	355
	356	/*
	357	* Wait for I/O to complete. XXX needs cleaning up. The vnode can
	358	* have write I/O in-progress but if there is a VM object then the
	359	* VM object can also have read-I/O in-progress.
	360	*/
	361	do {
	362	while (vp->v_numoutput > 0) {
	363	vp->v_flag \|= VBWAIT;
	364	tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
	365	}
	366	if (VOP_GETVOBJECT(vp, &object) == 0) {
	367	while (object->paging_in_progress)
	368	vm_object_pip_sleep(object, "vnvlbx");
	369	}
	370	} while (vp->v_numoutput > 0);
	371
	372	splx(s);
	373
	374	/*
	375	* Destroy the copy in the VM cache, too.
	376	*/
	377	if (VOP_GETVOBJECT(vp, &object) == 0) {
	378	vm_object_page_remove(object, 0, 0,
	379	(flags & V_SAVE) ? TRUE : FALSE);
	380	}
	381
	382	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) \|\| !TAILQ_EMPTY(&vp->v_cleanblkhd))
	383	panic("vinvalbuf: flush failed");
	384	return (0);
	385	}
	386
	387	/*
	388	* Truncate a file's buffer and pages to a specified length. This
	389	* is in lieu of the old vinvalbuf mechanism, which performed unneeded
	390	* sync activity.
	391	*
	392	* The vnode must be locked.
	393	*/
	394	int
	395	vtruncbuf(struct vnode vp, struct thread td, off_t length, int blksize)
	396	{
	397	struct buf *bp;
	398	struct buf *nbp;
	399	int s, anyfreed;
	400	int trunclbn;
	401
	402	/*
	403	* Round up to the next lbn.
	404	*/
	405	trunclbn = (length + blksize - 1) / blksize;
	406
	407	s = splbio();
	408	restart:
	409	anyfreed = 1;
	410	for (;anyfreed;) {
	411	anyfreed = 0;
	412	for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
	413	nbp = TAILQ_NEXT(bp, b_vnbufs);
	414	if (bp->b_lblkno >= trunclbn) {
	415	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	416	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	417	goto restart;
	418	} else {
	419	bremfree(bp);
	420	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	421	bp->b_flags &= ~B_ASYNC;
	422	brelse(bp);
	423	anyfreed = 1;
	424	}
	425	if (nbp &&
	426	(((nbp->b_xflags & BX_VNCLEAN) == 0) \|\|
	427	(nbp->b_vp != vp) \|\|
	428	(nbp->b_flags & B_DELWRI))) {
	429	goto restart;
	430	}
	431	}
	432	}
	433
	434	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	435	nbp = TAILQ_NEXT(bp, b_vnbufs);
	436	if (bp->b_lblkno >= trunclbn) {
	437	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	438	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	439	goto restart;
	440	} else {
	441	bremfree(bp);
	442	bp->b_flags \|= (B_INVAL \| B_RELBUF);
	443	bp->b_flags &= ~B_ASYNC;
	444	brelse(bp);
	445	anyfreed = 1;
	446	}
	447	if (nbp &&
	448	(((nbp->b_xflags & BX_VNDIRTY) == 0) \|\|
	449	(nbp->b_vp != vp) \|\|
	450	(nbp->b_flags & B_DELWRI) == 0)) {
	451	goto restart;
	452	}
	453	}
	454	}
	455	}
	456
	457	if (length > 0) {
	458	restartsync:
	459	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
	460	nbp = TAILQ_NEXT(bp, b_vnbufs);
	461	if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
	462	if (BUF_LOCK(bp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	463	BUF_LOCK(bp, LK_EXCLUSIVE\|LK_SLEEPFAIL);
	464	goto restart;
	465	} else {
	466	bremfree(bp);
	467	if (bp->b_vp == vp) {
	468	bp->b_flags \|= B_ASYNC;
	469	} else {
	470	bp->b_flags &= ~B_ASYNC;
	471	}
	472	VOP_BWRITE(bp->b_vp, bp);
	473	}
	474	goto restartsync;
	475	}
	476
	477	}
	478	}
	479
	480	while (vp->v_numoutput > 0) {
	481	vp->v_flag \|= VBWAIT;
	482	tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	483	}
	484
	485	splx(s);
	486
	487	vnode_pager_setsize(vp, length);
	488
	489	return (0);
	490	}
	491
	492	/*
	493	* Associate a buffer with a vnode.
	494	*/
	495	void
	496	bgetvp(struct vnode vp, struct buf bp)
	497	{
	498	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	499
	500	vhold(vp);
	501	bp->b_vp = vp;
	502	bp->b_dev = vn_todev(vp);
	503	/*
	504	* Insert onto list for new vnode.
	505	*/
	506	crit_enter();
	507	bp->b_xflags \|= BX_VNCLEAN;
	508	bp->b_xflags &= ~BX_VNDIRTY;
	509	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	510	crit_exit();
	511	}
	512
	513	/*
	514	* Disassociate a buffer from a vnode.
	515	*/
	516	void
	517	brelvp(struct buf *bp)
	518	{
	519	struct vnode *vp;
	520	struct buflists *listheadp;
	521
	522	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
	523
	524	/*
	525	* Delete from old vnode list, if on one.
	526	*/
	527	vp = bp->b_vp;
	528	crit_enter();
	529	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	530	if (bp->b_xflags & BX_VNDIRTY)
	531	listheadp = &vp->v_dirtyblkhd;
	532	else
	533	listheadp = &vp->v_cleanblkhd;
	534	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	535	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	536	}
	537	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
	538	vp->v_flag &= ~VONWORKLST;
	539	LIST_REMOVE(vp, v_synclist);
	540	}
	541	crit_exit();
	542	bp->b_vp = NULL;
	543	vdrop(vp);
	544	}
	545
	546	/*
	547	* Associate a p-buffer with a vnode.
	548	*
	549	* Also sets B_PAGING flag to indicate that vnode is not fully associated
	550	* with the buffer. i.e. the bp has not been linked into the vnode or
	551	* ref-counted.
	552	*/
	553	void
	554	pbgetvp(struct vnode vp, struct buf bp)
	555	{
	556	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
	557
	558	bp->b_vp = vp;
	559	bp->b_flags \|= B_PAGING;
	560	bp->b_dev = vn_todev(vp);
	561	}
	562
	563	/*
	564	* Disassociate a p-buffer from a vnode.
	565	*/
	566	void
	567	pbrelvp(struct buf *bp)
	568	{
	569	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
	570
	571	/* XXX REMOVE ME */
	572	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
	573	panic(
	574	"relpbuf(): b_vp was probably reassignbuf()d %p %x",
	575	bp,
	576	(int)bp->b_flags
	577	);
	578	}
	579	bp->b_vp = (struct vnode *) 0;
	580	bp->b_flags &= ~B_PAGING;
	581	}
	582
	583	void
	584	pbreassignbuf(struct buf bp, struct vnode newvp)
	585	{
	586	if ((bp->b_flags & B_PAGING) == 0) {
	587	panic(
	588	"pbreassignbuf() on non phys bp %p",
	589	bp
	590	);
	591	}
	592	bp->b_vp = newvp;
	593	}
	594
	595	/*
	596	* Reassign a buffer from one vnode to another.
	597	* Used to assign file specific control information
	598	* (indirect blocks) to the vnode to which they belong.
	599	*/
	600	void
	601	reassignbuf(struct buf bp, struct vnode newvp)
	602	{
	603	struct buflists *listheadp;
	604	int delay;
	605
	606	if (newvp == NULL) {
	607	printf("reassignbuf: NULL");
	608	return;
	609	}
	610	++reassignbufcalls;
	611
	612	/*
	613	* B_PAGING flagged buffers cannot be reassigned because their vp
	614	* is not fully linked in.
	615	*/
	616	if (bp->b_flags & B_PAGING)
	617	panic("cannot reassign paging buffer");
	618
	619	crit_enter();
	620	/*
	621	* Delete from old vnode list, if on one.
	622	*/
	623	if (bp->b_xflags & (BX_VNDIRTY \| BX_VNCLEAN)) {
	624	if (bp->b_xflags & BX_VNDIRTY)
	625	listheadp = &bp->b_vp->v_dirtyblkhd;
	626	else
	627	listheadp = &bp->b_vp->v_cleanblkhd;
	628	TAILQ_REMOVE(listheadp, bp, b_vnbufs);
	629	bp->b_xflags &= ~(BX_VNDIRTY \| BX_VNCLEAN);
	630	if (bp->b_vp != newvp) {
	631	vdrop(bp->b_vp);
	632	bp->b_vp = NULL; /* for clarification */
	633	}
	634	}
	635	/*
	636	* If dirty, put on list of dirty buffers; otherwise insert onto list
	637	* of clean buffers.
	638	*/
	639	if (bp->b_flags & B_DELWRI) {
	640	struct buf *tbp;
	641
	642	listheadp = &newvp->v_dirtyblkhd;
	643	if ((newvp->v_flag & VONWORKLST) == 0) {
	644	switch (newvp->v_type) {
	645	case VDIR:
	646	delay = dirdelay;
	647	break;
	648	case VCHR:
	649	case VBLK:
	650	if (newvp->v_rdev &&
	651	newvp->v_rdev->si_mountpoint != NULL) {
	652	delay = metadelay;
	653	break;
	654	}
	655	/* fall through */
	656	default:
	657	delay = filedelay;
	658	}
	659	vn_syncer_add_to_worklist(newvp, delay);
	660	}
	661	bp->b_xflags \|= BX_VNDIRTY;
	662	tbp = TAILQ_FIRST(listheadp);
	663	if (tbp == NULL \|\|
	664	bp->b_lblkno == 0 \|\|
	665	(bp->b_lblkno > 0 && tbp->b_lblkno < 0) \|\|
	666	(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
	667	TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
	668	++reassignbufsortgood;
	669	} else if (bp->b_lblkno < 0) {
	670	TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
	671	++reassignbufsortgood;
	672	} else if (reassignbufmethod == 1) {
	673	/*
	674	* New sorting algorithm, only handle sequential case,
	675	* otherwise append to end (but before metadata)
	676	*/
	677	if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
	678	(tbp->b_xflags & BX_VNDIRTY)) {
	679	/*
	680	* Found the best place to insert the buffer
	681	*/
	682	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	683	++reassignbufsortgood;
	684	} else {
	685	/*
	686	* Missed, append to end, but before meta-data.
	687	* We know that the head buffer in the list is
	688	* not meta-data due to prior conditionals.
	689	*
	690	* Indirect effects: NFS second stage write
	691	* tends to wind up here, giving maximum
	692	* distance between the unstable write and the
	693	* commit rpc.
	694	*/
	695	tbp = TAILQ_LAST(listheadp, buflists);
	696	while (tbp && tbp->b_lblkno < 0)
	697	tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
	698	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	699	++reassignbufsortbad;
	700	}
	701	} else {
	702	/*
	703	* Old sorting algorithm, scan queue and insert
	704	*/
	705	struct buf *ttbp;
	706	while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
	707	(ttbp->b_lblkno < bp->b_lblkno)) {
	708	++reassignbufloops;
	709	tbp = ttbp;
	710	}
	711	TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
	712	}
	713	} else {
	714	bp->b_xflags \|= BX_VNCLEAN;
	715	TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
	716	if ((newvp->v_flag & VONWORKLST) &&
	717	TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
	718	newvp->v_flag &= ~VONWORKLST;
	719	LIST_REMOVE(newvp, v_synclist);
	720	}
	721	}
	722	if (bp->b_vp != newvp) {
	723	bp->b_vp = newvp;
	724	vhold(bp->b_vp);
	725	}
	726	crit_exit();
	727	}
	728
	729	/*
	730	* Create a vnode for a block device.
	731	* Used for mounting the root file system.
	732	*/
	733	int
	734	bdevvp(dev_t dev, struct vnode **vpp)
	735	{
	736	struct vnode *vp;
	737	struct vnode *nvp;
	738	int error;
	739
	740	if (dev == NODEV) {
	741	*vpp = NULLVP;
	742	return (ENXIO);
	743	}
	744	error = getnewvnode(VT_NON, NULL, spec_vnode_vops, &nvp, 0, 0);
	745	if (error) {
	746	*vpp = NULLVP;
	747	return (error);
	748	}
	749	vp = nvp;
	750	vp->v_type = VCHR;
	751	vp->v_udev = dev->si_udev;
	752	vx_unlock(vp);
	753	*vpp = vp;
	754	return (0);
	755	}
	756
	757	int
	758	v_associate_rdev(struct vnode *vp, dev_t dev)
	759	{
	760	lwkt_tokref ilock;
	761
	762	if (dev == NULL \|\| dev == NODEV)
	763	return(ENXIO);
	764	if (dev_is_good(dev) == 0)
	765	return(ENXIO);
	766	KKASSERT(vp->v_rdev == NULL);
	767	if (dev_ref_debug)
	768	printf("Z1");
	769	vp->v_rdev = reference_dev(dev);
	770	lwkt_gettoken(&ilock, &spechash_token);
	771	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
	772	lwkt_reltoken(&ilock);
	773	return(0);
	774	}
	775
	776	void
	777	v_release_rdev(struct vnode *vp)
	778	{
	779	lwkt_tokref ilock;
	780	dev_t dev;
	781
	782	if ((dev = vp->v_rdev) != NULL) {
	783	lwkt_gettoken(&ilock, &spechash_token);
	784	SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
	785	if (dev_ref_debug && vp->v_opencount != 0) {
	786	printf("releasing rdev with non-0 "
	787	"v_opencount(%d) (revoked?)\n",
	788	vp->v_opencount);
	789	}
	790	vp->v_rdev = NULL;
	791	vp->v_opencount = 0;
	792	release_dev(dev);
	793	lwkt_reltoken(&ilock);
	794	}
	795	}
	796
	797	/*
	798	* Add a vnode to the alias list hung off the dev_t. We only associate
	799	* the device number with the vnode. The actual device is not associated
	800	* until the vnode is opened (usually in spec_open()), and will be
	801	* disassociated on last close.
	802	*/
	803	void
	804	addaliasu(struct vnode *nvp, udev_t nvp_udev)
	805	{
	806	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
	807	panic("addaliasu on non-special vnode");
	808	nvp->v_udev = nvp_udev;
	809	}
	810
	811	/*
	812	* Disassociate a vnode from its underlying filesystem.
	813	*
	814	* The vnode must be VX locked and refd
	815	*
	816	* If there are v_usecount references to the vnode other then ours we have
	817	* to VOP_CLOSE the vnode before we can deactivate and reclaim it.
	818	*/
	819	void
	820	vclean(struct vnode vp, int flags, struct thread td)
	821	{
	822	int active;
	823
	824	/*
	825	* If the vnode has already been reclaimed we have nothing to do.
	826	*/
	827	if (vp->v_flag & VRECLAIMED)
	828	return;
	829	vp->v_flag \|= VRECLAIMED;
	830
	831	/*
	832	* Scrap the vfs cache
	833	*/
	834	cache_inval_vp(vp, CINV_SELF);
	835
	836	/*
	837	* Check to see if the vnode is in use. If so we have to reference it
	838	* before we clean it out so that its count cannot fall to zero and
	839	* generate a race against ourselves to recycle it.
	840	*/
	841	active = (vp->v_usecount > 1);
	842
	843	/*
	844	* Clean out any buffers associated with the vnode and destroy its
	845	* object, if it has one.
	846	*/
	847	vinvalbuf(vp, V_SAVE, td, 0, 0);
	848	VOP_DESTROYVOBJECT(vp);
	849
	850	/*
	851	* If purging an active vnode, it must be closed and
	852	* deactivated before being reclaimed. XXX
	853	*
	854	* Note that neither of these routines unlocks the vnode.
	855	*/
	856	if (active) {
	857	if (flags & DOCLOSE)
	858	VOP_CLOSE(vp, FNONBLOCK, td);
	859	}
	860
	861	/*
	862	* If the vnode has not be deactivated, deactivated it.
	863	*/
	864	if ((vp->v_flag & VINACTIVE) == 0) {
	865	vp->v_flag \|= VINACTIVE;
	866	VOP_INACTIVE(vp, td);
	867	}
	868
	869	/*
	870	* Reclaim the vnode.
	871	*/
	872	if (VOP_RECLAIM(vp, td))
	873	panic("vclean: cannot reclaim");
	874
	875	/*
	876	* Done with purge, notify sleepers of the grim news.
	877	*/
	878	vp->v_ops = dead_vnode_vops;
	879	vn_pollgone(vp);
	880	vp->v_tag = VT_NON;
	881	}
	882
	883	/*
	884	* Eliminate all activity associated with the requested vnode
	885	* and with all vnodes aliased to the requested vnode.
	886	*
	887	* The vnode must be referenced and vx_lock()'d
	888	*
	889	* revoke { struct vnode *a_vp, int a_flags }
	890	*/
	891	int
	892	vop_stdrevoke(struct vop_revoke_args *ap)
	893	{
	894	struct vnode vp, vq;
	895	lwkt_tokref ilock;
	896	dev_t dev;
	897
	898	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
	899
	900	vp = ap->a_vp;
	901
	902	/*
	903	* If the vnode is already dead don't try to revoke it
	904	*/
	905	if (vp->v_flag & VRECLAIMED)
	906	return (0);
	907
	908	/*
	909	* If the vnode has a device association, scrap all vnodes associated
	910	* with the device. Don't let the device disappear on us while we
	911	* are scrapping the vnodes.
	912	*
	913	* The passed vp will probably show up in the list, do not VX lock
	914	* it twice!
	915	*/
	916	if (vp->v_type != VCHR && vp->v_type != VBLK)
	917	return(0);
	918	if ((dev = vp->v_rdev) == NULL) {
	919	if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
	920	return(0);
	921	}
	922	reference_dev(dev);
	923	lwkt_gettoken(&ilock, &spechash_token);
	924	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
	925	if (vp == vq \|\| vx_get(vq) == 0) {
	926	if (vq == SLIST_FIRST(&dev->si_hlist))
	927	vgone(vq);
	928	if (vp != vq)
	929	vx_put(vq);
	930	}
	931	}
	932	lwkt_reltoken(&ilock);
	933	release_dev(dev);
	934	return (0);
	935	}
	936
	937	/*
	938	* Recycle an unused vnode to the front of the free list.
	939	*
	940	* Returns 1 if we were successfully able to recycle the vnode,
	941	* 0 otherwise.
	942	*/
	943	int
	944	vrecycle(struct vnode vp, struct thread td)
	945	{
	946	if (vp->v_usecount == 1) {
	947	vgone(vp);
	948	return (1);
	949	}
	950	return (0);
	951	}
	952
	953	/*
	954	* Eliminate all activity associated with a vnode in preparation for reuse.
	955	*
	956	* The vnode must be VX locked and will remain VX locked on return. This
	957	* routine may be called with the vnode in any state, as long as it is
	958	* VX locked. The vnode will be cleaned out and marked VRECLAIMED but will
	959	* not actually be reused until all existing refs and holds go away.
	960	*
	961	* NOTE: This routine may be called on a vnode which has not yet been
	962	* already been deactivated (VOP_INACTIVE), or on a vnode which has
	963	* already been reclaimed.
	964	*
	965	* This routine is not responsible for placing us back on the freelist.
	966	* Instead, it happens automatically when the caller releases the VX lock
	967	* (assuming there aren't any other references).
	968	*/
	969	void
	970	vgone(struct vnode *vp)
	971	{
	972	/*
	973	* assert that the VX lock is held. This is an absolute requirement
	974	* now for vgone() to be called.
	975	*/
	976	KKASSERT(vp->v_lock.lk_exclusivecount == 1);
	977
	978	/*
	979	* Clean out the filesystem specific data and set the VRECLAIMED
	980	* bit. Also deactivate the vnode if necessary.
	981	*/
	982	vclean(vp, DOCLOSE, curthread);
	983
	984	/*
	985	* Delete from old mount point vnode list, if on one.
	986	*/
	987	if (vp->v_mount != NULL)
	988	insmntque(vp, NULL);
	989
	990	/*
	991	* If special device, remove it from special device alias list
	992	* if it is on one. This should normally only occur if a vnode is
	993	* being revoked as the device should otherwise have been released
	994	* naturally.
	995	*/
	996	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_rdev != NULL) {
	997	v_release_rdev(vp);
	998	}
	999
	1000	/*
	1001	* Set us to VBAD
	1002	*/
	1003	vp->v_type = VBAD;
	1004	}
	1005
	1006	/*
	1007	* Lookup a vnode by device number.
	1008	*/
	1009	int
	1010	vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
	1011	{
	1012	lwkt_tokref ilock;
	1013	struct vnode *vp;
	1014
	1015	lwkt_gettoken(&ilock, &spechash_token);
	1016	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	1017	if (type == vp->v_type) {
	1018	*vpp = vp;
	1019	lwkt_reltoken(&ilock);
	1020	return (1);
	1021	}
	1022	}
	1023	lwkt_reltoken(&ilock);
	1024	return (0);
	1025	}
	1026
	1027	/*
	1028	* Calculate the total number of references to a special device. This
	1029	* routine may only be called for VBLK and VCHR vnodes since v_rdev is
	1030	* an overloaded field. Since udev2dev can now return NODEV, we have
	1031	* to check for a NULL v_rdev.
	1032	*/
	1033	int
	1034	count_dev(dev_t dev)
	1035	{
	1036	lwkt_tokref ilock;
	1037	struct vnode *vp;
	1038	int count = 0;
	1039
	1040	if (SLIST_FIRST(&dev->si_hlist)) {
	1041	lwkt_gettoken(&ilock, &spechash_token);
	1042	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
	1043	count += vp->v_usecount;
	1044	}
	1045	lwkt_reltoken(&ilock);
	1046	}
	1047	return(count);
	1048	}
	1049
	1050	int
	1051	count_udev(udev_t udev)
	1052	{
	1053	dev_t dev;
	1054
	1055	if ((dev = udev2dev(udev, 0)) == NODEV)
	1056	return(0);
	1057	return(count_dev(dev));
	1058	}
	1059
	1060	int
	1061	vcount(struct vnode *vp)
	1062	{
	1063	if (vp->v_rdev == NULL)
	1064	return(0);
	1065	return(count_dev(vp->v_rdev));
	1066	}
	1067
	1068	/*
	1069	* Print out a description of a vnode.
	1070	*/
	1071	static char *typename[] =
	1072	{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
	1073
	1074	void
	1075	vprint(char label, struct vnode vp)
	1076	{
	1077	char buf[96];
	1078
	1079	if (label != NULL)
	1080	printf("%s: %p: ", label, (void *)vp);
	1081	else
	1082	printf("%p: ", (void *)vp);
	1083	printf("type %s, usecount %d, writecount %d, refcount %d,",
	1084	typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	1085	vp->v_holdcnt);
	1086	buf[0] = '\0';
	1087	if (vp->v_flag & VROOT)
	1088	strcat(buf, "\|VROOT");
	1089	if (vp->v_flag & VTEXT)
	1090	strcat(buf, "\|VTEXT");
	1091	if (vp->v_flag & VSYSTEM)
	1092	strcat(buf, "\|VSYSTEM");
	1093	if (vp->v_flag & VBWAIT)
	1094	strcat(buf, "\|VBWAIT");
	1095	if (vp->v_flag & VFREE)
	1096	strcat(buf, "\|VFREE");
	1097	if (vp->v_flag & VOBJBUF)
	1098	strcat(buf, "\|VOBJBUF");
	1099	if (buf[0] != '\0')
	1100	printf(" flags (%s)", &buf[1]);
	1101	if (vp->v_data == NULL) {
	1102	printf("\n");
	1103	} else {
	1104	printf("\n\t");
	1105	VOP_PRINT(vp);
	1106	}
	1107	}
	1108
	1109	#ifdef DDB
	1110	#include <ddb/ddb.h>
	1111	/*
	1112	* List all of the locked vnodes in the system.
	1113	* Called when debugging the kernel.
	1114	*/
	1115	DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
	1116	{
	1117	struct thread td = curthread; / XXX */
	1118	lwkt_tokref ilock;
	1119	struct mount mp, nmp;
	1120	struct vnode *vp;
	1121
	1122	printf("Locked vnodes\n");
	1123	lwkt_gettoken(&ilock, &mountlist_token);
	1124	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
	1125	if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
	1126	nmp = TAILQ_NEXT(mp, mnt_list);
	1127	continue;
	1128	}
	1129	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
	1130	if (VOP_ISLOCKED(vp, NULL))
	1131	vprint((char *)0, vp);
	1132	}
	1133	lwkt_gettokref(&ilock);
	1134	nmp = TAILQ_NEXT(mp, mnt_list);
	1135	vfs_unbusy(mp, td);
	1136	}
	1137	lwkt_reltoken(&ilock);
	1138	}
	1139	#endif
	1140
	1141	/*
	1142	* Top level filesystem related information gathering.
	1143	*/
	1144	static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
	1145
	1146	static int
	1147	vfs_sysctl(SYSCTL_HANDLER_ARGS)
	1148	{
	1149	int name = (int )arg1 - 1; /* XXX */
	1150	u_int namelen = arg2 + 1; /* XXX */
	1151	struct vfsconf *vfsp;
	1152
	1153	#if 1 \|\| defined(COMPAT_PRELITE2)
	1154	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	1155	if (namelen == 1)
	1156	return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
	1157	#endif
	1158
	1159	#ifdef notyet
	1160	/* all sysctl names at this level are at least name and field */
	1161	if (namelen < 2)
	1162	return (ENOTDIR); /* overloaded */
	1163	if (name[0] != VFS_GENERIC) {
	1164	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	1165	if (vfsp->vfc_typenum == name[0])
	1166	break;
	1167	if (vfsp == NULL)
	1168	return (EOPNOTSUPP);
	1169	return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
	1170	oldp, oldlenp, newp, newlen, p));
	1171	}
	1172	#endif
	1173	switch (name[1]) {
	1174	case VFS_MAXTYPENUM:
	1175	if (namelen != 2)
	1176	return (ENOTDIR);
	1177	return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	1178	case VFS_CONF:
	1179	if (namelen != 3)
	1180	return (ENOTDIR); /* overloaded */
	1181	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
	1182	if (vfsp->vfc_typenum == name[2])
	1183	break;
	1184	if (vfsp == NULL)
	1185	return (EOPNOTSUPP);
	1186	return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	1187	}
	1188	return (EOPNOTSUPP);
	1189	}
	1190
	1191	SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	1192	"Generic filesystem");
	1193
	1194	#if 1 \|\| defined(COMPAT_PRELITE2)
	1195
	1196	static int
	1197	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
	1198	{
	1199	int error;
	1200	struct vfsconf *vfsp;
	1201	struct ovfsconf ovfs;
	1202
	1203	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	1204	ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
	1205	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	1206	ovfs.vfc_index = vfsp->vfc_typenum;
	1207	ovfs.vfc_refcount = vfsp->vfc_refcount;
	1208	ovfs.vfc_flags = vfsp->vfc_flags;
	1209	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	1210	if (error)
	1211	return error;
	1212	}
	1213	return 0;
	1214	}
	1215
	1216	#endif /* 1 \|\| COMPAT_PRELITE2 */
	1217
	1218	#if 0
	1219	#define KINFO_VNODESLOP 10
	1220	/*
	1221	* Dump vnode list (via sysctl).
	1222	* Copyout address of vnode followed by vnode.
	1223	*/
	1224	/* ARGSUSED */
	1225	static int
	1226	sysctl_vnode(SYSCTL_HANDLER_ARGS)
	1227	{
	1228	struct proc p = curproc; / XXX */
	1229	struct mount mp, nmp;
	1230	struct vnode nvp, vp;
	1231	lwkt_tokref ilock;
	1232	lwkt_tokref jlock;
	1233	int error;
	1234
	1235	#define VPTRSZ sizeof (struct vnode *)
	1236	#define VNODESZ sizeof (struct vnode)
	1237
	1238	req->lock = 0;
	1239	if (!req->oldptr) /* Make an estimate */
	1240	return (SYSCTL_OUT(req, 0,
	1241	(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
	1242
	1243	lwkt_gettoken(&ilock, &mountlist_token);
	1244	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
	1245	if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) {
	1246	nmp = TAILQ_NEXT(mp, mnt_list);
	1247	continue;
	1248	}
	1249	lwkt_gettoken(&jlock, &mntvnode_token);
	1250	again:
	1251	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	1252	vp != NULL;
	1253	vp = nvp) {
	1254	/*
	1255	* Check that the vp is still associated with
	1256	* this filesystem. RACE: could have been
	1257	* recycled onto the same filesystem.
	1258	*/
	1259	if (vp->v_mount != mp)
	1260	goto again;
	1261	nvp = TAILQ_NEXT(vp, v_nmntvnodes);
	1262	if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) \|\|
	1263	(error = SYSCTL_OUT(req, vp, VNODESZ))) {
	1264	lwkt_reltoken(&jlock);
	1265	return (error);
	1266	}
	1267	}
	1268	lwkt_reltoken(&jlock);
	1269	lwkt_gettokref(&ilock);
	1270	nmp = TAILQ_NEXT(mp, mnt_list); /* ZZZ */
	1271	vfs_unbusy(mp, p);
	1272	}
	1273	lwkt_reltoken(&ilock);
	1274
	1275	return (0);
	1276	}
	1277	#endif
	1278
	1279	/*
	1280	* XXX
	1281	* Exporting the vnode list on large systems causes them to crash.
	1282	* Exporting the vnode list on medium systems causes sysctl to coredump.
	1283	*/
	1284	#if 0
	1285	SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE\|CTLFLAG_RD,
	1286	0, 0, sysctl_vnode, "S,vnode", "");
	1287	#endif
	1288
	1289	/*
	1290	* Check to see if a filesystem is mounted on a block device.
	1291	*/
	1292	int
	1293	vfs_mountedon(struct vnode *vp)
	1294	{
	1295	dev_t dev;
	1296
	1297	if ((dev = vp->v_rdev) == NULL)
	1298	dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	1299	if (dev != NODEV && dev->si_mountpoint)
	1300	return (EBUSY);
	1301	return (0);
	1302	}
	1303
	1304	/*
	1305	* Unmount all filesystems. The list is traversed in reverse order
	1306	* of mounting to avoid dependencies.
	1307	*/
	1308	void
	1309	vfs_unmountall(void)
	1310	{
	1311	struct mount *mp;
	1312	struct thread *td = curthread;
	1313	int error;
	1314
	1315	if (td->td_proc == NULL)
	1316	td = initproc->p_thread; /* XXX XXX use proc0 instead? */
	1317
	1318	/*
	1319	* Since this only runs when rebooting, it is not interlocked.
	1320	*/
	1321	while(!TAILQ_EMPTY(&mountlist)) {
	1322	mp = TAILQ_LAST(&mountlist, mntlist);
	1323	error = dounmount(mp, MNT_FORCE, td);
	1324	if (error) {
	1325	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	1326	printf("unmount of %s failed (",
	1327	mp->mnt_stat.f_mntonname);
	1328	if (error == EBUSY)
	1329	printf("BUSY)\n");
	1330	else
	1331	printf("%d)\n", error);
	1332	} else {
	1333	/* The unmount has removed mp from the mountlist */
	1334	}
	1335	}
	1336	}
	1337
	1338	/*
	1339	* Build hash lists of net addresses and hang them off the mount point.
	1340	* Called by ufs_mount() to set up the lists of export addresses.
	1341	*/
	1342	static int
	1343	vfs_hang_addrlist(struct mount mp, struct netexport nep,
	1344	struct export_args *argp)
	1345	{
	1346	struct netcred *np;
	1347	struct radix_node_head *rnh;
	1348	int i;
	1349	struct radix_node *rn;
	1350	struct sockaddr saddr, smask = 0;
	1351	struct domain *dom;
	1352	int error;
	1353
	1354	if (argp->ex_addrlen == 0) {
	1355	if (mp->mnt_flag & MNT_DEFEXPORTED)
	1356	return (EPERM);
	1357	np = &nep->ne_defexported;
	1358	np->netc_exflags = argp->ex_flags;
	1359	np->netc_anon = argp->ex_anon;
	1360	np->netc_anon.cr_ref = 1;
	1361	mp->mnt_flag \|= MNT_DEFEXPORTED;
	1362	return (0);
	1363	}
	1364
	1365	if (argp->ex_addrlen < 0 \|\| argp->ex_addrlen > MLEN)
	1366	return (EINVAL);
	1367	if (argp->ex_masklen < 0 \|\| argp->ex_masklen > MLEN)
	1368	return (EINVAL);
	1369
	1370	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	1371	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	1372	bzero((caddr_t) np, i);
	1373	saddr = (struct sockaddr *) (np + 1);
	1374	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
	1375	goto out;
	1376	if (saddr->sa_len > argp->ex_addrlen)
	1377	saddr->sa_len = argp->ex_addrlen;
	1378	if (argp->ex_masklen) {
	1379	smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
	1380	error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
	1381	if (error)
	1382	goto out;
	1383	if (smask->sa_len > argp->ex_masklen)
	1384	smask->sa_len = argp->ex_masklen;
	1385	}
	1386	i = saddr->sa_family;
	1387	if ((rnh = nep->ne_rtable[i]) == 0) {
	1388	/*
	1389	* Seems silly to initialize every AF when most are not used,
	1390	* do so on demand here
	1391	*/
	1392	for (dom = domains; dom; dom = dom->dom_next)
	1393	if (dom->dom_family == i && dom->dom_rtattach) {
	1394	dom->dom_rtattach((void **) &nep->ne_rtable[i],
	1395	dom->dom_rtoffset);
	1396	break;
	1397	}
	1398	if ((rnh = nep->ne_rtable[i]) == 0) {
	1399	error = ENOBUFS;
	1400	goto out;
	1401	}
	1402	}
	1403	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	1404	np->netc_rnodes);
	1405	if (rn == 0 \|\| np != (struct netcred ) rn) { / already exists */
	1406	error = EPERM;
	1407	goto out;
	1408	}
	1409	np->netc_exflags = argp->ex_flags;
	1410	np->netc_anon = argp->ex_anon;
	1411	np->netc_anon.cr_ref = 1;
	1412	return (0);
	1413	out:
	1414	free(np, M_NETADDR);
	1415	return (error);
	1416	}
	1417
	1418	/* ARGSUSED */
	1419	static int
	1420	vfs_free_netcred(struct radix_node rn, void w)
	1421	{
	1422	struct radix_node_head rnh = (struct radix_node_head ) w;
	1423
	1424	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	1425	free((caddr_t) rn, M_NETADDR);
	1426	return (0);
	1427	}
	1428
	1429	/*
	1430	* Free the net address hash lists that are hanging off the mount points.
	1431	*/
	1432	static void
	1433	vfs_free_addrlist(struct netexport *nep)
	1434	{
	1435	int i;
	1436	struct radix_node_head *rnh;
	1437
	1438	for (i = 0; i <= AF_MAX; i++)
	1439	if ((rnh = nep->ne_rtable[i])) {
	1440	(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
	1441	(caddr_t) rnh);
	1442	free((caddr_t) rnh, M_RTABLE);
	1443	nep->ne_rtable[i] = 0;
	1444	}
	1445	}
	1446
	1447	int
	1448	vfs_export(struct mount mp, struct netexport nep, struct export_args *argp)
	1449	{
	1450	int error;
	1451
	1452	if (argp->ex_flags & MNT_DELEXPORT) {
	1453	if (mp->mnt_flag & MNT_EXPUBLIC) {
	1454	vfs_setpublicfs(NULL, NULL, NULL);
	1455	mp->mnt_flag &= ~MNT_EXPUBLIC;
	1456	}
	1457	vfs_free_addrlist(nep);
	1458	mp->mnt_flag &= ~(MNT_EXPORTED \| MNT_DEFEXPORTED);
	1459	}
	1460	if (argp->ex_flags & MNT_EXPORTED) {
	1461	if (argp->ex_flags & MNT_EXPUBLIC) {
	1462	if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
	1463	return (error);
	1464	mp->mnt_flag \|= MNT_EXPUBLIC;
	1465	}
	1466	if ((error = vfs_hang_addrlist(mp, nep, argp)))
	1467	return (error);
	1468	mp->mnt_flag \|= MNT_EXPORTED;
	1469	}
	1470	return (0);
	1471	}
	1472
	1473
	1474	/*
	1475	* Set the publicly exported filesystem (WebNFS). Currently, only
	1476	* one public filesystem is possible in the spec (RFC 2054 and 2055)
	1477	*/
	1478	int
	1479	vfs_setpublicfs(struct mount mp, struct netexport nep,
	1480	struct export_args *argp)
	1481	{
	1482	int error;
	1483	struct vnode *rvp;
	1484	char *cp;
	1485
	1486	/*
	1487	* mp == NULL -> invalidate the current info, the FS is
	1488	* no longer exported. May be called from either vfs_export
	1489	* or unmount, so check if it hasn't already been done.
	1490	*/
	1491	if (mp == NULL) {
	1492	if (nfs_pub.np_valid) {
	1493	nfs_pub.np_valid = 0;
	1494	if (nfs_pub.np_index != NULL) {
	1495	FREE(nfs_pub.np_index, M_TEMP);
	1496	nfs_pub.np_index = NULL;
	1497	}
	1498	}
	1499	return (0);
	1500	}
	1501
	1502	/*
	1503	* Only one allowed at a time.
	1504	*/
	1505	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
	1506	return (EBUSY);
	1507
	1508	/*
	1509	* Get real filehandle for root of exported FS.
	1510	*/
	1511	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	1512	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
	1513
	1514	if ((error = VFS_ROOT(mp, &rvp)))
	1515	return (error);
	1516
	1517	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
	1518	return (error);
	1519
	1520	vput(rvp);
	1521
	1522	/*
	1523	* If an indexfile was specified, pull it in.
	1524	*/
	1525	if (argp->ex_indexfile != NULL) {
	1526	MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
	1527	M_WAITOK);
	1528	error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
	1529	MAXNAMLEN, (size_t *)0);
	1530	if (!error) {
	1531	/*
	1532	* Check for illegal filenames.
	1533	*/
	1534	for (cp = nfs_pub.np_index; *cp; cp++) {
	1535	if (*cp == '/') {
	1536	error = EINVAL;
	1537	break;
	1538	}
	1539	}
	1540	}
	1541	if (error) {
	1542	FREE(nfs_pub.np_index, M_TEMP);
	1543	return (error);
	1544	}
	1545	}
	1546
	1547	nfs_pub.np_mount = mp;
	1548	nfs_pub.np_valid = 1;
	1549	return (0);
	1550	}
	1551
	1552	struct netcred *
	1553	vfs_export_lookup(struct mount mp, struct netexport nep,
	1554	struct sockaddr *nam)
	1555	{
	1556	struct netcred *np;
	1557	struct radix_node_head *rnh;
	1558	struct sockaddr *saddr;
	1559
	1560	np = NULL;
	1561	if (mp->mnt_flag & MNT_EXPORTED) {
	1562	/*
	1563	* Lookup in the export list first.
	1564	*/
	1565	if (nam != NULL) {
	1566	saddr = nam;
	1567	rnh = nep->ne_rtable[saddr->sa_family];
	1568	if (rnh != NULL) {
	1569	np = (struct netcred *)
	1570	(*rnh->rnh_matchaddr)((caddr_t)saddr,
	1571	rnh);
	1572	if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
	1573	np = NULL;
	1574	}
	1575	}
	1576	/*
	1577	* If no address match, use the default if it exists.
	1578	*/
	1579	if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
	1580	np = &nep->ne_defexported;
	1581	}
	1582	return (np);
	1583	}
	1584
	1585	/*
	1586	* perform msync on all vnodes under a mount point. The mount point must
	1587	* be locked. This code is also responsible for lazy-freeing unreferenced
	1588	* vnodes whos VM objects no longer contain pages.
	1589	*
	1590	* NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
	1591	*/
	1592	static int vfs_msync_scan1(struct mount mp, struct vnode vp, void *data);
	1593	static int vfs_msync_scan2(struct mount mp, struct vnode vp, void *data);
	1594
	1595	void
	1596	vfs_msync(struct mount *mp, int flags)
	1597	{
	1598	vmntvnodescan(mp, VMSC_REFVP, vfs_msync_scan1, vfs_msync_scan2,
	1599	(void *)flags);
	1600	}
	1601
	1602	/*
	1603	* scan1 is a fast pre-check. There could be hundreds of thousands of
	1604	* vnodes, we cannot afford to do anything heavy weight until we have a
	1605	* fairly good indication that there is work to do.
	1606	*/
	1607	static
	1608	int
	1609	vfs_msync_scan1(struct mount mp, struct vnode vp, void *data)
	1610	{
	1611	int flags = (int)data;
	1612
	1613	if ((vp->v_flag & VRECLAIMED) == 0) {
	1614	if (vshouldfree(vp, 0))
	1615	return(0); /* call scan2 */
	1616	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	1617	(vp->v_flag & VOBJDIRTY) &&
	1618	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	1619	return(0); /* call scan2 */
	1620	}
	1621	}
	1622
	1623	/*
	1624	* do not call scan2, continue the loop
	1625	*/
	1626	return(-1);
	1627	}
	1628
	1629	static
	1630	int
	1631	vfs_msync_scan2(struct mount mp, struct vnode vp, void *data)
	1632	{
	1633	vm_object_t obj;
	1634	int flags = (int)data;
	1635
	1636	if (vp->v_flag & VRECLAIMED)
	1637	return(0);
	1638
	1639	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	1640	(vp->v_flag & VOBJDIRTY) &&
	1641	(flags == MNT_WAIT \|\| VOP_ISLOCKED(vp, NULL) == 0)) {
	1642	if (VOP_GETVOBJECT(vp, &obj) == 0) {
	1643	vm_object_page_clean(obj, 0, 0,
	1644	flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
	1645	}
	1646	}
	1647	return(0);
	1648	}
	1649
	1650	/*
	1651	* Create the VM object needed for VMIO and mmap support. This
	1652	* is done for all VREG files in the system. Some filesystems might
	1653	* afford the additional metadata buffering capability of the
	1654	* VMIO code by making the device node be VMIO mode also.
	1655	*
	1656	* vp must be locked when vfs_object_create is called.
	1657	*/
	1658	int
	1659	vfs_object_create(struct vnode vp, struct thread td)
	1660	{
	1661	return (VOP_CREATEVOBJECT(vp, td));
	1662	}
	1663
	1664	/*
	1665	* Record a process's interest in events which might happen to
	1666	* a vnode. Because poll uses the historic select-style interface
	1667	* internally, this routine serves as both the ``check for any
	1668	* pending events'' and the ``record my interest in future events''
	1669	* functions. (These are done together, while the lock is held,
	1670	* to avoid race conditions.)
	1671	*/
	1672	int
	1673	vn_pollrecord(struct vnode vp, struct thread td, int events)
	1674	{
	1675	lwkt_tokref ilock;
	1676
	1677	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1678	if (vp->v_pollinfo.vpi_revents & events) {
	1679	/*
	1680	* This leaves events we are not interested
	1681	* in available for the other process which
	1682	* which presumably had requested them
	1683	* (otherwise they would never have been
	1684	* recorded).
	1685	*/
	1686	events &= vp->v_pollinfo.vpi_revents;
	1687	vp->v_pollinfo.vpi_revents &= ~events;
	1688
	1689	lwkt_reltoken(&ilock);
	1690	return events;
	1691	}
	1692	vp->v_pollinfo.vpi_events \|= events;
	1693	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	1694	lwkt_reltoken(&ilock);
	1695	return 0;
	1696	}
	1697
	1698	/*
	1699	* Note the occurrence of an event. If the VN_POLLEVENT macro is used,
	1700	* it is possible for us to miss an event due to race conditions, but
	1701	* that condition is expected to be rare, so for the moment it is the
	1702	* preferred interface.
	1703	*/
	1704	void
	1705	vn_pollevent(struct vnode *vp, int events)
	1706	{
	1707	lwkt_tokref ilock;
	1708
	1709	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1710	if (vp->v_pollinfo.vpi_events & events) {
	1711	/*
	1712	* We clear vpi_events so that we don't
	1713	* call selwakeup() twice if two events are
	1714	* posted before the polling process(es) is
	1715	* awakened. This also ensures that we take at
	1716	* most one selwakeup() if the polling process
	1717	* is no longer interested. However, it does
	1718	* mean that only one event can be noticed at
	1719	* a time. (Perhaps we should only clear those
	1720	* event bits which we note?) XXX
	1721	*/
	1722	vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
	1723	vp->v_pollinfo.vpi_revents \|= events;
	1724	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	1725	}
	1726	lwkt_reltoken(&ilock);
	1727	}
	1728
	1729	/*
	1730	* Wake up anyone polling on vp because it is being revoked.
	1731	* This depends on dead_poll() returning POLLHUP for correct
	1732	* behavior.
	1733	*/
	1734	void
	1735	vn_pollgone(struct vnode *vp)
	1736	{
	1737	lwkt_tokref ilock;
	1738
	1739	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	1740	if (vp->v_pollinfo.vpi_events) {
	1741	vp->v_pollinfo.vpi_events = 0;
	1742	selwakeup(&vp->v_pollinfo.vpi_selinfo);
	1743	}
	1744	lwkt_reltoken(&ilock);
	1745	}
	1746
	1747	/*
	1748	* extract the dev_t from a VBLK or VCHR. The vnode must have been opened
	1749	* (or v_rdev might be NULL).
	1750	*/
	1751	dev_t
	1752	vn_todev(struct vnode *vp)
	1753	{
	1754	if (vp->v_type != VBLK && vp->v_type != VCHR)
	1755	return (NODEV);
	1756	KKASSERT(vp->v_rdev != NULL);
	1757	return (vp->v_rdev);
	1758	}
	1759
	1760	/*
	1761	* Check if vnode represents a disk device. The vnode does not need to be
	1762	* opened.
	1763	*/
	1764	int
	1765	vn_isdisk(struct vnode vp, int errp)
	1766	{
	1767	dev_t dev;
	1768
	1769	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	1770	if (errp != NULL)
	1771	*errp = ENOTBLK;
	1772	return (0);
	1773	}
	1774
	1775	if ((dev = vp->v_rdev) == NULL)
	1776	dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	1777	if (dev == NULL \|\| dev == NODEV) {
	1778	if (errp != NULL)
	1779	*errp = ENXIO;
	1780	return (0);
	1781	}
	1782	if (dev_is_good(dev) == 0) {
	1783	if (errp != NULL)
	1784	*errp = ENXIO;
	1785	return (0);
	1786	}
	1787	if ((dev_dflags(dev) & D_DISK) == 0) {
	1788	if (errp != NULL)
	1789	*errp = ENOTBLK;
	1790	return (0);
	1791	}
	1792	if (errp != NULL)
	1793	*errp = 0;
	1794	return (1);
	1795	}
	1796
	1797	void
	1798	NDFREE(struct nameidata *ndp, const uint flags)
	1799	{
	1800	if (!(flags & NDF_NO_FREE_PNBUF) &&
	1801	(ndp->ni_cnd.cn_flags & CNP_HASBUF)) {
	1802	zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
	1803	ndp->ni_cnd.cn_flags &= ~CNP_HASBUF;
	1804	}
	1805	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	1806	(ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) &&
	1807	ndp->ni_dvp != ndp->ni_vp) {
	1808	VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_td);
	1809	}
	1810	if (!(flags & NDF_NO_DVP_RELE) &&
	1811	(ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT\|CNP_WANTPARENT))) {
	1812	vrele(ndp->ni_dvp);
	1813	ndp->ni_dvp = NULL;
	1814	}
	1815	if (!(flags & NDF_NO_VP_UNLOCK) &&
	1816	(ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) {
	1817	VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_td);
	1818	}
	1819	if (!(flags & NDF_NO_VP_RELE) &&
	1820	ndp->ni_vp) {
	1821	vrele(ndp->ni_vp);
	1822	ndp->ni_vp = NULL;
	1823	}
	1824	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	1825	(ndp->ni_cnd.cn_flags & CNP_SAVESTART)) {
	1826	vrele(ndp->ni_startdir);
	1827	ndp->ni_startdir = NULL;
	1828	}
	1829	}
	1830
	1831	#ifdef DEBUG_VFS_LOCKS
	1832
	1833	void
	1834	assert_vop_locked(struct vnode vp, const char str)
	1835	{
	1836	if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
	1837	panic("%s: %p is not locked shared but should be", str, vp);
	1838	}
	1839	}
	1840
	1841	void
	1842	assert_vop_unlocked(struct vnode vp, const char str)
	1843	{
	1844	if (vp && IS_LOCKING_VFS(vp)) {
	1845	if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
	1846	panic("%s: %p is locked but should not be", str, vp);
	1847	}
	1848	}
	1849	}
	1850
	1851	#endif