gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. Neither the name of the University nor the names of its contributors
	19	* may be used to endorse or promote products derived from this software
	20	* without specific prior written permission.
	21	*
	22	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	23	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	24	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	25	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	26	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	27	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	28	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	29	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	30	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	31	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
	35	* $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
	36	*/
	37
	38	#include <sys/param.h>
	39	#include <sys/systm.h>
	40	#include <sys/buf.h>
	41	#include <sys/conf.h>
	42	#include <sys/sysent.h>
	43	#include <sys/malloc.h>
	44	#include <sys/mount.h>
	45	#include <sys/mountctl.h>
	46	#include <sys/sysmsg.h>
	47	#include <sys/filedesc.h>
	48	#include <sys/kernel.h>
	49	#include <sys/fcntl.h>
	50	#include <sys/file.h>
	51	#include <sys/linker.h>
	52	#include <sys/stat.h>
	53	#include <sys/unistd.h>
	54	#include <sys/vnode.h>
	55	#include <sys/proc.h>
	56	#include <sys/caps.h>
	57	#include <sys/jail.h>
	58	#include <sys/namei.h>
	59	#include <sys/nlookup.h>
	60	#include <sys/dirent.h>
	61	#include <sys/extattr.h>
	62	#include <sys/spinlock.h>
	63	#include <sys/kern_syscall.h>
	64	#include <sys/objcache.h>
	65	#include <sys/sysctl.h>
	66
	67	#include <sys/buf2.h>
	68	#include <sys/file2.h>
	69	#include <sys/spinlock2.h>
	70
	71	#include <vm/vm.h>
	72	#include <vm/vm_object.h>
	73	#include <vm/vm_page.h>
	74
	75	#include <machine/limits.h>
	76	#include <machine/stdarg.h>
	77
	78	static void mount_warning(struct mount mp, const char ctl, ...)
	79	__printflike(2, 3);
	80	static int mount_path(struct proc p, struct mount mp, char rb, char fb);
	81	static int checkvp_chdir (struct vnode vn, struct thread td);
	82	static void checkdirs (struct nchandle old_nch, struct nchandle new_nch);
	83	static int get_fscap(const char *);
	84	static int chroot_refuse_vdir_fds (thread_t td, struct filedesc *fdp);
	85	static int chroot_visible_mnt(struct mount mp, struct proc p);
	86	static int getutimes (struct timeval , struct timespec );
	87	static int getutimens (const struct timespec , struct timespec , int *);
	88	static int setfown (struct mount , struct vnode , uid_t, gid_t);
	89	static int setfmode (struct vnode *, int);
	90	static int setfflags (struct vnode *, u_long);
	91	static int setutimes (struct vnode , struct vattr ,
	92	const struct timespec *, int);
	93
	94	static int usermount = 0; /* if 1, non-root can mount fs. */
	95	SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
	96	"Allow non-root users to mount filesystems");
	97
	98	static int debug_unmount = 0; /* if 1 loop until unmount success */
	99	SYSCTL_INT(_vfs, OID_AUTO, debug_unmount, CTLFLAG_RW, &debug_unmount, 0,
	100	"Stall failed unmounts in loop");
	101
	102	static struct krate krate_rename = { 1 };
	103
	104	/*
	105	* Virtual File System System Calls
	106	*/
	107
	108	/*
	109	* Mount a file system.
	110	*
	111	* mount_args(char type, char path, int flags, caddr_t data)
	112	*
	113	* MPALMOSTSAFE
	114	*/
	115	int
	116	sys_mount(struct sysmsg sysmsg, const struct mount_args uap)
	117	{
	118	struct thread *td = curthread;
	119	struct vnode *vp;
	120	struct nchandle nch;
	121	struct mount mp, nullmp;
	122	struct vfsconf *vfsp;
	123	int error, flag = 0, flag2 = 0;
	124	int hasmount;
	125	int priv = 0;
	126	int flags = uap->flags;
	127	struct vattr va;
	128	struct nlookupdata nd;
	129	char fstypename[MFSNAMELEN];
	130	struct ucred *cred;
	131
	132	cred = td->td_ucred;
	133
	134	/* We do not allow user mounts inside a jail for now */
	135	if (usermount && jailed(cred)) {
	136	error = EPERM;
	137	goto done;
	138	}
	139
	140	/*
	141	* Extract the file system type. We need to know this early, to take
	142	* appropriate actions for jails and the filesystems to mount.
	143	*/
	144	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0)
	145	goto done;
	146
	147	/*
	148	* Select the correct cap according to the file system type.
	149	*/
	150	priv = get_fscap(fstypename);
	151
	152	if (usermount == 0 && (error = caps_priv_check_td(td, priv)))
	153	goto done;
	154
	155	/*
	156	* Do not allow NFS export by non-root users.
	157	*/
	158	if (flags & MNT_EXPORTED) {
	159	error = caps_priv_check_td(td, priv);
	160	if (error)
	161	goto done;
	162	}
	163	/*
	164	* Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
	165	*/
	166	if (caps_priv_check_td(td, priv))
	167	flags \|= MNT_NOSUID \| MNT_NODEV;
	168
	169	/*
	170	* Lookup the requested path and extract the nch and vnode.
	171	*/
	172	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	173	if (error == 0) {
	174	if ((error = nlookup(&nd)) == 0) {
	175	if (nd.nl_nch.ncp->nc_vp == NULL)
	176	error = ENOENT;
	177	}
	178	}
	179	if (error) {
	180	nlookup_done(&nd);
	181	goto done;
	182	}
	183
	184	/*
	185	* If the target filesystem is resolved via a nullfs mount, then
	186	* nd.nl_nch.mount will be pointing to the nullfs mount structure
	187	* instead of the target file system. We need it in case we are
	188	* doing an update.
	189	*/
	190	nullmp = nd.nl_nch.mount;
	191
	192	/*
	193	* Extract the locked+refd ncp and cleanup the nd structure
	194	*/
	195	nch = nd.nl_nch;
	196	cache_zero(&nd.nl_nch);
	197	nlookup_done(&nd);
	198
	199	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	200	(mp = cache_findmount(&nch)) != NULL) {
	201	cache_dropmount(mp);
	202	hasmount = 1;
	203	} else {
	204	hasmount = 0;
	205	}
	206
	207
	208	/*
	209	* now we have the locked ref'd nch and unreferenced vnode.
	210	*/
	211	vp = nch.ncp->nc_vp;
	212	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
	213	cache_put(&nch);
	214	goto done;
	215	}
	216	cache_unlock(&nch);
	217
	218	/*
	219	* Now we have an unlocked ref'd nch and a locked ref'd vp
	220	*/
	221	if (flags & MNT_UPDATE) {
	222	if ((vp->v_flag & (VROOT\|VPFSROOT)) == 0) {
	223	cache_drop(&nch);
	224	vput(vp);
	225	error = EINVAL;
	226	goto done;
	227	}
	228
	229	if (strncmp(fstypename, "null", 5) == 0) {
	230	KKASSERT(nullmp);
	231	mp = nullmp;
	232	} else {
	233	mp = vp->v_mount;
	234	}
	235
	236	flag = mp->mnt_flag;
	237	flag2 = mp->mnt_kern_flag;
	238	/*
	239	* We only allow the filesystem to be reloaded if it
	240	* is currently mounted read-only.
	241	*/
	242	if ((flags & MNT_RELOAD) &&
	243	((mp->mnt_flag & MNT_RDONLY) == 0)) {
	244	cache_drop(&nch);
	245	vput(vp);
	246	error = EOPNOTSUPP; /* Needs translation */
	247	goto done;
	248	}
	249	/*
	250	* Only root, or the user that did the original mount is
	251	* permitted to update it.
	252	*/
	253	if (mp->mnt_stat.f_owner != cred->cr_uid &&
	254	(error = caps_priv_check_td(td, priv))) {
	255	cache_drop(&nch);
	256	vput(vp);
	257	goto done;
	258	}
	259	if (vfs_busy(mp, LK_NOWAIT)) {
	260	cache_drop(&nch);
	261	vput(vp);
	262	error = EBUSY;
	263	goto done;
	264	}
	265	if (hasmount) {
	266	cache_drop(&nch);
	267	vfs_unbusy(mp);
	268	vput(vp);
	269	error = EBUSY;
	270	goto done;
	271	}
	272	mp->mnt_flag \|= flags & (MNT_RELOAD \| MNT_FORCE \| MNT_UPDATE);
	273	lwkt_gettoken(&mp->mnt_token);
	274	vn_unlock(vp);
	275	vfsp = mp->mnt_vfc;
	276	goto update;
	277	}
	278
	279	/*
	280	* If the user is not root, ensure that they own the directory
	281	* onto which we are attempting to mount.
	282	*/
	283	if ((error = VOP_GETATTR(vp, &va)) \|\|
	284	(va.va_uid != cred->cr_uid &&
	285	(error = caps_priv_check_td(td, priv)))) {
	286	cache_drop(&nch);
	287	vput(vp);
	288	goto done;
	289	}
	290	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
	291	cache_drop(&nch);
	292	vput(vp);
	293	goto done;
	294	}
	295	if (vp->v_type != VDIR) {
	296	cache_drop(&nch);
	297	vput(vp);
	298	error = ENOTDIR;
	299	goto done;
	300	}
	301	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
	302	cache_drop(&nch);
	303	vput(vp);
	304	error = EPERM;
	305	goto done;
	306	}
	307	vfsp = vfsconf_find_by_name(fstypename);
	308	if (vfsp == NULL) {
	309	linker_file_t lf;
	310
	311	/* Only load modules for root (very important!) */
	312	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
	313	if (error) {
	314	cache_drop(&nch);
	315	vput(vp);
	316	goto done;
	317	}
	318	error = linker_load_file(fstypename, &lf);
	319	if (error \|\| lf == NULL) {
	320	cache_drop(&nch);
	321	vput(vp);
	322	if (lf == NULL)
	323	error = ENODEV;
	324	goto done;
	325	}
	326	lf->userrefs++;
	327	/* lookup again, see if the VFS was loaded */
	328	vfsp = vfsconf_find_by_name(fstypename);
	329	if (vfsp == NULL) {
	330	lf->userrefs--;
	331	linker_file_unload(lf);
	332	cache_drop(&nch);
	333	vput(vp);
	334	error = ENODEV;
	335	goto done;
	336	}
	337	}
	338	if (hasmount) {
	339	cache_drop(&nch);
	340	vput(vp);
	341	error = EBUSY;
	342	goto done;
	343	}
	344
	345	/*
	346	* Allocate and initialize the filesystem.
	347	*/
	348	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO\|M_WAITOK);
	349	mount_init(mp, vfsp->vfc_vfsops);
	350	vfs_busy(mp, LK_NOWAIT);
	351	mp->mnt_vfc = vfsp;
	352	mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
	353	vfsp->vfc_refcount++;
	354	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	355	mp->mnt_flag \|= vfsp->vfc_flags & MNT_VISFLAGMASK;
	356	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	357	mp->mnt_stat.f_owner = cred->cr_uid;
	358	lwkt_gettoken(&mp->mnt_token);
	359	vn_unlock(vp);
	360	update:
	361	/*
	362	* (per-mount token acquired at this point)
	363	*
	364	* Set the mount level flags.
	365	*/
	366	if (flags & MNT_RDONLY)
	367	mp->mnt_flag \|= MNT_RDONLY;
	368	else if (mp->mnt_flag & MNT_RDONLY)
	369	mp->mnt_kern_flag \|= MNTK_WANTRDWR;
	370	mp->mnt_flag &=~ (MNT_NOSUID \| MNT_NOEXEC \| MNT_NODEV \|
	371	MNT_SYNCHRONOUS \| MNT_ASYNC \| MNT_NOATIME \|
	372	MNT_NOSYMFOLLOW \| MNT_IGNORE \| MNT_TRIM \|
	373	MNT_NOCLUSTERR \| MNT_NOCLUSTERW \| MNT_SUIDDIR \|
	374	MNT_AUTOMOUNTED);
	375	mp->mnt_flag \|= flags & (MNT_NOSUID \| MNT_NOEXEC \|
	376	MNT_NODEV \| MNT_SYNCHRONOUS \| MNT_ASYNC \| MNT_FORCE \|
	377	MNT_NOSYMFOLLOW \| MNT_IGNORE \| MNT_TRIM \|
	378	MNT_NOATIME \| MNT_NOCLUSTERR \| MNT_NOCLUSTERW \| MNT_SUIDDIR \|
	379	MNT_AUTOMOUNTED);
	380
	381	/*
	382	* Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
	383	* This way the initial VFS_MOUNT() call will also be MPSAFE.
	384	*/
	385	if (vfsp->vfc_flags & VFCF_MPSAFE)
	386	mp->mnt_kern_flag \|= MNTK_ALL_MPSAFE;
	387
	388	/*
	389	* Mount the filesystem.
	390	* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	391	* get.
	392	*/
	393	if (mp->mnt_flag & MNT_UPDATE) {
	394	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	395	if (mp->mnt_kern_flag & MNTK_WANTRDWR)
	396	mp->mnt_flag &= ~MNT_RDONLY;
	397	mp->mnt_flag &=~ (MNT_UPDATE \| MNT_RELOAD \| MNT_FORCE);
	398	mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
	399	if (error) {
	400	mp->mnt_flag = flag;
	401	mp->mnt_kern_flag = flag2;
	402	}
	403	lwkt_reltoken(&mp->mnt_token);
	404	vfs_unbusy(mp);
	405	vrele(vp);
	406	cache_drop(&nch);
	407	goto done;
	408	}
	409	mp->mnt_ncmounton = nch;
	410	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	411	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	412
	413	/*
	414	* Put the new filesystem on the mount list after root. The mount
	415	* point gets its own mnt_ncmountpt (unless the VFS already set one
	416	* up) which represents the root of the mount. The lookup code
	417	* detects the mount point going forward and checks the root of
	418	* the mount going backwards.
	419	*
	420	* It is not necessary to invalidate or purge the vnode underneath
	421	* because elements under the mount will be given their own glue
	422	* namecache record.
	423	*/
	424	if (!error) {
	425	if (mp->mnt_ncmountpt.ncp == NULL) {
	426	/*
	427	* Allocate, then unlock, but leave the ref intact.
	428	* This is the mnt_refs (1) that we will retain
	429	* through to the unmount.
	430	*/
	431	cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
	432	cache_unlock(&mp->mnt_ncmountpt);
	433	}
	434	vn_unlock(vp);
	435	cache_lock(&nch);
	436	nch.ncp->nc_flag \|= NCF_ISMOUNTPT;
	437	cache_unlock(&nch);
	438	cache_ismounting(mp);
	439	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	440
	441	mountlist_insert(mp, MNTINS_LAST);
	442	vn_unlock(vp);
	443	checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
	444	error = vfs_allocate_syncvnode(mp);
	445	lwkt_reltoken(&mp->mnt_token);
	446	vfs_unbusy(mp);
	447	error = VFS_START(mp, 0);
	448	vrele(vp);
	449	KNOTE(&fs_klist, VQ_MOUNT);
	450	} else {
	451	bzero(&mp->mnt_ncmounton, sizeof(mp->mnt_ncmounton));
	452	vn_syncer_thr_stop(mp);
	453	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
	454	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
	455	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
	456	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
	457	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
	458	if (mp->mnt_cred) {
	459	crfree(mp->mnt_cred);
	460	mp->mnt_cred = NULL;
	461	}
	462	mp->mnt_vfc->vfc_refcount--;
	463	lwkt_reltoken(&mp->mnt_token);
	464	vfs_unbusy(mp);
	465	kfree(mp, M_MOUNT);
	466	cache_drop(&nch);
	467	vput(vp);
	468	}
	469	done:
	470	return (error);
	471	}
	472
	473	/*
	474	* Scan all active processes to see if any of them have a current
	475	* or root directory onto which the new filesystem has just been
	476	* mounted. If so, replace them with the new mount point.
	477	*
	478	* Both old_nch and new_nch are ref'd on call but not locked.
	479	* new_nch must be temporarily locked so it can be associated with the
	480	* vnode representing the root of the mount point.
	481	*/
	482	struct checkdirs_info {
	483	struct nchandle old_nch;
	484	struct nchandle new_nch;
	485	struct vnode *old_vp;
	486	struct vnode *new_vp;
	487	};
	488
	489	static int checkdirs_callback(struct proc p, void data);
	490
	491	static void
	492	checkdirs(struct nchandle old_nch, struct nchandle new_nch)
	493	{
	494	struct checkdirs_info info;
	495	struct vnode *olddp;
	496	struct vnode *newdp;
	497	struct mount *mp;
	498
	499	/*
	500	* If the old mount point's vnode has a usecount of 1, it is not
	501	* being held as a descriptor anywhere.
	502	*/
	503	olddp = old_nch->ncp->nc_vp;
	504	if (olddp == NULL \|\| VREFCNT(olddp) == 1)
	505	return;
	506
	507	/*
	508	* Force the root vnode of the new mount point to be resolved
	509	* so we can update any matching processes.
	510	*/
	511	mp = new_nch->mount;
	512	if (VFS_ROOT(mp, &newdp))
	513	panic("mount: lost mount");
	514	vn_unlock(newdp);
	515	cache_lock(new_nch);
	516	vn_lock(newdp, LK_EXCLUSIVE \| LK_RETRY);
	517	cache_setunresolved(new_nch);
	518	cache_setvp(new_nch, newdp);
	519	cache_unlock(new_nch);
	520
	521	/*
	522	* Special handling of the root node
	523	*/
	524	if (rootvnode == olddp) {
	525	vref(newdp);
	526	vfs_cache_setroot(newdp, cache_hold(new_nch));
	527	}
	528
	529	/*
	530	* Pass newdp separately so the callback does not have to access
	531	* it via new_nch->ncp->nc_vp.
	532	*/
	533	info.old_nch = *old_nch;
	534	info.new_nch = *new_nch;
	535	info.new_vp = newdp;
	536	allproc_scan(checkdirs_callback, &info, 0);
	537	vput(newdp);
	538	}
	539
	540	/*
	541	* NOTE: callback is not MP safe because the scanned process's filedesc
	542	* structure can be ripped out from under us, amoung other things.
	543	*/
	544	static int
	545	checkdirs_callback(struct proc p, void data)
	546	{
	547	struct checkdirs_info *info = data;
	548	struct filedesc *fdp;
	549	struct nchandle ncdrop1;
	550	struct nchandle ncdrop2;
	551	struct vnode *vprele1;
	552	struct vnode *vprele2;
	553
	554	if ((fdp = p->p_fd) != NULL) {
	555	cache_zero(&ncdrop1);
	556	cache_zero(&ncdrop2);
	557	vprele1 = NULL;
	558	vprele2 = NULL;
	559
	560	/*
	561	* MPUNSAFE - XXX fdp can be pulled out from under a
	562	* foreign process.
	563	*
	564	* A shared filedesc is ok, we don't have to copy it
	565	* because we are making this change globally.
	566	*/
	567	spin_lock(&fdp->fd_spin);
	568	if (fdp->fd_ncdir.mount == info->old_nch.mount &&
	569	fdp->fd_ncdir.ncp == info->old_nch.ncp) {
	570	vprele1 = fdp->fd_cdir;
	571	vref(info->new_vp);
	572	fdp->fd_cdir = info->new_vp;
	573	ncdrop1 = fdp->fd_ncdir;
	574	cache_copy(&info->new_nch, &fdp->fd_ncdir);
	575	}
	576	if (fdp->fd_nrdir.mount == info->old_nch.mount &&
	577	fdp->fd_nrdir.ncp == info->old_nch.ncp) {
	578	vprele2 = fdp->fd_rdir;
	579	vref(info->new_vp);
	580	fdp->fd_rdir = info->new_vp;
	581	ncdrop2 = fdp->fd_nrdir;
	582	cache_copy(&info->new_nch, &fdp->fd_nrdir);
	583	}
	584	spin_unlock(&fdp->fd_spin);
	585	if (ncdrop1.ncp)
	586	cache_drop(&ncdrop1);
	587	if (ncdrop2.ncp)
	588	cache_drop(&ncdrop2);
	589	if (vprele1)
	590	vrele(vprele1);
	591	if (vprele2)
	592	vrele(vprele2);
	593	}
	594	return(0);
	595	}
	596
	597	/*
	598	* Unmount a file system.
	599	*
	600	* Note: unmount takes a path to the vnode mounted on as argument,
	601	* not special file (as before).
	602	*
	603	* umount_args(char *path, int flags)
	604	*
	605	* MPALMOSTSAFE
	606	*/
	607	int
	608	sys_unmount(struct sysmsg sysmsg, const struct unmount_args uap)
	609	{
	610	struct thread *td = curthread;
	611	struct proc *p __debugvar = td->td_proc;
	612	struct mount *mp = NULL;
	613	struct nlookupdata nd;
	614	char fstypename[MFSNAMELEN];
	615	int priv = 0;
	616	int error;
	617	struct ucred *cred;
	618
	619	cred = td->td_ucred;
	620
	621	KKASSERT(p);
	622
	623	/* We do not allow user umounts inside a jail for now */
	624	if (usermount && jailed(cred)) {
	625	error = EPERM;
	626	goto done;
	627	}
	628
	629	error = nlookup_init(&nd, uap->path, UIO_USERSPACE,
	630	NLC_FOLLOW \| NLC_IGNBADDIR);
	631	if (error == 0)
	632	error = nlookup(&nd);
	633	if (error)
	634	goto out;
	635
	636	mp = nd.nl_nch.mount;
	637
	638	/* Figure out the fsname in order to select proper privs */
	639	ksnprintf(fstypename, MFSNAMELEN, "%s", mp->mnt_vfc->vfc_name);
	640	priv = get_fscap(fstypename);
	641
	642	if (usermount == 0 && (error = caps_priv_check_td(td, priv))) {
	643	nlookup_done(&nd);
	644	goto done;
	645	}
	646
	647	/*
	648	* Only root, or the user that did the original mount is
	649	* permitted to unmount this filesystem.
	650	*/
	651	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
	652	(error = caps_priv_check_td(td, priv)))
	653	{
	654	goto out;
	655	}
	656
	657	/*
	658	* Don't allow unmounting the root file system.
	659	*/
	660	if (mp->mnt_flag & MNT_ROOTFS) {
	661	error = EINVAL;
	662	goto out;
	663	}
	664
	665	/*
	666	* Must be the root of the filesystem
	667	*/
	668	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
	669	error = EINVAL;
	670	goto out;
	671	}
	672
	673	/* Check if this mount belongs to this prison */
	674	if (jailed(cred) && mp->mnt_cred && (!mp->mnt_cred->cr_prison \|\|
	675	mp->mnt_cred->cr_prison != cred->cr_prison)) {
	676	kprintf("mountpoint %s does not belong to this jail\n",
	677	uap->path);
	678	error = EPERM;
	679	goto out;
	680	}
	681
	682	/*
	683	* If no error try to issue the unmount. We lose our cache
	684	* ref when we call nlookup_done so we must hold the mount point
	685	* to prevent use-after-free races.
	686	*/
	687	out:
	688	if (error == 0) {
	689	mount_hold(mp);
	690	nlookup_done(&nd);
	691	error = dounmount(mp, uap->flags, 0);
	692	mount_drop(mp);
	693	} else {
	694	nlookup_done(&nd);
	695	}
	696	done:
	697	return (error);
	698	}
	699
	700	/*
	701	* Do the actual file system unmount (interlocked against the mountlist
	702	* token and mp->mnt_token).
	703	*/
	704	static int
	705	dounmount_interlock(struct mount *mp)
	706	{
	707	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
	708	return (EBUSY);
	709	mp->mnt_kern_flag \|= MNTK_UNMOUNT;
	710	return(0);
	711	}
	712
	713	/*
	714	* Returns non-zero if the specified process uses the specified
	715	* mount point.
	716	*/
	717	static int
	718	process_uses_mount(struct proc p, struct mount mp)
	719	{
	720	struct filedesc *fdp;
	721	struct file *fp;
	722	int found;
	723	int n;
	724
	725	fdp = p->p_fd;
	726	if (fdp == NULL)
	727	return 0;
	728	if (fdp->fd_ncdir.mount == mp \|\|
	729	fdp->fd_nrdir.mount == mp \|\|
	730	fdp->fd_njdir.mount == mp)
	731	{
	732	return 1;
	733	}
	734
	735	found = 0;
	736	spin_lock_shared(&fdp->fd_spin);
	737	for (n = 0; n < fdp->fd_nfiles; ++n) {
	738	fp = fdp->fd_files[n].fp;
	739	if (fp && fp->f_nchandle.mount == mp) {
	740	found = 1;
	741	break;
	742	}
	743	}
	744	spin_unlock_shared(&fdp->fd_spin);
	745
	746	return found;
	747	}
	748
	749	/*
	750	* Cleanup processes that have references to the mount point
	751	* being force-unmounted.
	752	*/
	753	struct unmount_allproc_info {
	754	struct mount *mp;
	755	int sig;
	756	};
	757
	758	static int
	759	unmount_allproc_cb(struct proc p, void arg)
	760	{
	761	struct unmount_allproc_info *info;
	762	struct mount *mp;
	763
	764	info = arg;
	765	mp = info->mp;
	766
	767	if (p->p_textnch.mount == mp)
	768	cache_drop(&p->p_textnch);
	769	if (info->sig && process_uses_mount(p, mp))
	770	ksignal(p, info->sig);
	771
	772	return 0;
	773	}
	774
	775	/*
	776	* The guts of the unmount code. The mount owns one ref and one hold
	777	* count. If we successfully interlock the unmount, those refs are ours.
	778	* (The ref is from mnt_ncmountpt).
	779	*
	780	* When halting we shortcut certain mount types such as devfs by not actually
	781	* issuing the VFS_SYNC() or VFS_UNMOUNT(). They are still disconnected
	782	* from the mountlist so higher-level filesytems can unmount cleanly.
	783	*
	784	* The mount types that allow QUICKHALT are: devfs, tmpfs, procfs.
	785	*/
	786	int
	787	dounmount(struct mount *mp, int flags, int halting)
	788	{
	789	struct namecache *ncp;
	790	struct nchandle nch;
	791	struct vnode *vp;
	792	int error;
	793	int async_flag;
	794	int lflags;
	795	int freeok = 1;
	796	int hadsyncer = 0;
	797	int retry;
	798	int quickhalt;
	799
	800	lwkt_gettoken(&mp->mnt_token);
	801
	802	/*
	803	* When halting, certain mount points can essentially just
	804	* be unhooked and otherwise ignored.
	805	*/
	806	if (halting && (mp->mnt_kern_flag & MNTK_QUICKHALT)) {
	807	quickhalt = 1;
	808	freeok = 0;
	809	} else {
	810	quickhalt = 0;
	811	}
	812
	813
	814	/*
	815	* Exclusive access for unmounting purposes.
	816	*/
	817	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
	818	goto out;
	819
	820	/*
	821	* We now 'own' the last mp->mnt_refs
	822	*
	823	* Allow filesystems to detect that a forced unmount is in progress.
	824	*/
	825	if (flags & MNT_FORCE)
	826	mp->mnt_kern_flag \|= MNTK_UNMOUNTF;
	827	lflags = LK_EXCLUSIVE \| ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
	828	error = lockmgr(&mp->mnt_lock, lflags);
	829	if (error) {
	830	mp->mnt_kern_flag &= ~(MNTK_UNMOUNT \| MNTK_UNMOUNTF);
	831	if (mp->mnt_kern_flag & MNTK_MWAIT) {
	832	mp->mnt_kern_flag &= ~MNTK_MWAIT;
	833	wakeup(mp);
	834	}
	835	goto out;
	836	}
	837
	838	if (mp->mnt_flag & MNT_EXPUBLIC)
	839	vfs_setpublicfs(NULL, NULL, NULL);
	840
	841	vfs_msync(mp, MNT_WAIT);
	842	async_flag = mp->mnt_flag & MNT_ASYNC;
	843	mp->mnt_flag &=~ MNT_ASYNC;
	844
	845	/*
	846	* Decomission our special mnt_syncer vnode. This also stops
	847	* the vnlru code. If we are unable to unmount we recommission
	848	* the vnode.
	849	*
	850	* Then sync the filesystem.
	851	*/
	852	if ((vp = mp->mnt_syncer) != NULL) {
	853	mp->mnt_syncer = NULL;
	854	atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	855	vrele(vp);
	856	hadsyncer = 1;
	857	}
	858
	859	/*
	860	* Sync normally-mounted filesystem.
	861	*/
	862	if (quickhalt == 0) {
	863	if ((mp->mnt_flag & MNT_RDONLY) == 0)
	864	VFS_SYNC(mp, MNT_WAIT);
	865	}
	866
	867	/*
	868	* nchandle records ref the mount structure. Expect a count of 1
	869	* (our mount->mnt_ncmountpt).
	870	*
	871	* Scans can get temporary refs on a mountpoint (thought really
	872	* heavy duty stuff like cache_findmount() do not).
	873	*/
	874	for (retry = 0; (retry < 10 \|\| debug_unmount); ++retry) {
	875	/*
	876	* Invalidate the namecache topology under the mount.
	877	* nullfs mounts alias a real mount's namecache topology
	878	* and it should not be invalidated in that case.
	879	*/
	880	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
	881	cache_lock(&mp->mnt_ncmountpt);
	882	cache_inval(&mp->mnt_ncmountpt,
	883	CINV_DESTROY\|CINV_CHILDREN);
	884	cache_unlock(&mp->mnt_ncmountpt);
	885	}
	886
	887	/*
	888	* Clear pcpu caches
	889	*/
	890	cache_unmounting(mp);
	891	if (mp->mnt_refs != 1)
	892	cache_clearmntcache(mp);
	893
	894	/*
	895	* Break out if we are good. Don't count ncp refs if the
	896	* mount is aliased.
	897	*/
	898	ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
	899	NULL : mp->mnt_ncmountpt.ncp;
	900	if (mp->mnt_refs == 1 &&
	901	(ncp == NULL \|\| (ncp->nc_refs == 1 &&
	902	TAILQ_FIRST(&ncp->nc_list) == NULL))) {
	903	break;
	904	}
	905
	906	/*
	907	* If forcing the unmount, clean out any p->p_textnch
	908	* nchandles that match this mount.
	909	*
	910	* In addition any process which has a current, root, or
	911	* jail directory matching the mount, or which has an open
	912	* descriptor matching the mount, will be killed. We first
	913	* try SIGKILL, and if that doesn't work we issue SIGQUIT.
	914	*/
	915	if (flags & MNT_FORCE) {
	916	struct unmount_allproc_info info;
	917
	918	info.mp = mp;
	919	switch(retry) {
	920	case 3:
	921	info.sig = SIGINT;
	922	break;
	923	case 7:
	924	info.sig = SIGKILL;
	925	break;
	926	default:
	927	info.sig = 0;
	928	break;
	929	}
	930	allproc_scan(&unmount_allproc_cb, &info, 0);
	931	}
	932
	933	/*
	934	* Sleep and retry.
	935	*/
	936	tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 4 + 1);
	937	if (debug_unmount && (retry & 15) == 15) {
	938	mount_warning(mp,
	939	"(%p) debug - retry %d, "
	940	"%d namecache refs, %d mount refs",
	941	mp, retry,
	942	(ncp ? ncp->nc_refs - 1 : 0),
	943	mp->mnt_refs - 1);
	944	}
	945	}
	946	if (retry == 10) {
	947	mount_warning(mp,
	948	"forced umount of \"%s\" - "
	949	"%d namecache refs, %d mount refs",
	950	(mp->mnt_ncmountpt.ncp ?
	951	mp->mnt_ncmountpt.ncp->nc_name : "?"),
	952	(ncp ? ncp->nc_refs - 1 : 0),
	953	mp->mnt_refs - 1);
	954	}
	955
	956	error = 0;
	957	ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
	958	NULL : mp->mnt_ncmountpt.ncp;
	959	if (mp->mnt_refs != 1 \|\|
	960	(ncp != NULL && (ncp->nc_refs != 1 \|\|
	961	TAILQ_FIRST(&ncp->nc_list)))) {
	962	mount_warning(mp,
	963	"(%p): %d namecache refs, %d mount refs "
	964	"still present",
	965	mp,
	966	(ncp ? ncp->nc_refs - 1 : 0),
	967	mp->mnt_refs - 1);
	968	if (flags & MNT_FORCE) {
	969	freeok = 0;
	970	mount_warning(mp, "forcing unmount\n");
	971	} else {
	972	error = EBUSY;
	973	}
	974	}
	975
	976	/*
	977	* So far so good, sync the filesystem once more and
	978	* call the VFS unmount code if the sync succeeds.
	979	*/
	980	if (error == 0 && quickhalt == 0) {
	981	if (mp->mnt_flag & MNT_RDONLY) {
	982	error = VFS_UNMOUNT(mp, flags);
	983	} else {
	984	error = VFS_SYNC(mp, MNT_WAIT);
	985	if (error == 0 \|\| /* no error */
	986	error == EOPNOTSUPP \|\| /* no sync avail */
	987	(flags & MNT_FORCE)) { /* force anyway */
	988	error = VFS_UNMOUNT(mp, flags);
	989	}
	990	}
	991	if (error) {
	992	mount_warning(mp,
	993	"(%p) unmount: vfs refused to unmount, "
	994	"error %d",
	995	mp, error);
	996	}
	997	}
	998
	999	/*
	1000	* If an error occurred we can still recover, restoring the
	1001	* syncer vnode and misc flags.
	1002	*/
	1003	if (error) {
	1004	if (mp->mnt_syncer == NULL && hadsyncer)
	1005	vfs_allocate_syncvnode(mp);
	1006	mp->mnt_kern_flag &= ~(MNTK_UNMOUNT \| MNTK_UNMOUNTF);
	1007	mp->mnt_flag \|= async_flag;
	1008	lockmgr(&mp->mnt_lock, LK_RELEASE);
	1009	if (mp->mnt_kern_flag & MNTK_MWAIT) {
	1010	mp->mnt_kern_flag &= ~MNTK_MWAIT;
	1011	wakeup(mp);
	1012	}
	1013	goto out;
	1014	}
	1015	/*
	1016	* Clean up any journals still associated with the mount after
	1017	* filesystem activity has ceased.
	1018	*/
	1019	journal_remove_all_journals(mp,
	1020	((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
	1021
	1022	mountlist_remove(mp);
	1023
	1024	/*
	1025	* Remove any installed vnode ops here so the individual VFSs don't
	1026	* have to.
	1027	*
	1028	* mnt_refs should go to zero when we scrap mnt_ncmountpt.
	1029	*
	1030	* When quickhalting we have to keep these intact because the
	1031	* underlying vnodes have not been destroyed, and some might be
	1032	* dirty.
	1033	*/
	1034	if (quickhalt == 0) {
	1035	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
	1036	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
	1037	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
	1038	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
	1039	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
	1040	}
	1041
	1042	if (mp->mnt_ncmountpt.ncp != NULL) {
	1043	nch = mp->mnt_ncmountpt;
	1044	cache_zero(&mp->mnt_ncmountpt);
	1045	cache_clrmountpt(&nch);
	1046	cache_drop(&nch);
	1047	}
	1048	if (mp->mnt_ncmounton.ncp != NULL) {
	1049	cache_unmounting(mp);
	1050	nch = mp->mnt_ncmounton;
	1051	cache_zero(&mp->mnt_ncmounton);
	1052	cache_clrmountpt(&nch);
	1053	cache_drop(&nch);
	1054	}
	1055
	1056	if (mp->mnt_cred) {
	1057	crfree(mp->mnt_cred);
	1058	mp->mnt_cred = NULL;
	1059	}
	1060
	1061	mp->mnt_vfc->vfc_refcount--;
	1062
	1063	/*
	1064	* If not quickhalting the mount, we expect there to be no
	1065	* vnodes left.
	1066	*/
	1067	if (quickhalt == 0 && !TAILQ_EMPTY(&mp->mnt_nvnodelist))
	1068	panic("unmount: dangling vnode");
	1069
	1070	/*
	1071	* Release the lock
	1072	*/
	1073	lockmgr(&mp->mnt_lock, LK_RELEASE);
	1074	if (mp->mnt_kern_flag & MNTK_MWAIT) {
	1075	mp->mnt_kern_flag &= ~MNTK_MWAIT;
	1076	wakeup(mp);
	1077	}
	1078
	1079	/*
	1080	* If we reach here and freeok != 0 we must free the mount.
	1081	* mnt_refs should already have dropped to 0, so if it is not
	1082	* zero we must cycle the caches and wait.
	1083	*
	1084	* When we are satisfied that the mount has disconnected we can
	1085	* drop the hold on the mp that represented the mount (though the
	1086	* caller might actually have another, so the caller's drop may
	1087	* do the actual free).
	1088	*/
	1089	if (freeok) {
	1090	if (mp->mnt_refs > 0)
	1091	cache_clearmntcache(mp);
	1092	while (mp->mnt_refs > 0) {
	1093	cache_unmounting(mp);
	1094	wakeup(mp);
	1095	tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
	1096	cache_clearmntcache(mp);
	1097	}
	1098	lwkt_reltoken(&mp->mnt_token);
	1099	mount_drop(mp);
	1100	mp = NULL;
	1101	} else {
	1102	cache_clearmntcache(mp);
	1103	}
	1104	error = 0;
	1105	KNOTE(&fs_klist, VQ_UNMOUNT);
	1106	out:
	1107	if (mp)
	1108	lwkt_reltoken(&mp->mnt_token);
	1109	return (error);
	1110	}
	1111
	1112	static
	1113	void
	1114	mount_warning(struct mount mp, const char ctl, ...)
	1115	{
	1116	char *ptr;
	1117	char *buf;
	1118	__va_list va;
	1119
	1120	__va_start(va, ctl);
	1121	if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
	1122	&ptr, &buf, 0) == 0) {
	1123	kprintf("unmount(%s): ", ptr);
	1124	kvprintf(ctl, va);
	1125	kprintf("\n");
	1126	kfree(buf, M_TEMP);
	1127	} else {
	1128	kprintf("unmount(%p", mp);
	1129	if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
	1130	kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
	1131	kprintf("): ");
	1132	kvprintf(ctl, va);
	1133	kprintf("\n");
	1134	}
	1135	__va_end(va);
	1136	}
	1137
	1138	/*
	1139	* Shim cache_fullpath() to handle the case where a process is chrooted into
	1140	* a subdirectory of a mount. In this case if the root mount matches the
	1141	* process root directory's mount we have to specify the process's root
	1142	* directory instead of the mount point, because the mount point might
	1143	* be above the root directory.
	1144	*/
	1145	static
	1146	int
	1147	mount_path(struct proc p, struct mount mp, char rb, char fb)
	1148	{
	1149	struct nchandle *nch;
	1150
	1151	if (p && p->p_fd->fd_nrdir.mount == mp)
	1152	nch = &p->p_fd->fd_nrdir;
	1153	else
	1154	nch = &mp->mnt_ncmountpt;
	1155	return(cache_fullpath(p, nch, NULL, rb, fb, 0));
	1156	}
	1157
	1158	/*
	1159	* Sync each mounted filesystem.
	1160	*/
	1161
	1162	#ifdef DEBUG
	1163	static int syncprt = 0;
	1164	SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
	1165	#endif /* DEBUG */
	1166
	1167	static int sync_callback(struct mount mp, void data);
	1168
	1169	int
	1170	sys_sync(struct sysmsg sysmsg, const struct sync_args uap)
	1171	{
	1172	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
	1173	return (0);
	1174	}
	1175
	1176	static
	1177	int
	1178	sync_callback(struct mount mp, void data __unused)
	1179	{
	1180	int asyncflag;
	1181
	1182	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
	1183	lwkt_gettoken(&mp->mnt_token);
	1184	asyncflag = mp->mnt_flag & MNT_ASYNC;
	1185	mp->mnt_flag &= ~MNT_ASYNC;
	1186	lwkt_reltoken(&mp->mnt_token);
	1187	vfs_msync(mp, MNT_NOWAIT);
	1188	VFS_SYNC(mp, MNT_NOWAIT);
	1189	lwkt_gettoken(&mp->mnt_token);
	1190	mp->mnt_flag \|= asyncflag;
	1191	lwkt_reltoken(&mp->mnt_token);
	1192	}
	1193	return(0);
	1194	}
	1195
	1196	/* XXX PRISON: could be per prison flag */
	1197	static int prison_quotas;
	1198	#if 0
	1199	SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
	1200	#endif
	1201
	1202	/*
	1203	* quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
	1204	*
	1205	* Change filesystem quotas.
	1206	*
	1207	* MPALMOSTSAFE
	1208	*/
	1209	int
	1210	sys_quotactl(struct sysmsg sysmsg, const struct quotactl_args uap)
	1211	{
	1212	struct nlookupdata nd;
	1213	struct thread *td;
	1214	struct mount *mp;
	1215	int error;
	1216
	1217	td = curthread;
	1218	if (td->td_ucred->cr_prison && !prison_quotas) {
	1219	error = EPERM;
	1220	goto done;
	1221	}
	1222
	1223	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	1224	if (error == 0)
	1225	error = nlookup(&nd);
	1226	if (error == 0) {
	1227	mp = nd.nl_nch.mount;
	1228	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
	1229	uap->arg, nd.nl_cred);
	1230	}
	1231	nlookup_done(&nd);
	1232	done:
	1233	return (error);
	1234	}
	1235
	1236	/*
	1237	* mountctl(char path, int op, int fd, const void ctl, int ctllen,
	1238	* void *buf, int buflen)
	1239	*
	1240	* This function operates on a mount point and executes the specified
	1241	* operation using the specified control data, and possibly returns data.
	1242	*
	1243	* The actual number of bytes stored in the result buffer is returned, 0
	1244	* if none, otherwise an error is returned.
	1245	*
	1246	* MPALMOSTSAFE
	1247	*/
	1248	int
	1249	sys_mountctl(struct sysmsg sysmsg, const struct mountctl_args uap)
	1250	{
	1251	struct thread *td = curthread;
	1252	struct file *fp;
	1253	void *ctl = NULL;
	1254	void *buf = NULL;
	1255	char *path = NULL;
	1256	int error;
	1257
	1258	/*
	1259	* Sanity and permissions checks. We must be root.
	1260	*/
	1261	if (td->td_ucred->cr_prison != NULL)
	1262	return (EPERM);
	1263	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
	1264	(error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) != 0)
	1265	{
	1266	return (error);
	1267	}
	1268
	1269	/*
	1270	* Argument length checks
	1271	*/
	1272	if (uap->ctllen < 0 \|\| uap->ctllen > 1024)
	1273	return (EINVAL);
	1274	if (uap->buflen < 0 \|\| uap->buflen > 16 * 1024)
	1275	return (EINVAL);
	1276	if (uap->path == NULL)
	1277	return (EINVAL);
	1278
	1279	/*
	1280	* Allocate the necessary buffers and copyin data
	1281	*/
	1282	path = objcache_get(namei_oc, M_WAITOK);
	1283	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	1284	if (error)
	1285	goto done;
	1286
	1287	if (uap->ctllen) {
	1288	ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK\|M_ZERO);
	1289	error = copyin(uap->ctl, ctl, uap->ctllen);
	1290	if (error)
	1291	goto done;
	1292	}
	1293	if (uap->buflen)
	1294	buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK\|M_ZERO);
	1295
	1296	/*
	1297	* Validate the descriptor
	1298	*/
	1299	if (uap->fd >= 0) {
	1300	fp = holdfp(td, uap->fd, -1);
	1301	if (fp == NULL) {
	1302	error = EBADF;
	1303	goto done;
	1304	}
	1305	} else {
	1306	fp = NULL;
	1307	}
	1308
	1309	/*
	1310	* Execute the internal kernel function and clean up.
	1311	*/
	1312	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen,
	1313	buf, uap->buflen, &sysmsg->sysmsg_result);
	1314	if (fp)
	1315	dropfp(td, uap->fd, fp);
	1316	if (error == 0 && sysmsg->sysmsg_result > 0)
	1317	error = copyout(buf, uap->buf, sysmsg->sysmsg_result);
	1318	done:
	1319	if (path)
	1320	objcache_put(namei_oc, path);
	1321	if (ctl)
	1322	kfree(ctl, M_TEMP);
	1323	if (buf)
	1324	kfree(buf, M_TEMP);
	1325	return (error);
	1326	}
	1327
	1328	/*
	1329	* Execute a mount control operation by resolving the path to a mount point
	1330	* and calling vop_mountctl().
	1331	*
	1332	* Use the mount point from the nch instead of the vnode so nullfs mounts
	1333	* can properly spike the VOP.
	1334	*/
	1335	int
	1336	kern_mountctl(const char path, int op, struct file fp,
	1337	const void *ctl, int ctllen,
	1338	void buf, int buflen, int res)
	1339	{
	1340	struct vnode *vp;
	1341	struct nlookupdata nd;
	1342	struct nchandle nch;
	1343	struct mount *mp;
	1344	int error;
	1345
	1346	*res = 0;
	1347	vp = NULL;
	1348	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	1349	if (error)
	1350	return (error);
	1351	error = nlookup(&nd);
	1352	if (error) {
	1353	nlookup_done(&nd);
	1354	return (error);
	1355	}
	1356	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	1357	if (error) {
	1358	nlookup_done(&nd);
	1359	return (error);
	1360	}
	1361
	1362	/*
	1363	* Yes, all this is needed to use the nch.mount below, because
	1364	* we must maintain a ref on the mount to avoid ripouts (e.g.
	1365	* due to heavy mount/unmount use by synth or poudriere).
	1366	*/
	1367	nch = nd.nl_nch;
	1368	cache_zero(&nd.nl_nch);
	1369	cache_unlock(&nch);
	1370	nlookup_done(&nd);
	1371	vn_unlock(vp);
	1372
	1373	mp = nch.mount;
	1374
	1375	/*
	1376	* Must be the root of the filesystem
	1377	*/
	1378	if ((vp->v_flag & (VROOT\|VPFSROOT)) == 0) {
	1379	cache_drop(&nch);
	1380	vrele(vp);
	1381	return (EINVAL);
	1382	}
	1383	if (mp == NULL \|\| mp->mnt_kern_flag & MNTK_UNMOUNT) {
	1384	kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
	1385	path);
	1386	cache_drop(&nch);
	1387	vrele(vp);
	1388	return (EINVAL);
	1389	}
	1390	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
	1391	buf, buflen, res);
	1392	vrele(vp);
	1393	cache_drop(&nch);
	1394
	1395	return (error);
	1396	}
	1397
	1398	int
	1399	kern_statfs(struct nlookupdata nd, struct statfs buf)
	1400	{
	1401	struct thread *td = curthread;
	1402	struct proc *p = td->td_proc;
	1403	struct mount *mp;
	1404	struct statfs *sp;
	1405	char fullpath, freepath;
	1406	int error;
	1407
	1408	if ((error = nlookup(nd)) != 0)
	1409	return (error);
	1410	mp = nd->nl_nch.mount;
	1411	sp = &mp->mnt_stat;
	1412
	1413	/*
	1414	* Ignore refresh error, user should have visibility.
	1415	* This can happen if a NFS mount goes bad (e.g. server
	1416	* revokes perms or goes down).
	1417	*/
	1418	error = VFS_STATFS(mp, sp, nd->nl_cred);
	1419	/* ignore error */
	1420
	1421	error = mount_path(p, mp, &fullpath, &freepath);
	1422	if (error)
	1423	return(error);
	1424	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	1425	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	1426	kfree(freepath, M_TEMP);
	1427
	1428	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	1429	bcopy(sp, buf, sizeof(*buf));
	1430	/* Only root should have access to the fsid's. */
	1431	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
	1432	buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	1433	return (0);
	1434	}
	1435
	1436	/*
	1437	* statfs_args(char path, struct statfs buf)
	1438	*
	1439	* Get filesystem statistics.
	1440	*/
	1441	int
	1442	sys_statfs(struct sysmsg sysmsg, const struct statfs_args uap)
	1443	{
	1444	struct nlookupdata nd;
	1445	struct statfs buf;
	1446	int error;
	1447
	1448	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	1449	if (error == 0)
	1450	error = kern_statfs(&nd, &buf);
	1451	nlookup_done(&nd);
	1452	if (error == 0)
	1453	error = copyout(&buf, uap->buf, sizeof(*uap->buf));
	1454	return (error);
	1455	}
	1456
	1457	int
	1458	kern_fstatfs(int fd, struct statfs *buf)
	1459	{
	1460	struct thread *td = curthread;
	1461	struct proc *p = td->td_proc;
	1462	struct file *fp;
	1463	struct mount *mp;
	1464	struct statfs *sp;
	1465	char fullpath, freepath;
	1466	int error;
	1467
	1468	KKASSERT(p);
	1469	if ((error = holdvnode(td, fd, &fp)) != 0)
	1470	return (error);
	1471
	1472	/*
	1473	* Try to use mount info from any overlays rather than the
	1474	* mount info for the underlying vnode, otherwise we will
	1475	* fail when operating on null-mounted paths inside a chroot.
	1476	*/
	1477	if ((mp = fp->f_nchandle.mount) == NULL)
	1478	mp = ((struct vnode *)fp->f_data)->v_mount;
	1479	if (mp == NULL) {
	1480	error = EBADF;
	1481	goto done;
	1482	}
	1483	if (fp->f_cred == NULL) {
	1484	error = EINVAL;
	1485	goto done;
	1486	}
	1487
	1488	/*
	1489	* Ignore refresh error, user should have visibility.
	1490	* This can happen if a NFS mount goes bad (e.g. server
	1491	* revokes perms or goes down).
	1492	*/
	1493	sp = &mp->mnt_stat;
	1494	error = VFS_STATFS(mp, sp, fp->f_cred);
	1495
	1496	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
	1497	goto done;
	1498	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	1499	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	1500	kfree(freepath, M_TEMP);
	1501
	1502	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	1503	bcopy(sp, buf, sizeof(*buf));
	1504
	1505	/* Only root should have access to the fsid's. */
	1506	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
	1507	buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	1508	error = 0;
	1509	done:
	1510	fdrop(fp);
	1511	return (error);
	1512	}
	1513
	1514	/*
	1515	* fstatfs_args(int fd, struct statfs *buf)
	1516	*
	1517	* Get filesystem statistics.
	1518	*/
	1519	int
	1520	sys_fstatfs(struct sysmsg sysmsg, const struct fstatfs_args uap)
	1521	{
	1522	struct statfs buf;
	1523	int error;
	1524
	1525	error = kern_fstatfs(uap->fd, &buf);
	1526
	1527	if (error == 0)
	1528	error = copyout(&buf, uap->buf, sizeof(*uap->buf));
	1529	return (error);
	1530	}
	1531
	1532	int
	1533	kern_statvfs(struct nlookupdata nd, struct statvfs buf)
	1534	{
	1535	struct mount *mp;
	1536	struct statvfs *sp;
	1537	int error;
	1538
	1539	if ((error = nlookup(nd)) != 0)
	1540	return (error);
	1541	mp = nd->nl_nch.mount;
	1542	sp = &mp->mnt_vstat;
	1543	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
	1544	return (error);
	1545
	1546	sp->f_flag = 0;
	1547	if (mp->mnt_flag & MNT_RDONLY)
	1548	sp->f_flag \|= ST_RDONLY;
	1549	if (mp->mnt_flag & MNT_NOSUID)
	1550	sp->f_flag \|= ST_NOSUID;
	1551	bcopy(sp, buf, sizeof(*buf));
	1552	return (0);
	1553	}
	1554
	1555	/*
	1556	* statfs_args(char path, struct statfs buf)
	1557	*
	1558	* Get filesystem statistics.
	1559	*/
	1560	int
	1561	sys_statvfs(struct sysmsg sysmsg, const struct statvfs_args uap)
	1562	{
	1563	struct nlookupdata nd;
	1564	struct statvfs buf;
	1565	int error;
	1566
	1567	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	1568	if (error == 0)
	1569	error = kern_statvfs(&nd, &buf);
	1570	nlookup_done(&nd);
	1571	if (error == 0)
	1572	error = copyout(&buf, uap->buf, sizeof(*uap->buf));
	1573	return (error);
	1574	}
	1575
	1576	int
	1577	kern_fstatvfs(int fd, struct statvfs *buf)
	1578	{
	1579	struct thread *td = curthread;
	1580	struct file *fp;
	1581	struct mount *mp;
	1582	struct statvfs *sp;
	1583	int error;
	1584
	1585	if ((error = holdvnode(td, fd, &fp)) != 0)
	1586	return (error);
	1587	if ((mp = fp->f_nchandle.mount) == NULL)
	1588	mp = ((struct vnode *)fp->f_data)->v_mount;
	1589	if (mp == NULL) {
	1590	error = EBADF;
	1591	goto done;
	1592	}
	1593	if (fp->f_cred == NULL) {
	1594	error = EINVAL;
	1595	goto done;
	1596	}
	1597	sp = &mp->mnt_vstat;
	1598	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
	1599	goto done;
	1600
	1601	sp->f_flag = 0;
	1602	if (mp->mnt_flag & MNT_RDONLY)
	1603	sp->f_flag \|= ST_RDONLY;
	1604	if (mp->mnt_flag & MNT_NOSUID)
	1605	sp->f_flag \|= ST_NOSUID;
	1606
	1607	bcopy(sp, buf, sizeof(*buf));
	1608	error = 0;
	1609	done:
	1610	fdrop(fp);
	1611	return (error);
	1612	}
	1613
	1614	/*
	1615	* fstatfs_args(int fd, struct statfs *buf)
	1616	*
	1617	* Get filesystem statistics.
	1618	*/
	1619	int
	1620	sys_fstatvfs(struct sysmsg sysmsg, const struct fstatvfs_args uap)
	1621	{
	1622	struct statvfs buf;
	1623	int error;
	1624
	1625	error = kern_fstatvfs(uap->fd, &buf);
	1626
	1627	if (error == 0)
	1628	error = copyout(&buf, uap->buf, sizeof(*uap->buf));
	1629	return (error);
	1630	}
	1631
	/*
	 * getfsstat_args(struct statfs *buf, long bufsize, int flags)
	 *
	 * Get statistics on all filesystems.
	 */

	/*
	 * State shared between sys_getfsstat() and getfsstat_callback() while
	 * the mount list is scanned.
	 */
	struct getfsstat_info {
		struct statfs *sfsp;	/* next user slot to copy out to, or NULL */
		long count;		/* mounts visited that are visible to td */
		long maxcount;		/* number of slots the user buffer holds */
		int error;		/* first error hit by the callback */
		int flags;		/* user-supplied MNT_* wait flags */
		struct thread *td;	/* calling thread */
	};

	static int getfsstat_callback(struct mount *, void *);
	1648
	1649	int
	1650	sys_getfsstat(struct sysmsg sysmsg, const struct getfsstat_args uap)
	1651	{
	1652	struct thread *td = curthread;
	1653	struct getfsstat_info info;
	1654
	1655	bzero(&info, sizeof(info));
	1656
	1657	info.maxcount = uap->bufsize / sizeof(struct statfs);
	1658	info.sfsp = uap->buf;
	1659	info.count = 0;
	1660	info.flags = uap->flags;
	1661	info.td = td;
	1662
	1663	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
	1664	if (info.sfsp && info.count > info.maxcount)
	1665	sysmsg->sysmsg_result = info.maxcount;
	1666	else
	1667	sysmsg->sysmsg_result = info.count;
	1668	return (info.error);
	1669	}
	1670
	1671	static int
	1672	getfsstat_callback(struct mount mp, void data)
	1673	{
	1674	struct getfsstat_info *info = data;
	1675	struct statfs *sp;
	1676	char *freepath;
	1677	char *fullpath;
	1678	int error;
	1679
	1680	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
	1681	return(0);
	1682
	1683	if (info->sfsp && info->count < info->maxcount) {
	1684	sp = &mp->mnt_stat;
	1685
	1686	/*
	1687	* If MNT_NOWAIT or MNT_LAZY is specified, do not
	1688	* refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
	1689	* overrides MNT_WAIT.
	1690	*
	1691	* Ignore refresh error, user should have visibility.
	1692	* This can happen if a NFS mount goes bad (e.g. server
	1693	* revokes perms or goes down).
	1694	*/
	1695	if (((info->flags & (MNT_LAZY\|MNT_NOWAIT)) == 0 \|\|
	1696	(info->flags & MNT_WAIT)) &&
	1697	(error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
	1698	/* ignore error */
	1699	}
	1700	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	1701
	1702	error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
	1703	if (error) {
	1704	info->error = error;
	1705	return(-1);
	1706	}
	1707	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	1708	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	1709	kfree(freepath, M_TEMP);
	1710
	1711	error = copyout(sp, info->sfsp, sizeof(*sp));
	1712	if (error) {
	1713	info->error = error;
	1714	return (-1);
	1715	}
	1716	++info->sfsp;
	1717	}
	1718	info->count++;
	1719	return(0);
	1720	}
	1721
	/*
	 * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
	 *		   long vbufsize, int flags)
	 *
	 * Get statistics on all filesystems.
	 */

	/*
	 * State shared between sys_getvfsstat() and getvfsstat_callback() while
	 * the mount list is scanned.
	 */
	struct getvfsstat_info {
		struct statfs *sfsp;	/* next user statfs slot */
		struct statvfs *vsfsp;	/* next user statvfs slot, or NULL */
		long count;		/* mounts visited that are visible to td */
		long maxcount;		/* number of statvfs slots in the user buffer */
		int error;		/* first error hit by the callback */
		int flags;		/* user-supplied MNT_* wait flags */
		struct thread *td;	/* calling thread */
	};

	static int getvfsstat_callback(struct mount *, void *);
	1740
	1741	int
	1742	sys_getvfsstat(struct sysmsg sysmsg, const struct getvfsstat_args uap)
	1743	{
	1744	struct thread *td = curthread;
	1745	struct getvfsstat_info info;
	1746
	1747	bzero(&info, sizeof(info));
	1748
	1749	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
	1750	info.sfsp = uap->buf;
	1751	info.vsfsp = uap->vbuf;
	1752	info.count = 0;
	1753	info.flags = uap->flags;
	1754	info.td = td;
	1755
	1756	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
	1757	if (info.vsfsp && info.count > info.maxcount)
	1758	sysmsg->sysmsg_result = info.maxcount;
	1759	else
	1760	sysmsg->sysmsg_result = info.count;
	1761	return (info.error);
	1762	}
	1763
	1764	static int
	1765	getvfsstat_callback(struct mount mp, void data)
	1766	{
	1767	struct getvfsstat_info *info = data;
	1768	struct statfs *sp;
	1769	struct statvfs *vsp;
	1770	char *freepath;
	1771	char *fullpath;
	1772	int error;
	1773
	1774	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
	1775	return(0);
	1776
	1777	if (info->vsfsp && info->count < info->maxcount) {
	1778	sp = &mp->mnt_stat;
	1779	vsp = &mp->mnt_vstat;
	1780
	1781	/*
	1782	* If MNT_NOWAIT or MNT_LAZY is specified, do not
	1783	* refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
	1784	* overrides MNT_WAIT.
	1785	*
	1786	* Ignore refresh error, user should have visibility.
	1787	* This can happen if a NFS mount goes bad (e.g. server
	1788	* revokes perms or goes down).
	1789	*/
	1790	if (((info->flags & (MNT_LAZY\|MNT_NOWAIT)) == 0 \|\|
	1791	(info->flags & MNT_WAIT)) &&
	1792	(error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
	1793	/* ignore error */
	1794	}
	1795	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	1796
	1797	if (((info->flags & (MNT_LAZY\|MNT_NOWAIT)) == 0 \|\|
	1798	(info->flags & MNT_WAIT)) &&
	1799	(error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
	1800	/* ignore error */
	1801	}
	1802	vsp->f_flag = 0;
	1803	if (mp->mnt_flag & MNT_RDONLY)
	1804	vsp->f_flag \|= ST_RDONLY;
	1805	if (mp->mnt_flag & MNT_NOSUID)
	1806	vsp->f_flag \|= ST_NOSUID;
	1807
	1808	error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
	1809	if (error) {
	1810	info->error = error;
	1811	return(-1);
	1812	}
	1813	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	1814	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	1815	kfree(freepath, M_TEMP);
	1816
	1817	error = copyout(sp, info->sfsp, sizeof(*sp));
	1818	if (error == 0)
	1819	error = copyout(vsp, info->vsfsp, sizeof(*vsp));
	1820	if (error) {
	1821	info->error = error;
	1822	return (-1);
	1823	}
	1824	++info->sfsp;
	1825	++info->vsfsp;
	1826	}
	1827	info->count++;
	1828	return(0);
	1829	}
	1830
	1831
	1832	/*
	1833	* fchdir_args(int fd)
	1834	*
	1835	* Change current working directory to a given file descriptor.
	1836	*/
	1837	int
	1838	sys_fchdir(struct sysmsg sysmsg, const struct fchdir_args uap)
	1839	{
	1840	struct thread *td = curthread;
	1841	struct proc *p = td->td_proc;
	1842	struct filedesc *fdp = p->p_fd;
	1843	struct vnode vp, ovp;
	1844	struct mount *mp;
	1845	struct file *fp;
	1846	struct nchandle nch, onch, tnch;
	1847	int error;
	1848
	1849	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
	1850	return (error);
	1851	lwkt_gettoken(&p->p_token);
	1852	vp = (struct vnode *)fp->f_data;
	1853	vref(vp);
	1854	vn_lock(vp, LK_SHARED \| LK_RETRY);
	1855	if (fp->f_nchandle.ncp == NULL)
	1856	error = ENOTDIR;
	1857	else
	1858	error = checkvp_chdir(vp, td);
	1859	if (error) {
	1860	vput(vp);
	1861	goto done;
	1862	}
	1863	cache_copy(&fp->f_nchandle, &nch);
	1864
	1865	/*
	1866	* If the ncp has become a mount point, traverse through
	1867	* the mount point.
	1868	*/
	1869
	1870	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	1871	(mp = cache_findmount(&nch)) != NULL
	1872	) {
	1873	error = nlookup_mp(mp, &tnch);
	1874	if (error == 0) {
	1875	cache_unlock(&tnch); /* leave ref intact */
	1876	vput(vp);
	1877	vp = tnch.ncp->nc_vp;
	1878	error = vget(vp, LK_SHARED);
	1879	KKASSERT(error == 0);
	1880	cache_drop(&nch);
	1881	nch = tnch;
	1882	}
	1883	cache_dropmount(mp);
	1884	}
	1885	if (error == 0) {
	1886	spin_lock(&fdp->fd_spin);
	1887	ovp = fdp->fd_cdir;
	1888	onch = fdp->fd_ncdir;
	1889	fdp->fd_cdir = vp;
	1890	fdp->fd_ncdir = nch;
	1891	spin_unlock(&fdp->fd_spin);
	1892	vn_unlock(vp); /* leave ref intact */
	1893	cache_drop(&onch);
	1894	vrele(ovp);
	1895	} else {
	1896	cache_drop(&nch);
	1897	vput(vp);
	1898	}
	1899	fdrop(fp);
	1900	done:
	1901	lwkt_reltoken(&p->p_token);
	1902	return (error);
	1903	}
	1904
	1905	int
	1906	kern_chdir(struct nlookupdata *nd)
	1907	{
	1908	struct thread *td = curthread;
	1909	struct proc *p = td->td_proc;
	1910	struct filedesc *fdp = p->p_fd;
	1911	struct vnode vp, ovp;
	1912	struct nchandle onch;
	1913	int error;
	1914
	1915	nd->nl_flags \|= NLC_SHAREDLOCK;
	1916	if ((error = nlookup(nd)) != 0)
	1917	return (error);
	1918	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
	1919	return (ENOENT);
	1920	if ((error = vget(vp, LK_SHARED)) != 0)
	1921	return (error);
	1922
	1923	lwkt_gettoken(&p->p_token);
	1924	error = checkvp_chdir(vp, td);
	1925	vn_unlock(vp);
	1926	if (error == 0) {
	1927	spin_lock(&fdp->fd_spin);
	1928	ovp = fdp->fd_cdir;
	1929	onch = fdp->fd_ncdir;
	1930	fdp->fd_ncdir = nd->nl_nch;
	1931	fdp->fd_cdir = vp;
	1932	spin_unlock(&fdp->fd_spin);
	1933	cache_unlock(&nd->nl_nch); /* leave reference intact */
	1934	cache_drop(&onch);
	1935	vrele(ovp);
	1936	cache_zero(&nd->nl_nch);
	1937	} else {
	1938	vrele(vp);
	1939	}
	1940	lwkt_reltoken(&p->p_token);
	1941	return (error);
	1942	}
	1943
	1944	/*
	1945	* chdir_args(char *path)
	1946	*
	1947	* Change current working directory (``.'').
	1948	*/
	1949	int
	1950	sys_chdir(struct sysmsg sysmsg, const struct chdir_args uap)
	1951	{
	1952	struct nlookupdata nd;
	1953	int error;
	1954
	1955	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	1956	if (error == 0)
	1957	error = kern_chdir(&nd);
	1958	nlookup_done(&nd);
	1959	return (error);
	1960	}
	1961
	1962	/*
	1963	* Helper function for raised chroot(2) security function: Refuse if
	1964	* any filedescriptors are open directories.
	1965	*/
	1966	static int
	1967	chroot_refuse_vdir_fds(thread_t td, struct filedesc *fdp)
	1968	{
	1969	struct vnode *vp;
	1970	struct file *fp;
	1971	int error;
	1972	int fd;
	1973
	1974	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
	1975	if ((error = holdvnode(td, fd, &fp)) != 0)
	1976	continue;
	1977	vp = (struct vnode *)fp->f_data;
	1978	if (vp->v_type != VDIR) {
	1979	fdrop(fp);
	1980	continue;
	1981	}
	1982	fdrop(fp);
	1983	return(EPERM);
	1984	}
	1985	return (0);
	1986	}
	1987
	1988	/*
	1989	* This sysctl determines if we will allow a process to chroot(2) if it
	1990	* has a directory open:
	1991	* 0: disallowed for all processes.
	1992	* 1: allowed for processes that were not already chroot(2)'ed.
	1993	* 2: allowed for all processes.
	1994	*/
	1995
	1996	static int chroot_allow_open_directories = 1;
	1997
	1998	SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
	1999	&chroot_allow_open_directories, 0, "");
	2000
	2001	/*
	2002	* chroot to the specified namecache entry. We obtain the vp from the
	2003	* namecache data. The passed ncp must be locked and referenced and will
	2004	* remain locked and referenced on return.
	2005	*/
	2006	int
	2007	kern_chroot(struct nchandle *nch)
	2008	{
	2009	struct thread *td = curthread;
	2010	struct proc *p = td->td_proc;
	2011	struct filedesc *fdp = p->p_fd;
	2012	struct vnode *vp;
	2013	int error;
	2014
	2015	/*
	2016	* Only privileged user can chroot
	2017	*/
	2018	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
	2019	if (error)
	2020	return (error);
	2021
	2022	/*
	2023	* Disallow open directory descriptors (fchdir() breakouts).
	2024	*/
	2025	if (chroot_allow_open_directories == 0 \|\|
	2026	(chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
	2027	if ((error = chroot_refuse_vdir_fds(td, fdp)) != 0)
	2028	return (error);
	2029	}
	2030	if ((vp = nch->ncp->nc_vp) == NULL)
	2031	return (ENOENT);
	2032
	2033	if ((error = vget(vp, LK_SHARED)) != 0)
	2034	return (error);
	2035
	2036	/*
	2037	* Check the validity of vp as a directory to change to and
	2038	* associate it with rdir/jdir.
	2039	*/
	2040	error = checkvp_chdir(vp, td);
	2041	vn_unlock(vp); /* leave reference intact */
	2042	if (error == 0) {
	2043	lwkt_gettoken(&p->p_token);
	2044	vrele(fdp->fd_rdir);
	2045	fdp->fd_rdir = vp; /* reference inherited by fd_rdir */
	2046	cache_drop(&fdp->fd_nrdir);
	2047	cache_copy(nch, &fdp->fd_nrdir);
	2048	if (fdp->fd_jdir == NULL) {
	2049	fdp->fd_jdir = vp;
	2050	vref(fdp->fd_jdir);
	2051	cache_copy(nch, &fdp->fd_njdir);
	2052	}
	2053	if ((p->p_flags & P_DIDCHROOT) == 0) {
	2054	p->p_flags \|= P_DIDCHROOT;
	2055	if (p->p_depth <= 65535 - 32)
	2056	p->p_depth += 32;
	2057	}
	2058	lwkt_reltoken(&p->p_token);
	2059	} else {
	2060	vrele(vp);
	2061	}
	2062	return (error);
	2063	}
	2064
	2065	/*
	2066	* chroot_args(char *path)
	2067	*
	2068	* Change notion of root (``/'') directory.
	2069	*/
	2070	int
	2071	sys_chroot(struct sysmsg sysmsg, const struct chroot_args uap)
	2072	{
	2073	struct thread *td __debugvar = curthread;
	2074	struct nlookupdata nd;
	2075	int error;
	2076
	2077	KKASSERT(td->td_proc);
	2078	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	2079	if (error == 0) {
	2080	nd.nl_flags \|= NLC_EXEC;
	2081	error = nlookup(&nd);
	2082	if (error == 0)
	2083	error = kern_chroot(&nd.nl_nch);
	2084	}
	2085	nlookup_done(&nd);
	2086	return(error);
	2087	}
	2088
	2089	int
	2090	sys_chroot_kernel(struct sysmsg sysmsg, const struct chroot_kernel_args uap)
	2091	{
	2092	struct thread *td = curthread;
	2093	struct nlookupdata nd;
	2094	struct nchandle *nch;
	2095	struct vnode *vp;
	2096	int error;
	2097
	2098	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	2099	if (error)
	2100	goto error_nond;
	2101
	2102	error = nlookup(&nd);
	2103	if (error)
	2104	goto error_out;
	2105
	2106	nch = &nd.nl_nch;
	2107
	2108	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
	2109	if (error)
	2110	goto error_out;
	2111
	2112	if ((vp = nch->ncp->nc_vp) == NULL) {
	2113	error = ENOENT;
	2114	goto error_out;
	2115	}
	2116
	2117	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
	2118	goto error_out;
	2119
	2120	vfs_cache_setroot(vp, cache_hold(nch));
	2121
	2122	error_out:
	2123	nlookup_done(&nd);
	2124	error_nond:
	2125	return(error);
	2126	}
	2127
	2128	/*
	2129	* Common routine for chroot and chdir. Given a locked, referenced vnode,
	2130	* determine whether it is legal to chdir to the vnode. The vnode's state
	2131	* is not changed by this call.
	2132	*/
	2133	static int
	2134	checkvp_chdir(struct vnode vp, struct thread td)
	2135	{
	2136	int error;
	2137
	2138	if (vp->v_type != VDIR)
	2139	error = ENOTDIR;
	2140	else
	2141	error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
	2142	return (error);
	2143	}
	2144
	2145	int
	2146	kern_open(struct nlookupdata nd, int oflags, int mode, int res)
	2147	{
	2148	struct thread *td = curthread;
	2149	struct proc *p = td->td_proc;
	2150	struct lwp *lp = td->td_lwp;
	2151	struct filedesc *fdp = p->p_fd;
	2152	int cmode, flags;
	2153	struct file *nfp;
	2154	struct file *fp;
	2155	int type, indx, error = 0;
	2156	struct flock lf;
	2157
	2158	if ((oflags & O_ACCMODE) == O_ACCMODE)
	2159	return (EINVAL);
	2160	flags = FFLAGS(oflags);
	2161	error = falloc(lp, &nfp, NULL);
	2162	if (error)
	2163	return (error);
	2164	fp = nfp;
	2165	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	2166
	2167	/*
	2168	* Call vn_open() to do the lookup and assign the vnode to the
	2169	* file pointer. vn_open() does not change the ref count on fp
	2170	* and the vnode, on success, will be inherited by the file pointer
	2171	* and unlocked.
	2172	*
	2173	* Request a shared lock on the vnode if possible.
	2174	*
	2175	* When NLC_SHAREDLOCK is set we may still need an exclusive vnode
	2176	* lock for O_RDWR opens on executables in order to avoid a VTEXT
	2177	* detection race. The NLC_EXCLLOCK_IFEXEC handles this case.
	2178	*
	2179	* NOTE: We need a flag to separate terminal vnode locking from
	2180	* parent locking. O_CREAT needs parent locking, but O_TRUNC
	2181	* and O_RDWR only need to lock the terminal vnode exclusively.
	2182	*/
	2183	nd->nl_flags \|= NLC_LOCKVP;
	2184	if ((flags & (O_CREAT\|O_TRUNC)) == 0) {
	2185	nd->nl_flags \|= NLC_SHAREDLOCK;
	2186	if (flags & O_RDWR)
	2187	nd->nl_flags \|= NLC_EXCLLOCK_IFEXEC;
	2188	}
	2189
	2190	/*
	2191	* Issue the vn_open, passing in the referenced fp. the vn_open()
	2192	* is allowed to replace fp by fdrop()ing it and returning its own
	2193	* referenced fp.
	2194	*/
	2195	nfp = fp;
	2196	error = vn_open(nd, &nfp, flags, cmode);
	2197	fp = nfp;
	2198	nlookup_done(nd);
	2199
	2200	/*
	2201	* Deal with any error condition
	2202	*/
	2203	if (error) {
	2204	fdrop(fp); /* our ref */
	2205	if (error == ERESTART)
	2206	error = EINTR;
	2207	return (error);
	2208	}
	2209
	2210	/*
	2211	* Reserve a file descriptor.
	2212	*/
	2213	if ((error = fdalloc(p, 0, &indx)) != 0) {
	2214	fdrop(fp);
	2215	return (error);
	2216	}
	2217
	2218	/*
	2219	* Handle advisory lock flags. This is only supported with vnodes.
	2220	* For things like /dev/fd/N we might not actually get a vnode.
	2221	*/
	2222	if ((flags & (O_EXLOCK \| O_SHLOCK)) && fp->f_type == DTYPE_VNODE) {
	2223	struct vnode *vp;
	2224
	2225	vp = (struct vnode *)fp->f_data;
	2226	vref(vp);
	2227
	2228	lf.l_whence = SEEK_SET;
	2229	lf.l_start = 0;
	2230	lf.l_len = 0;
	2231	if (flags & O_EXLOCK)
	2232	lf.l_type = F_WRLCK;
	2233	else
	2234	lf.l_type = F_RDLCK;
	2235	if (flags & FNONBLOCK)
	2236	type = 0;
	2237	else
	2238	type = F_WAIT;
	2239
	2240	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
	2241	if (error) {
	2242	/*
	2243	* lock request failed. Clean up the reserved
	2244	* descriptor.
	2245	*/
	2246	vrele(vp);
	2247	fsetfd(fdp, NULL, indx);
	2248	fdrop(fp);
	2249	return (error);
	2250	}
	2251	atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
	2252	vrele(vp);
	2253	}
	2254
	2255	/*
	2256	* release our private reference, leaving the one associated with the
	2257	* descriptor table intact.
	2258	*/
	2259	if (oflags & O_CLOEXEC)
	2260	fdp->fd_files[indx].fileflags \|= UF_EXCLOSE;
	2261	fsetfd(fdp, fp, indx);
	2262	fdrop(fp);
	2263	*res = indx;
	2264
	2265	return (error);
	2266	}
	2267
	2268	/*
	2269	* open_args(char *path, int flags, int mode)
	2270	*
	2271	* Check permissions, allocate an open file structure,
	2272	* and call the device open routine if any.
	2273	*/
	2274	int
	2275	sys_open(struct sysmsg sysmsg, const struct open_args uap)
	2276	{
	2277	struct nlookupdata nd;
	2278	int error;
	2279
	2280	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	2281	if (error == 0) {
	2282	error = kern_open(&nd, uap->flags,
	2283	uap->mode, &sysmsg->sysmsg_result);
	2284	}
	2285	nlookup_done(&nd);
	2286	return (error);
	2287	}
	2288
	2289	/*
	2290	* openat_args(int fd, char *path, int flags, int mode)
	2291	*/
	2292	int
	2293	sys_openat(struct sysmsg sysmsg, const struct openat_args uap)
	2294	{
	2295	struct nlookupdata nd;
	2296	int error;
	2297	struct file *fp;
	2298
	2299	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	2300	if (error == 0) {
	2301	error = kern_open(&nd, uap->flags, uap->mode,
	2302	&sysmsg->sysmsg_result);
	2303	}
	2304	nlookup_done_at(&nd, fp);
	2305	return (error);
	2306	}
	2307
	2308	int
	2309	kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
	2310	{
	2311	struct thread *td = curthread;
	2312	struct proc *p = td->td_proc;
	2313	struct vnode *vp;
	2314	struct vattr vattr;
	2315	int error;
	2316	int whiteout = 0;
	2317
	2318	KKASSERT(p);
	2319
	2320	VATTR_NULL(&vattr);
	2321	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	2322	vattr.va_rmajor = rmajor;
	2323	vattr.va_rminor = rminor;
	2324
	2325	switch (mode & S_IFMT) {
	2326	case S_IFMT: /* used by badsect to flag bad sectors */
	2327	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_BAD);
	2328	vattr.va_type = VBAD;
	2329	break;
	2330	case S_IFCHR:
	2331	error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
	2332	vattr.va_type = VCHR;
	2333	break;
	2334	case S_IFBLK:
	2335	error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
	2336	vattr.va_type = VBLK;
	2337	break;
	2338	case S_IFWHT:
	2339	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_WHT);
	2340	whiteout = 1;
	2341	break;
	2342	case S_IFDIR: /* special directories support for HAMMER */
	2343	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_DIR);
	2344	vattr.va_type = VDIR;
	2345	break;
	2346	case S_IFIFO:
	2347	return (kern_mkfifo(nd, mode));
	2348	break;
	2349	default:
	2350	error = EINVAL;
	2351	break;
	2352	}
	2353
	2354	if (error)
	2355	return (error);
	2356
	2357	bwillinode(1);
	2358	nd->nl_flags \|= NLC_CREATE \| NLC_REFDVP;
	2359	if ((error = nlookup(nd)) != 0)
	2360	return (error);
	2361	if (nd->nl_nch.ncp->nc_vp)
	2362	return (EEXIST);
	2363	if (nd->nl_dvp == NULL)
	2364	return (EINVAL);
	2365	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	2366	return (error);
	2367
	2368	if (whiteout) {
	2369	error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
	2370	nd->nl_cred, NAMEI_CREATE);
	2371	} else {
	2372	vp = NULL;
	2373	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
	2374	&vp, nd->nl_cred, &vattr);
	2375	if (error == 0)
	2376	vput(vp);
	2377	}
	2378	return (error);
	2379	}
	2380
	2381	/*
	2382	* mknod_args(char *path, int mode, int dev)
	2383	*
	2384	* Create a special file.
	2385	*/
	2386	int
	2387	sys_mknod(struct sysmsg sysmsg, const struct mknod_args uap)
	2388	{
	2389	struct nlookupdata nd;
	2390	int error;
	2391
	2392	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	2393	if (error == 0) {
	2394	error = kern_mknod(&nd, uap->mode,
	2395	umajor(uap->dev), uminor(uap->dev));
	2396	}
	2397	nlookup_done(&nd);
	2398	return (error);
	2399	}
	2400
	2401	/*
	2402	* mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
	2403	*
	2404	* Create a special file. The path is relative to the directory associated
	2405	* with fd.
	2406	*/
	2407	int
	2408	sys_mknodat(struct sysmsg sysmsg, const struct mknodat_args uap)
	2409	{
	2410	struct nlookupdata nd;
	2411	struct file *fp;
	2412	int error;
	2413
	2414	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	2415	if (error == 0) {
	2416	error = kern_mknod(&nd, uap->mode,
	2417	umajor(uap->dev), uminor(uap->dev));
	2418	}
	2419	nlookup_done_at(&nd, fp);
	2420	return (error);
	2421	}
	2422
	2423	int
	2424	kern_mkfifo(struct nlookupdata *nd, int mode)
	2425	{
	2426	struct thread *td = curthread;
	2427	struct proc *p = td->td_proc;
	2428	struct vattr vattr;
	2429	struct vnode *vp;
	2430	int error;
	2431
	2432	bwillinode(1);
	2433
	2434	nd->nl_flags \|= NLC_CREATE \| NLC_REFDVP;
	2435	if ((error = nlookup(nd)) != 0)
	2436	return (error);
	2437	if (nd->nl_nch.ncp->nc_vp)
	2438	return (EEXIST);
	2439	if (nd->nl_dvp == NULL)
	2440	return (EINVAL);
	2441	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	2442	return (error);
	2443
	2444	VATTR_NULL(&vattr);
	2445	vattr.va_type = VFIFO;
	2446	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	2447	vp = NULL;
	2448	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
	2449	if (error == 0)
	2450	vput(vp);
	2451	return (error);
	2452	}
	2453
	2454	/*
	2455	* mkfifo_args(char *path, int mode)
	2456	*
	2457	* Create a named pipe.
	2458	*/
	2459	int
	2460	sys_mkfifo(struct sysmsg sysmsg, const struct mkfifo_args uap)
	2461	{
	2462	struct nlookupdata nd;
	2463	int error;
	2464
	2465	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	2466	if (error == 0)
	2467	error = kern_mkfifo(&nd, uap->mode);
	2468	nlookup_done(&nd);
	2469	return (error);
	2470	}
	2471
	2472	/*
	2473	* mkfifoat_args(int fd, char *path, mode_t mode)
	2474	*
	2475	* Create a named pipe. The path is relative to the directory associated
	2476	* with fd.
	2477	*/
	2478	int
	2479	sys_mkfifoat(struct sysmsg sysmsg, const struct mkfifoat_args uap)
	2480	{
	2481	struct nlookupdata nd;
	2482	struct file *fp;
	2483	int error;
	2484
	2485	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	2486	if (error == 0)
	2487	error = kern_mkfifo(&nd, uap->mode);
	2488	nlookup_done_at(&nd, fp);
	2489	return (error);
	2490	}
	2491
	2492	static int hardlink_check_uid = 0;
	2493	SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
	2494	&hardlink_check_uid, 0,
	2495	"Unprivileged processes cannot create hard links to files owned by other "
	2496	"users");
	2497	static int hardlink_check_gid = 0;
	2498	SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
	2499	&hardlink_check_gid, 0,
	2500	"Unprivileged processes cannot create hard links to files owned by other "
	2501	"groups");
	2502
	2503	static int
	2504	can_hardlink(struct vnode vp, struct thread td, struct ucred *cred)
	2505	{
	2506	struct vattr va;
	2507	int error;
	2508
	2509	/*
	2510	* Shortcut if disabled
	2511	*/
	2512	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
	2513	return (0);
	2514
	2515	/*
	2516	* Privileged user can always hardlink
	2517	*/
	2518	if (caps_priv_check(cred, SYSCAP_NOVFS_LINK) == 0)
	2519	return (0);
	2520
	2521	/*
	2522	* Otherwise only if the originating file is owned by the
	2523	* same user or group. Note that any group is allowed if
	2524	* the file is owned by the caller.
	2525	*/
	2526	error = VOP_GETATTR(vp, &va);
	2527	if (error != 0)
	2528	return (error);
	2529
	2530	if (hardlink_check_uid) {
	2531	if (cred->cr_uid != va.va_uid)
	2532	return (EPERM);
	2533	}
	2534
	2535	if (hardlink_check_gid) {
	2536	if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
	2537	return (EPERM);
	2538	}
	2539
	2540	return (0);
	2541	}
	2542
	2543	int
	2544	kern_link(struct nlookupdata nd, struct nlookupdata linknd)
	2545	{
	2546	struct thread *td = curthread;
	2547	struct vnode *vp;
	2548	int error;
	2549
	2550	/*
	2551	* Lookup the source and obtained a locked vnode.
	2552	*
	2553	* You may only hardlink a file which you have write permission
	2554	* on or which you own.
	2555	*
	2556	* XXX relookup on vget failure / race ?
	2557	*/
	2558	bwillinode(1);
	2559	nd->nl_flags \|= NLC_WRITE \| NLC_OWN \| NLC_HLINK;
	2560	if ((error = nlookup(nd)) != 0)
	2561	return (error);
	2562	vp = nd->nl_nch.ncp->nc_vp;
	2563	KKASSERT(vp != NULL);
	2564	if (vp->v_type == VDIR)
	2565	return (EPERM); /* POSIX */
	2566	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	2567	return (error);
	2568	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
	2569	return (error);
	2570
	2571	/*
	2572	* Unlock the source so we can lookup the target without deadlocking
	2573	* (XXX vp is locked already, possible other deadlock?). The target
	2574	* must not exist.
	2575	*/
	2576	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	2577	nd->nl_flags &= ~NLC_NCPISLOCKED;
	2578	cache_unlock(&nd->nl_nch);
	2579	vn_unlock(vp);
	2580
	2581	linknd->nl_flags \|= NLC_CREATE \| NLC_REFDVP;
	2582	if ((error = nlookup(linknd)) != 0) {
	2583	vrele(vp);
	2584	return (error);
	2585	}
	2586	if (linknd->nl_nch.ncp->nc_vp) {
	2587	vrele(vp);
	2588	return (EEXIST);
	2589	}
	2590	if (linknd->nl_dvp == NULL) {
	2591	vrele(vp);
	2592	return (EINVAL);
	2593	}
	2594	VFS_MODIFYING(vp->v_mount);
	2595	error = vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY \| LK_FAILRECLAIM);
	2596	if (error) {
	2597	vrele(vp);
	2598	return (error);
	2599	}
	2600
	2601	/*
	2602	* Finally run the new API VOP.
	2603	*/
	2604	error = can_hardlink(vp, td, td->td_ucred);
	2605	if (error == 0) {
	2606	error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
	2607	vp, linknd->nl_cred);
	2608	}
	2609	vput(vp);
	2610	return (error);
	2611	}
	2612
	2613	/*
	2614	* link_args(char path, char link)
	2615	*
	2616	* Make a hard file link.
	2617	*/
	2618	int
	2619	sys_link(struct sysmsg sysmsg, const struct link_args uap)
	2620	{
	2621	struct nlookupdata nd, linknd;
	2622	int error;
	2623
	2624	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	2625	if (error == 0) {
	2626	error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
	2627	if (error == 0)
	2628	error = kern_link(&nd, &linknd);
	2629	nlookup_done(&linknd);
	2630	}
	2631	nlookup_done(&nd);
	2632	return (error);
	2633	}
	2634
	2635	/*
	2636	* linkat_args(int fd1, char path1, int fd2, char path2, int flags)
	2637	*
	2638	* Make a hard file link. The path1 argument is relative to the directory
	2639	* associated with fd1, and similarly the path2 argument is relative to
	2640	* the directory associated with fd2.
	2641	*/
	2642	int
	2643	sys_linkat(struct sysmsg sysmsg, const struct linkat_args uap)
	2644	{
	2645	struct nlookupdata nd, linknd;
	2646	struct file fp1, fp2;
	2647	int error;
	2648
	2649	error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
	2650	(uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
	2651	if (error == 0) {
	2652	error = nlookup_init_at(&linknd, &fp2, uap->fd2,
	2653	uap->path2, UIO_USERSPACE, 0);
	2654	if (error == 0)
	2655	error = kern_link(&nd, &linknd);
	2656	nlookup_done_at(&linknd, fp2);
	2657	}
	2658	nlookup_done_at(&nd, fp1);
	2659	return (error);
	2660	}
	2661
	2662	int
	2663	kern_symlink(struct nlookupdata nd, char path, int mode)
	2664	{
	2665	struct vattr vattr;
	2666	struct vnode *vp;
	2667	struct vnode *dvp;
	2668	int error;
	2669
	2670	bwillinode(1);
	2671	nd->nl_flags \|= NLC_CREATE \| NLC_REFDVP;
	2672	if ((error = nlookup(nd)) != 0)
	2673	return (error);
	2674	if (nd->nl_nch.ncp->nc_vp)
	2675	return (EEXIST);
	2676	if (nd->nl_dvp == NULL)
	2677	return (EINVAL);
	2678	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	2679	return (error);
	2680	dvp = nd->nl_dvp;
	2681	VATTR_NULL(&vattr);
	2682	vattr.va_mode = mode;
	2683	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
	2684	if (error == 0)
	2685	vput(vp);
	2686	return (error);
	2687	}
	2688
	2689	/*
	2690	* symlink(char path, char link)
	2691	*
	2692	* Make a symbolic link.
	2693	*/
	2694	int
	2695	sys_symlink(struct sysmsg sysmsg, const struct symlink_args uap)
	2696	{
	2697	struct thread *td = curthread;
	2698	struct nlookupdata nd;
	2699	char *path;
	2700	int error;
	2701	int mode;
	2702
	2703	path = objcache_get(namei_oc, M_WAITOK);
	2704	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	2705	if (error == 0) {
	2706	error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
	2707	if (error == 0) {
	2708	mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
	2709	error = kern_symlink(&nd, path, mode);
	2710	}
	2711	nlookup_done(&nd);
	2712	}
	2713	objcache_put(namei_oc, path);
	2714	return (error);
	2715	}
	2716
	2717	/*
	2718	* symlinkat_args(char path1, int fd, char path2)
	2719	*
	2720	* Make a symbolic link. The path2 argument is relative to the directory
	2721	* associated with fd.
	2722	*/
	2723	int
	2724	sys_symlinkat(struct sysmsg sysmsg, const struct symlinkat_args uap)
	2725	{
	2726	struct thread *td = curthread;
	2727	struct nlookupdata nd;
	2728	struct file *fp;
	2729	char *path1;
	2730	int error;
	2731	int mode;
	2732
	2733	path1 = objcache_get(namei_oc, M_WAITOK);
	2734	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
	2735	if (error == 0) {
	2736	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
	2737	UIO_USERSPACE, 0);
	2738	if (error == 0) {
	2739	mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
	2740	error = kern_symlink(&nd, path1, mode);
	2741	}
	2742	nlookup_done_at(&nd, fp);
	2743	}
	2744	objcache_put(namei_oc, path1);
	2745	return (error);
	2746	}
	2747
	2748	/*
	2749	* undelete_args(char *path)
	2750	*
	2751	* Delete a whiteout from the filesystem.
	2752	*/
	2753	int
	2754	sys_undelete(struct sysmsg sysmsg, const struct undelete_args uap)
	2755	{
	2756	struct nlookupdata nd;
	2757	int error;
	2758
	2759	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	2760	bwillinode(1);
	2761	nd.nl_flags \|= NLC_DELETE \| NLC_REFDVP;
	2762	if (error == 0)
	2763	error = nlookup(&nd);
	2764	if (error == 0 && nd.nl_dvp == NULL)
	2765	error = EINVAL;
	2766	if (error == 0)
	2767	error = ncp_writechk(&nd.nl_nch);
	2768	if (error == 0) {
	2769	error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
	2770	NAMEI_DELETE);
	2771	}
	2772	nlookup_done(&nd);
	2773	return (error);
	2774	}
	2775
	2776	int
	2777	kern_unlink(struct nlookupdata *nd)
	2778	{
	2779	int error;
	2780
	2781	bwillinode(1);
	2782	nd->nl_flags \|= NLC_DELETE \| NLC_REFDVP;
	2783	if ((error = nlookup(nd)) != 0)
	2784	return (error);
	2785	if (nd->nl_dvp == NULL)
	2786	return EINVAL;
	2787	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	2788	return (error);
	2789	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
	2790	return (error);
	2791	}
	2792
	2793	/*
	2794	* unlink_args(char *path)
	2795	*
	2796	* Delete a name from the filesystem.
	2797	*/
	2798	int
	2799	sys_unlink(struct sysmsg sysmsg, const struct unlink_args uap)
	2800	{
	2801	struct nlookupdata nd;
	2802	int error;
	2803
	2804	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	2805	if (error == 0)
	2806	error = kern_unlink(&nd);
	2807	nlookup_done(&nd);
	2808	return (error);
	2809	}
	2810
	2811
	2812	/*
	2813	* unlinkat_args(int fd, char *path, int flags)
	2814	*
	2815	* Delete the file or directory entry pointed to by fd/path.
	2816	*/
	2817	int
	2818	sys_unlinkat(struct sysmsg sysmsg, const struct unlinkat_args uap)
	2819	{
	2820	struct nlookupdata nd;
	2821	struct file *fp;
	2822	int error;
	2823
	2824	if (uap->flags & ~AT_REMOVEDIR)
	2825	return (EINVAL);
	2826
	2827	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	2828	if (error == 0) {
	2829	if (uap->flags & AT_REMOVEDIR)
	2830	error = kern_rmdir(&nd);
	2831	else
	2832	error = kern_unlink(&nd);
	2833	}
	2834	nlookup_done_at(&nd, fp);
	2835	return (error);
	2836	}
	2837
	2838	int
	2839	kern_lseek(int fd, off_t offset, int whence, off_t *res)
	2840	{
	2841	struct thread *td = curthread;
	2842	struct file *fp;
	2843	struct vnode *vp;
	2844	struct vattr_lite lva;
	2845	off_t new_offset;
	2846	int error;
	2847
	2848	fp = holdfp(td, fd, -1);
	2849	if (fp == NULL)
	2850	return (EBADF);
	2851	if (fp->f_type != DTYPE_VNODE) {
	2852	error = ESPIPE;
	2853	goto done;
	2854	}
	2855	vp = (struct vnode *)fp->f_data;
	2856
	2857	switch (whence) {
	2858	case L_INCR:
	2859	spin_lock(&fp->f_spin);
	2860	new_offset = fp->f_offset + offset;
	2861	error = 0;
	2862	break;
	2863	case L_XTND:
	2864	error = VOP_GETATTR_LITE(vp, &lva);
	2865	spin_lock(&fp->f_spin);
	2866	new_offset = offset + lva.va_size;
	2867	break;
	2868	case L_SET:
	2869	new_offset = offset;
	2870	error = 0;
	2871	spin_lock(&fp->f_spin);
	2872	break;
	2873	default:
	2874	new_offset = 0;
	2875	error = EINVAL;
	2876	spin_lock(&fp->f_spin);
	2877	break;
	2878	}
	2879
	2880	/*
	2881	* Validate the seek position. Negative offsets are not allowed
	2882	* for regular files or directories.
	2883	*
	2884	* Normally we would also not want to allow negative offsets for
	2885	* character and block-special devices. However kvm addresses
	2886	* on 64 bit architectures might appear to be negative and must
	2887	* be allowed.
	2888	*/
	2889	if (error == 0) {
	2890	if (new_offset < 0 &&
	2891	(vp->v_type == VREG \|\| vp->v_type == VDIR)) {
	2892	error = EINVAL;
	2893	} else {
	2894	fp->f_offset = new_offset;
	2895	}
	2896	}
	2897	*res = fp->f_offset;
	2898	spin_unlock(&fp->f_spin);
	2899	done:
	2900	dropfp(td, fd, fp);
	2901
	2902	return (error);
	2903	}
	2904
	2905	/*
	2906	* lseek_args(int fd, int pad, off_t offset, int whence)
	2907	*
	2908	* Reposition read/write file offset.
	2909	*/
	2910	int
	2911	sys_lseek(struct sysmsg sysmsg, const struct lseek_args uap)
	2912	{
	2913	int error;
	2914
	2915	error = kern_lseek(uap->fd, uap->offset, uap->whence,
	2916	&sysmsg->sysmsg_offset);
	2917
	2918	return (error);
	2919	}
	2920
	2921	/*
	2922	* Check if current process can access given file. amode is a bitmask of *_OK
	2923	* access bits. flags is a bitmask of AT_* flags.
	2924	*/
	2925	int
	2926	kern_access(struct nlookupdata *nd, int amode, int flags)
	2927	{
	2928	struct vnode *vp;
	2929	int error, mode;
	2930
	2931	if (flags & ~AT_EACCESS)
	2932	return (EINVAL);
	2933	nd->nl_flags \|= NLC_SHAREDLOCK;
	2934	if ((error = nlookup(nd)) != 0)
	2935	return (error);
	2936	if ((amode & W_OK) && (error = ncp_writechk(&nd->nl_nch)) != 0)
	2937	return (error);
	2938	retry:
	2939	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
	2940	if (error)
	2941	return (error);
	2942
	2943	/* Flags == 0 means only check for existence. */
	2944	if (amode) {
	2945	mode = 0;
	2946	if (amode & R_OK)
	2947	mode \|= VREAD;
	2948	if (amode & W_OK)
	2949	mode \|= VWRITE;
	2950	if (amode & X_OK)
	2951	mode \|= VEXEC;
	2952	if ((mode & VWRITE) == 0 \|\|
	2953	(error = vn_writechk(vp)) == 0) {
	2954	error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
	2955	}
	2956
	2957	/*
	2958	* If the file handle is stale we have to re-resolve the
	2959	* entry with the ncp held exclusively. This is a hack
	2960	* at the moment.
	2961	*/
	2962	if (error == ESTALE) {
	2963	u_int dummy_gen;
	2964
	2965	vput(vp);
	2966	cache_unlock(&nd->nl_nch);
	2967	cache_lock(&nd->nl_nch);
	2968	dummy_gen = nd->nl_nch.ncp->nc_generation;
	2969	cache_setunresolved(&nd->nl_nch);
	2970	error = cache_resolve(&nd->nl_nch, &dummy_gen,
	2971	nd->nl_cred);
	2972	if (error == 0) {
	2973	vp = NULL;
	2974	goto retry;
	2975	}
	2976	return(error);
	2977	}
	2978	}
	2979	vput(vp);
	2980	return (error);
	2981	}
	2982
	2983	/*
	2984	* access_args(char *path, int flags)
	2985	*
	2986	* Check access permissions.
	2987	*/
	2988	int
	2989	sys_access(struct sysmsg sysmsg, const struct access_args uap)
	2990	{
	2991	struct nlookupdata nd;
	2992	int error;
	2993
	2994	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	2995	if (error == 0)
	2996	error = kern_access(&nd, uap->flags, 0);
	2997	nlookup_done(&nd);
	2998	return (error);
	2999	}
	3000
	3001
	3002	/*
	3003	* eaccess_args(char *path, int flags)
	3004	*
	3005	* Check access permissions.
	3006	*/
	3007	int
	3008	sys_eaccess(struct sysmsg sysmsg, const struct eaccess_args uap)
	3009	{
	3010	struct nlookupdata nd;
	3011	int error;
	3012
	3013	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3014	if (error == 0)
	3015	error = kern_access(&nd, uap->flags, AT_EACCESS);
	3016	nlookup_done(&nd);
	3017	return (error);
	3018	}
	3019
	3020
	3021	/*
	3022	* faccessat_args(int fd, char *path, int amode, int flags)
	3023	*
	3024	* Check access permissions.
	3025	*/
	3026	int
	3027	sys_faccessat(struct sysmsg sysmsg, const struct faccessat_args uap)
	3028	{
	3029	struct nlookupdata nd;
	3030	struct file *fp;
	3031	int error;
	3032
	3033	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
	3034	NLC_FOLLOW);
	3035	if (error == 0)
	3036	error = kern_access(&nd, uap->amode, uap->flags);
	3037	nlookup_done_at(&nd, fp);
	3038	return (error);
	3039	}
	3040
	3041	int
	3042	kern_stat(struct nlookupdata nd, struct stat st)
	3043	{
	3044	int error;
	3045	struct vnode *vp;
	3046
	3047	nd->nl_flags \|= NLC_SHAREDLOCK;
	3048	if ((error = nlookup(nd)) != 0)
	3049	return (error);
	3050	again:
	3051	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
	3052	return (ENOENT);
	3053
	3054	#if 1
	3055	error = cache_vref(&nd->nl_nch, NULL, &vp);
	3056	#else
	3057	error = vget(vp, LK_SHARED);
	3058	#endif
	3059	if (error)
	3060	return (error);
	3061	error = vn_stat(vp, st, nd->nl_cred);
	3062
	3063	/*
	3064	* If the file handle is stale we have to re-resolve the
	3065	* entry with the ncp held exclusively. This is a hack
	3066	* at the moment.
	3067	*/
	3068	if (error == ESTALE) {
	3069	u_int dummy_gen;
	3070	#if 1
	3071	vrele(vp);
	3072	#else
	3073	vput(vp);
	3074	#endif
	3075	cache_unlock(&nd->nl_nch);
	3076	cache_lock(&nd->nl_nch);
	3077	dummy_gen = nd->nl_nch.ncp->nc_generation;
	3078	cache_setunresolved(&nd->nl_nch);
	3079	error = cache_resolve(&nd->nl_nch, &dummy_gen, nd->nl_cred);
	3080	if (error == 0)
	3081	goto again;
	3082	} else {
	3083	#if 1
	3084	vrele(vp);
	3085	#else
	3086	vput(vp);
	3087	#endif
	3088	}
	3089	return (error);
	3090	}
	3091
	3092	/*
	3093	* stat_args(char path, struct stat ub)
	3094	*
	3095	* Get file status; this version follows links.
	3096	*/
	3097	int
	3098	sys_stat(struct sysmsg sysmsg, const struct stat_args uap)
	3099	{
	3100	struct nlookupdata nd;
	3101	struct stat st;
	3102	int error;
	3103
	3104	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3105	if (error == 0) {
	3106	error = kern_stat(&nd, &st);
	3107	if (error == 0)
	3108	error = copyout(&st, uap->ub, sizeof(*uap->ub));
	3109	}
	3110	nlookup_done(&nd);
	3111	return (error);
	3112	}
	3113
	3114	/*
	3115	* lstat_args(char path, struct stat ub)
	3116	*
	3117	* Get file status; this version does not follow links.
	3118	*/
	3119	int
	3120	sys_lstat(struct sysmsg sysmsg, const struct lstat_args uap)
	3121	{
	3122	struct nlookupdata nd;
	3123	struct stat st;
	3124	int error;
	3125
	3126	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3127	if (error == 0) {
	3128	error = kern_stat(&nd, &st);
	3129	if (error == 0)
	3130	error = copyout(&st, uap->ub, sizeof(*uap->ub));
	3131	}
	3132	nlookup_done(&nd);
	3133	return (error);
	3134	}
	3135
	3136	/*
	3137	* fstatat_args(int fd, char path, struct stat sb, int flags)
	3138	*
	3139	* Get status of file pointed to by fd/path.
	3140	*/
	3141	int
	3142	sys_fstatat(struct sysmsg sysmsg, const struct fstatat_args uap)
	3143	{
	3144	struct nlookupdata nd;
	3145	struct stat st;
	3146	int error;
	3147	int flags;
	3148	struct file *fp;
	3149
	3150	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
	3151	return (EINVAL);
	3152
	3153	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	3154
	3155	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
	3156	UIO_USERSPACE, flags);
	3157	if (error == 0) {
	3158	error = kern_stat(&nd, &st);
	3159	if (error == 0)
	3160	error = copyout(&st, uap->sb, sizeof(*uap->sb));
	3161	}
	3162	nlookup_done_at(&nd, fp);
	3163	return (error);
	3164	}
	3165
	3166	static int
	3167	kern_pathconf(char path, int name, int flags, register_t sysmsg_regp)
	3168	{
	3169	struct nlookupdata nd;
	3170	struct vnode *vp;
	3171	int error;
	3172
	3173	vp = NULL;
	3174	error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
	3175	if (error == 0)
	3176	error = nlookup(&nd);
	3177	if (error == 0)
	3178	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	3179	nlookup_done(&nd);
	3180	if (error == 0) {
	3181	error = VOP_PATHCONF(vp, name, sysmsg_regp);
	3182	vput(vp);
	3183	}
	3184	return (error);
	3185	}
	3186
	3187	/*
	3188	* pathconf_Args(char *path, int name)
	3189	*
	3190	* Get configurable pathname variables.
	3191	*/
	3192	int
	3193	sys_pathconf(struct sysmsg sysmsg, const struct pathconf_args uap)
	3194	{
	3195	return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
	3196	&sysmsg->sysmsg_reg));
	3197	}
	3198
	3199	/*
	3200	* lpathconf_Args(char *path, int name)
	3201	*
	3202	* Get configurable pathname variables, but don't follow symlinks.
	3203	*/
	3204	int
	3205	sys_lpathconf(struct sysmsg sysmsg, const struct lpathconf_args uap)
	3206	{
	3207	return (kern_pathconf(uap->path, uap->name, 0, &sysmsg->sysmsg_reg));
	3208	}
	3209
	3210	/*
	3211	* XXX: daver
	3212	* kern_readlink isn't properly split yet. There is a copyin burried
	3213	* in VOP_READLINK().
	3214	*/
	3215	int
	3216	kern_readlink(struct nlookupdata nd, char buf, int count, int *res)
	3217	{
	3218	struct thread *td = curthread;
	3219	struct vnode *vp;
	3220	struct iovec aiov;
	3221	struct uio auio;
	3222	int error;
	3223
	3224	nd->nl_flags \|= NLC_SHAREDLOCK;
	3225	if ((error = nlookup(nd)) != 0)
	3226	return (error);
	3227	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
	3228	if (error)
	3229	return (error);
	3230	if (vp->v_type != VLNK) {
	3231	error = EINVAL;
	3232	} else {
	3233	aiov.iov_base = buf;
	3234	aiov.iov_len = count;
	3235	auio.uio_iov = &aiov;
	3236	auio.uio_iovcnt = 1;
	3237	auio.uio_offset = 0;
	3238	auio.uio_rw = UIO_READ;
	3239	auio.uio_segflg = UIO_USERSPACE;
	3240	auio.uio_td = td;
	3241	auio.uio_resid = count;
	3242	error = VOP_READLINK(vp, &auio, td->td_ucred);
	3243	}
	3244	vput(vp);
	3245	*res = count - auio.uio_resid;
	3246	return (error);
	3247	}
	3248
	3249	/*
	3250	* readlink_args(char path, char buf, int count)
	3251	*
	3252	* Return target name of a symbolic link.
	3253	*/
	3254	int
	3255	sys_readlink(struct sysmsg sysmsg, const struct readlink_args uap)
	3256	{
	3257	struct nlookupdata nd;
	3258	int error;
	3259
	3260	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3261	if (error == 0) {
	3262	error = kern_readlink(&nd, uap->buf, uap->count,
	3263	&sysmsg->sysmsg_result);
	3264	}
	3265	nlookup_done(&nd);
	3266	return (error);
	3267	}
	3268
	3269	/*
	3270	* readlinkat_args(int fd, char path, char buf, size_t bufsize)
	3271	*
	3272	* Return target name of a symbolic link. The path is relative to the
	3273	* directory associated with fd.
	3274	*/
	3275	int
	3276	sys_readlinkat(struct sysmsg sysmsg, const struct readlinkat_args uap)
	3277	{
	3278	struct nlookupdata nd;
	3279	struct file *fp;
	3280	int error;
	3281
	3282	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	3283	if (error == 0) {
	3284	error = kern_readlink(&nd, uap->buf, uap->bufsize,
	3285	&sysmsg->sysmsg_result);
	3286	}
	3287	nlookup_done_at(&nd, fp);
	3288	return (error);
	3289	}
	3290
	3291	static int
	3292	setfflags(struct vnode *vp, u_long flags)
	3293	{
	3294	struct thread *td = curthread;
	3295	int error;
	3296	struct vattr vattr;
	3297
	3298	/*
	3299	* Prevent non-root users from setting flags on devices. When
	3300	* a device is reused, users can retain ownership of the device
	3301	* if they are allowed to set flags and programs assume that
	3302	* chown can't fail when done as root.
	3303	*/
	3304	if ((vp->v_type == VCHR \|\| vp->v_type == VBLK) &&
	3305	((error =
	3306	caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHFLAGS_DEV)) != 0))
	3307	{
	3308	return (error);
	3309	}
	3310
	3311	/*
	3312	* note: vget is required for any operation that might mod the vnode
	3313	* so VINACTIVE is properly cleared.
	3314	*/
	3315	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
	3316	VATTR_NULL(&vattr);
	3317	vattr.va_flags = flags;
	3318	error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	3319	vput(vp);
	3320	}
	3321	return (error);
	3322	}
	3323
	3324	/*
	3325	* chflags(const char *path, u_long flags)
	3326	*
	3327	* Change flags of a file given a path name.
	3328	*/
	3329	int
	3330	sys_chflags(struct sysmsg sysmsg, const struct chflags_args uap)
	3331	{
	3332	struct nlookupdata nd;
	3333	struct vnode *vp;
	3334	int error;
	3335
	3336	vp = NULL;
	3337	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3338	if (error == 0)
	3339	error = nlookup(&nd);
	3340	if (error == 0)
	3341	error = ncp_writechk(&nd.nl_nch);
	3342	if (error == 0)
	3343	error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	3344	nlookup_done(&nd);
	3345	if (error == 0) {
	3346	error = setfflags(vp, uap->flags);
	3347	vrele(vp);
	3348	}
	3349	return (error);
	3350	}
	3351
	3352	/*
	3353	* lchflags(const char *path, u_long flags)
	3354	*
	3355	* Change flags of a file given a path name, but don't follow symlinks.
	3356	*/
	3357	int
	3358	sys_lchflags(struct sysmsg sysmsg, const struct lchflags_args uap)
	3359	{
	3360	struct nlookupdata nd;
	3361	struct vnode *vp;
	3362	int error;
	3363
	3364	vp = NULL;
	3365	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3366	if (error == 0)
	3367	error = nlookup(&nd);
	3368	if (error == 0)
	3369	error = ncp_writechk(&nd.nl_nch);
	3370	if (error == 0)
	3371	error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	3372	nlookup_done(&nd);
	3373	if (error == 0) {
	3374	error = setfflags(vp, uap->flags);
	3375	vrele(vp);
	3376	}
	3377	return (error);
	3378	}
	3379
	3380	/*
	3381	* fchflags_args(int fd, u_flags flags)
	3382	*
	3383	* Change flags of a file given a file descriptor.
	3384	*/
	3385	int
	3386	sys_fchflags(struct sysmsg sysmsg, const struct fchflags_args uap)
	3387	{
	3388	struct thread *td = curthread;
	3389	struct file *fp;
	3390	int error;
	3391
	3392	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
	3393	return (error);
	3394	if (fp->f_nchandle.ncp)
	3395	error = ncp_writechk(&fp->f_nchandle);
	3396	if (error == 0)
	3397	error = setfflags((struct vnode *) fp->f_data, uap->flags);
	3398	fdrop(fp);
	3399	return (error);
	3400	}
	3401
	3402	/*
	3403	* chflagsat_args(int fd, const char *path, u_long flags, int atflags)
	3404	* change flags given a pathname relative to a filedescriptor
	3405	*/
	3406	int
	3407	sys_chflagsat(struct sysmsg sysmsg, const struct chflagsat_args uap)
	3408	{
	3409	struct nlookupdata nd;
	3410	struct vnode *vp;
	3411	struct file *fp;
	3412	int error;
	3413	int lookupflags;
	3414
	3415	if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
	3416	return (EINVAL);
	3417
	3418	lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	3419
	3420	vp = NULL;
	3421	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, lookupflags);
	3422	if (error == 0)
	3423	error = nlookup(&nd);
	3424	if (error == 0)
	3425	error = ncp_writechk(&nd.nl_nch);
	3426	if (error == 0)
	3427	error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	3428	nlookup_done_at(&nd, fp);
	3429	if (error == 0) {
	3430	error = setfflags(vp, uap->flags);
	3431	vrele(vp);
	3432	}
	3433	return (error);
	3434	}
	3435
	3436
	3437	static int
	3438	setfmode(struct vnode *vp, int mode)
	3439	{
	3440	struct thread *td = curthread;
	3441	int error;
	3442	struct vattr vattr;
	3443
	3444	/*
	3445	* note: vget is required for any operation that might mod the vnode
	3446	* so VINACTIVE is properly cleared.
	3447	*/
	3448	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
	3449	VATTR_NULL(&vattr);
	3450	vattr.va_mode = mode & ALLPERMS;
	3451	error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	3452	cache_inval_wxok(vp);
	3453	vput(vp);
	3454	}
	3455	return error;
	3456	}
	3457
	3458	int
	3459	kern_chmod(struct nlookupdata *nd, int mode)
	3460	{
	3461	struct vnode *vp;
	3462	int error;
	3463
	3464	if ((error = nlookup(nd)) != 0)
	3465	return (error);
	3466	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
	3467	return (error);
	3468	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
	3469	error = setfmode(vp, mode);
	3470	vrele(vp);
	3471	return (error);
	3472	}
	3473
	3474	/*
	3475	* chmod_args(char *path, int mode)
	3476	*
	3477	* Change mode of a file given path name.
	3478	*/
	3479	int
	3480	sys_chmod(struct sysmsg sysmsg, const struct chmod_args uap)
	3481	{
	3482	struct nlookupdata nd;
	3483	int error;
	3484
	3485	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3486	if (error == 0)
	3487	error = kern_chmod(&nd, uap->mode);
	3488	nlookup_done(&nd);
	3489	return (error);
	3490	}
	3491
	3492	/*
	3493	* lchmod_args(char *path, int mode)
	3494	*
	3495	* Change mode of a file given path name (don't follow links.)
	3496	*/
	3497	int
	3498	sys_lchmod(struct sysmsg sysmsg, const struct lchmod_args uap)
	3499	{
	3500	struct nlookupdata nd;
	3501	int error;
	3502
	3503	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3504	if (error == 0)
	3505	error = kern_chmod(&nd, uap->mode);
	3506	nlookup_done(&nd);
	3507	return (error);
	3508	}
	3509
	3510	/*
	3511	* fchmod_args(int fd, int mode)
	3512	*
	3513	* Change mode of a file given a file descriptor.
	3514	*/
	3515	int
	3516	sys_fchmod(struct sysmsg sysmsg, const struct fchmod_args uap)
	3517	{
	3518	struct thread *td = curthread;
	3519	struct file *fp;
	3520	int error;
	3521
	3522	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
	3523	return (error);
	3524	if (fp->f_nchandle.ncp)
	3525	error = ncp_writechk(&fp->f_nchandle);
	3526	if (error == 0)
	3527	error = setfmode((struct vnode *)fp->f_data, uap->mode);
	3528	fdrop(fp);
	3529	return (error);
	3530	}
	3531
	3532	/*
	3533	* fchmodat_args(char *path, int mode)
	3534	*
	3535	* Change mode of a file pointed to by fd/path.
	3536	*/
	3537	int
	3538	sys_fchmodat(struct sysmsg sysmsg, const struct fchmodat_args uap)
	3539	{
	3540	struct nlookupdata nd;
	3541	struct file *fp;
	3542	int error;
	3543	int flags;
	3544
	3545	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
	3546	return (EINVAL);
	3547	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	3548
	3549	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
	3550	UIO_USERSPACE, flags);
	3551	if (error == 0)
	3552	error = kern_chmod(&nd, uap->mode);
	3553	nlookup_done_at(&nd, fp);
	3554	return (error);
	3555	}
	3556
	3557	static int
	3558	setfown(struct mount mp, struct vnode vp, uid_t uid, gid_t gid)
	3559	{
	3560	struct thread *td = curthread;
	3561	int error;
	3562	struct vattr vattr;
	3563	uid_t o_uid;
	3564	gid_t o_gid;
	3565	uint64_t size;
	3566
	3567	/*
	3568	* note: vget is required for any operation that might mod the vnode
	3569	* so VINACTIVE is properly cleared.
	3570	*/
	3571	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
	3572	if ((error = VOP_GETATTR(vp, &vattr)) != 0)
	3573	return error;
	3574	o_uid = vattr.va_uid;
	3575	o_gid = vattr.va_gid;
	3576	size = vattr.va_size;
	3577
	3578	VATTR_NULL(&vattr);
	3579	vattr.va_uid = uid;
	3580	vattr.va_gid = gid;
	3581	error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	3582	vput(vp);
	3583	}
	3584
	3585	if (error == 0) {
	3586	if (uid == -1)
	3587	uid = o_uid;
	3588	if (gid == -1)
	3589	gid = o_gid;
	3590	VFS_ACCOUNT(mp, o_uid, o_gid, -size);
	3591	VFS_ACCOUNT(mp, uid, gid, size);
	3592	}
	3593
	3594	return error;
	3595	}
	3596
	3597	int
	3598	kern_chown(struct nlookupdata *nd, int uid, int gid)
	3599	{
	3600	struct vnode *vp;
	3601	int error;
	3602
	3603	if ((error = nlookup(nd)) != 0)
	3604	return (error);
	3605	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
	3606	return (error);
	3607	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
	3608	error = setfown(nd->nl_nch.mount, vp, uid, gid);
	3609	vrele(vp);
	3610	return (error);
	3611	}
	3612
	3613	/*
	3614	* chown(char *path, int uid, int gid)
	3615	*
	3616	* Set ownership given a path name.
	3617	*/
	3618	int
	3619	sys_chown(struct sysmsg sysmsg, const struct chown_args uap)
	3620	{
	3621	struct nlookupdata nd;
	3622	int error;
	3623
	3624	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3625	if (error == 0)
	3626	error = kern_chown(&nd, uap->uid, uap->gid);
	3627	nlookup_done(&nd);
	3628	return (error);
	3629	}
	3630
	3631	/*
	3632	* lchown_args(char *path, int uid, int gid)
	3633	*
	3634	* Set ownership given a path name, do not cross symlinks.
	3635	*/
	3636	int
	3637	sys_lchown(struct sysmsg sysmsg, const struct lchown_args uap)
	3638	{
	3639	struct nlookupdata nd;
	3640	int error;
	3641
	3642	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3643	if (error == 0)
	3644	error = kern_chown(&nd, uap->uid, uap->gid);
	3645	nlookup_done(&nd);
	3646	return (error);
	3647	}
	3648
	3649	/*
	3650	* fchown_args(int fd, int uid, int gid)
	3651	*
	3652	* Set ownership given a file descriptor.
	3653	*/
	3654	int
	3655	sys_fchown(struct sysmsg sysmsg, const struct fchown_args uap)
	3656	{
	3657	struct thread *td = curthread;
	3658	struct proc *p = td->td_proc;
	3659	struct file *fp;
	3660	int error;
	3661
	3662	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
	3663	return (error);
	3664	if (fp->f_nchandle.ncp)
	3665	error = ncp_writechk(&fp->f_nchandle);
	3666	if (error == 0)
	3667	error = setfown(p->p_fd->fd_ncdir.mount,
	3668	(struct vnode *)fp->f_data, uap->uid, uap->gid);
	3669	fdrop(fp);
	3670	return (error);
	3671	}
	3672
	3673	/*
	3674	* fchownat(int fd, char *path, int uid, int gid, int flags)
	3675	*
	3676	* Set ownership of file pointed to by fd/path.
	3677	*/
	3678	int
	3679	sys_fchownat(struct sysmsg sysmsg, const struct fchownat_args uap)
	3680	{
	3681	struct nlookupdata nd;
	3682	struct file *fp;
	3683	int error;
	3684	int flags;
	3685
	3686	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
	3687	return (EINVAL);
	3688	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	3689
	3690	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
	3691	UIO_USERSPACE, flags);
	3692	if (error == 0)
	3693	error = kern_chown(&nd, uap->uid, uap->gid);
	3694	nlookup_done_at(&nd, fp);
	3695	return (error);
	3696	}
	3697
	3698
	3699	static int
	3700	getutimes(struct timeval tvp, struct timespec tsp)
	3701	{
	3702	struct timeval tv[2];
	3703	int error;
	3704
	3705	if (tvp == NULL) {
	3706	microtime(&tv[0]);
	3707	TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
	3708	tsp[1] = tsp[0];
	3709	} else {
	3710	if ((error = itimerfix(tvp)) != 0)
	3711	return (error);
	3712	TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
	3713	TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
	3714	}
	3715	return 0;
	3716	}
	3717
	3718	static int
	3719	getutimens(const struct timespec ts, struct timespec newts, int *nullflag)
	3720	{
	3721	struct timespec tsnow;
	3722	int error;
	3723
	3724	*nullflag = 0;
	3725	nanotime(&tsnow);
	3726	if (ts == NULL) {
	3727	newts[0] = tsnow;
	3728	newts[1] = tsnow;
	3729	*nullflag = 1;
	3730	return (0);
	3731	}
	3732
	3733	newts[0] = ts[0];
	3734	newts[1] = ts[1];
	3735	if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT)
	3736	return (0);
	3737	if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
	3738	*nullflag = 1;
	3739
	3740	if (newts[0].tv_nsec == UTIME_OMIT)
	3741	newts[0].tv_sec = VNOVAL;
	3742	else if (newts[0].tv_nsec == UTIME_NOW)
	3743	newts[0] = tsnow;
	3744	else if ((error = itimespecfix(&newts[0])) != 0)
	3745	return (error);
	3746
	3747	if (newts[1].tv_nsec == UTIME_OMIT)
	3748	newts[1].tv_sec = VNOVAL;
	3749	else if (newts[1].tv_nsec == UTIME_NOW)
	3750	newts[1] = tsnow;
	3751	else if ((error = itimespecfix(&newts[1])) != 0)
	3752	return (error);
	3753
	3754	return (0);
	3755	}
	3756
	3757	static int
	3758	setutimes(struct vnode vp, struct vattr vattr,
	3759	const struct timespec *ts, int nullflag)
	3760	{
	3761	struct thread *td = curthread;
	3762	int error;
	3763
	3764	VATTR_NULL(vattr);
	3765	vattr->va_atime = ts[0];
	3766	vattr->va_mtime = ts[1];
	3767	if (nullflag)
	3768	vattr->va_vaflags \|= VA_UTIMES_NULL;
	3769	error = VOP_SETATTR(vp, vattr, td->td_ucred);
	3770
	3771	return error;
	3772	}
	3773
	3774	int
	3775	kern_utimes(struct nlookupdata nd, struct timeval tptr)
	3776	{
	3777	struct timespec ts[2];
	3778	int error;
	3779
	3780	if (tptr) {
	3781	if ((error = getutimes(tptr, ts)) != 0)
	3782	return (error);
	3783	}
	3784	error = kern_utimensat(nd, tptr ? ts : NULL, 0);
	3785	return (error);
	3786	}
	3787
	3788	/*
	3789	* utimes_args(char path, struct timeval tptr)
	3790	*
	3791	* Set the access and modification times of a file.
	3792	*/
	3793	int
	3794	sys_utimes(struct sysmsg sysmsg, const struct utimes_args uap)
	3795	{
	3796	struct timeval tv[2];
	3797	struct nlookupdata nd;
	3798	int error;
	3799
	3800	if (uap->tptr) {
	3801	error = copyin(uap->tptr, tv, sizeof(tv));
	3802	if (error)
	3803	return (error);
	3804	}
	3805	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	3806	if (error == 0)
	3807	error = kern_utimes(&nd, uap->tptr ? tv : NULL);
	3808	nlookup_done(&nd);
	3809	return (error);
	3810	}
	3811
	3812	/*
	3813	* lutimes_args(char path, struct timeval tptr)
	3814	*
	3815	* Set the access and modification times of a file.
	3816	*/
	3817	int
	3818	sys_lutimes(struct sysmsg sysmsg, const struct lutimes_args uap)
	3819	{
	3820	struct timeval tv[2];
	3821	struct nlookupdata nd;
	3822	int error;
	3823
	3824	if (uap->tptr) {
	3825	error = copyin(uap->tptr, tv, sizeof(tv));
	3826	if (error)
	3827	return (error);
	3828	}
	3829	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	3830	if (error == 0)
	3831	error = kern_utimes(&nd, uap->tptr ? tv : NULL);
	3832	nlookup_done(&nd);
	3833	return (error);
	3834	}
	3835
	3836	/*
	3837	* Set utimes on a file descriptor. The creds used to open the
	3838	* file are used to determine whether the operation is allowed
	3839	* or not.
	3840	*/
	3841	int
	3842	kern_futimens(int fd, struct timespec *ts)
	3843	{
	3844	struct thread *td = curthread;
	3845	struct timespec newts[2];
	3846	struct file *fp;
	3847	struct vnode *vp;
	3848	struct vattr vattr;
	3849	struct vattr_lite lva;
	3850	int nullflag;
	3851	int error;
	3852
	3853	error = getutimens(ts, newts, &nullflag);
	3854	if (error)
	3855	return (error);
	3856	if ((error = holdvnode(td, fd, &fp)) != 0)
	3857	return (error);
	3858	if (fp->f_nchandle.ncp)
	3859	error = ncp_writechk(&fp->f_nchandle);
	3860	if (error == 0) {
	3861	vp = fp->f_data;
	3862	error = vget(vp, LK_EXCLUSIVE);
	3863	if (error == 0) {
	3864	error = VOP_GETATTR_FP(vp, &vattr, fp);
	3865	if (error == 0) {
	3866	lva.va_type = vattr.va_type;
	3867	lva.va_nlink = vattr.va_nlink;
	3868	lva.va_mode = vattr.va_mode;
	3869	lva.va_uid = vattr.va_uid;
	3870	lva.va_gid = vattr.va_gid;
	3871	lva.va_size = vattr.va_size;
	3872	lva.va_flags = vattr.va_flags;
	3873
	3874	error = naccess_lva(&lva, NLC_OWN \| NLC_WRITE,
	3875	fp->f_cred);
	3876	}
	3877	if (error == 0) {
	3878	error = setutimes(vp, &vattr, newts, nullflag);
	3879	}
	3880	vput(vp);
	3881	}
	3882	}
	3883	fdrop(fp);
	3884	return (error);
	3885	}
	3886
	3887	/*
	3888	* futimens_args(int fd, struct timespec *ts)
	3889	*
	3890	* Set the access and modification times of a file.
	3891	*/
	3892	int
	3893	sys_futimens(struct sysmsg sysmsg, const struct futimens_args uap)
	3894	{
	3895	struct timespec ts[2];
	3896	int error;
	3897
	3898	if (uap->ts) {
	3899	error = copyin(uap->ts, ts, sizeof(ts));
	3900	if (error)
	3901	return (error);
	3902	}
	3903	error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
	3904	return (error);
	3905	}
	3906
	3907	int
	3908	kern_futimes(int fd, struct timeval *tptr)
	3909	{
	3910	struct timespec ts[2];
	3911	int error;
	3912
	3913	if (tptr) {
	3914	if ((error = getutimes(tptr, ts)) != 0)
	3915	return (error);
	3916	}
	3917	error = kern_futimens(fd, tptr ? ts : NULL);
	3918	return (error);
	3919	}
	3920
	3921	/*
	3922	* futimes_args(int fd, struct timeval *tptr)
	3923	*
	3924	* Set the access and modification times of a file.
	3925	*/
	3926	int
	3927	sys_futimes(struct sysmsg sysmsg, const struct futimes_args uap)
	3928	{
	3929	struct timeval tv[2];
	3930	int error;
	3931
	3932	if (uap->tptr) {
	3933	error = copyin(uap->tptr, tv, sizeof(tv));
	3934	if (error)
	3935	return (error);
	3936	}
	3937	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
	3938	return (error);
	3939	}
	3940
	3941	int
	3942	kern_utimensat(struct nlookupdata nd, const struct timespec ts, int flags)
	3943	{
	3944	struct timespec newts[2];
	3945	struct vnode *vp;
	3946	struct vattr vattr;
	3947	int nullflag;
	3948	int error;
	3949
	3950	if (flags & ~AT_SYMLINK_NOFOLLOW)
	3951	return (EINVAL);
	3952
	3953	error = getutimens(ts, newts, &nullflag);
	3954	if (error)
	3955	return (error);
	3956
	3957	nd->nl_flags \|= NLC_OWN \| NLC_WRITE;
	3958	if ((error = nlookup(nd)) != 0)
	3959	return (error);
	3960	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	3961	return (error);
	3962	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
	3963	return (error);
	3964	if ((error = vn_writechk(vp)) == 0) {
	3965	error = vget(vp, LK_EXCLUSIVE);
	3966	if (error == 0) {
	3967	error = setutimes(vp, &vattr, newts, nullflag);
	3968	vput(vp);
	3969	}
	3970	}
	3971	vrele(vp);
	3972	return (error);
	3973	}
	3974
	3975	/*
	3976	* utimensat_args(int fd, const char path, const struct timespec ts, int flags);
	3977	*
	3978	* Set file access and modification times of a file.
	3979	*/
	3980	int
	3981	sys_utimensat(struct sysmsg sysmsg, const struct utimensat_args uap)
	3982	{
	3983	struct timespec ts[2];
	3984	struct nlookupdata nd;
	3985	struct file *fp;
	3986	int error;
	3987	int flags;
	3988
	3989	if (uap->ts) {
	3990	error = copyin(uap->ts, ts, sizeof(ts));
	3991	if (error)
	3992	return (error);
	3993	}
	3994
	3995	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	3996	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
	3997	UIO_USERSPACE, flags);
	3998	if (error == 0)
	3999	error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
	4000	nlookup_done_at(&nd, fp);
	4001	return (error);
	4002	}
	4003
	4004	int
	4005	kern_truncate(struct nlookupdata *nd, off_t length)
	4006	{
	4007	struct vnode *vp;
	4008	struct vattr vattr;
	4009	int error;
	4010	uid_t uid = 0;
	4011	gid_t gid = 0;
	4012	uint64_t old_size = 0;
	4013
	4014	if (length < 0)
	4015	return(EINVAL);
	4016	nd->nl_flags \|= NLC_WRITE \| NLC_TRUNCATE;
	4017	if ((error = nlookup(nd)) != 0)
	4018	return (error);
	4019	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	4020	return (error);
	4021	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
	4022	return (error);
	4023	error = vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY \| LK_FAILRECLAIM);
	4024	if (error) {
	4025	vrele(vp);
	4026	return (error);
	4027	}
	4028	if (vp->v_type == VDIR) {
	4029	error = EISDIR;
	4030	goto done;
	4031	}
	4032	if (vfs_quota_enabled) {
	4033	error = VOP_GETATTR(vp, &vattr);
	4034	KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
	4035	uid = vattr.va_uid;
	4036	gid = vattr.va_gid;
	4037	old_size = vattr.va_size;
	4038	}
	4039
	4040	if ((error = vn_writechk(vp)) == 0) {
	4041	VATTR_NULL(&vattr);
	4042	vattr.va_size = length;
	4043	error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
	4044	VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
	4045	}
	4046	done:
	4047	vput(vp);
	4048	return (error);
	4049	}
	4050
	4051	/*
	4052	* truncate(char *path, int pad, off_t length)
	4053	*
	4054	* Truncate a file given its path name.
	4055	*/
	4056	int
	4057	sys_truncate(struct sysmsg sysmsg, const struct truncate_args uap)
	4058	{
	4059	struct nlookupdata nd;
	4060	int error;
	4061
	4062	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	4063	if (error == 0)
	4064	error = kern_truncate(&nd, uap->length);
	4065	nlookup_done(&nd);
	4066	return error;
	4067	}
	4068
	4069	int
	4070	kern_ftruncate(int fd, off_t length)
	4071	{
	4072	struct thread *td = curthread;
	4073	struct vattr vattr;
	4074	struct vnode *vp;
	4075	struct file *fp;
	4076	int error;
	4077	uid_t uid = 0;
	4078	gid_t gid = 0;
	4079	uint64_t old_size = 0;
	4080	struct mount *mp;
	4081
	4082	if (length < 0)
	4083	return(EINVAL);
	4084	if ((error = holdvnode(td, fd, &fp)) != 0)
	4085	return (error);
	4086	if (fp->f_nchandle.ncp) {
	4087	error = ncp_writechk(&fp->f_nchandle);
	4088	if (error)
	4089	goto done;
	4090	}
	4091	if ((fp->f_flag & FWRITE) == 0) {
	4092	error = EINVAL;
	4093	goto done;
	4094	}
	4095	if (fp->f_flag & FAPPENDONLY) { /* inode was set s/uapnd */
	4096	error = EINVAL;
	4097	goto done;
	4098	}
	4099	vp = (struct vnode *)fp->f_data;
	4100	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	4101	if (vp->v_type == VDIR) {
	4102	error = EISDIR;
	4103	vn_unlock(vp);
	4104	goto done;
	4105	}
	4106
	4107	if (vfs_quota_enabled) {
	4108	error = VOP_GETATTR_FP(vp, &vattr, fp);
	4109	KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
	4110	uid = vattr.va_uid;
	4111	gid = vattr.va_gid;
	4112	old_size = vattr.va_size;
	4113	}
	4114
	4115	if ((error = vn_writechk(vp)) == 0) {
	4116	VATTR_NULL(&vattr);
	4117	vattr.va_size = length;
	4118	error = VOP_SETATTR_FP(vp, &vattr, fp->f_cred, fp);
	4119	mp = vq_vptomp(vp);
	4120	VFS_ACCOUNT(mp, uid, gid, length - old_size);
	4121	}
	4122	vn_unlock(vp);
	4123	done:
	4124	fdrop(fp);
	4125	return (error);
	4126	}
	4127
	4128	/*
	4129	* ftruncate_args(int fd, int pad, off_t length)
	4130	*
	4131	* Truncate a file given a file descriptor.
	4132	*/
	4133	int
	4134	sys_ftruncate(struct sysmsg sysmsg, const struct ftruncate_args uap)
	4135	{
	4136	int error;
	4137
	4138	error = kern_ftruncate(uap->fd, uap->length);
	4139
	4140	return (error);
	4141	}
	4142
	4143	int
	4144	kern_fsync(int fd, bool fullsync)
	4145	{
	4146	struct thread *td = curthread;
	4147	struct vnode *vp;
	4148	struct file *fp;
	4149	vm_object_t obj;
	4150	int error;
	4151
	4152	if ((error = holdvnode(td, fd, &fp)) != 0)
	4153	return (error);
	4154	vp = (struct vnode *)fp->f_data;
	4155	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	4156	if ((obj = vp->v_object) != NULL) {
	4157	if (vp->v_mount == NULL \|\|
	4158	(vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
	4159	vm_object_page_clean(obj, 0, 0, 0);
	4160	}
	4161	}
	4162	error = fullsync ?
	4163	VOP_FSYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp) :
	4164	VOP_FDATASYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp);
	4165	if (error == 0 && vp->v_mount)
	4166	error = buf_fsync(vp);
	4167	vn_unlock(vp);
	4168	fdrop(fp);
	4169
	4170	return (error);
	4171	}
	4172
	4173	/*
	4174	* fsync(int fd)
	4175	*
	4176	* Sync an open file.
	4177	*/
	4178	int
	4179	sys_fsync(struct sysmsg sysmsg, const struct fsync_args uap)
	4180	{
	4181	return (kern_fsync(uap->fd, true));
	4182	}
	4183
	4184	/*
	4185	* fdatasync(int fd)
	4186	*
	4187	* Data-sync an open file.
	4188	*/
	4189	int
	4190	sys_fdatasync(struct sysmsg sysmsg, const struct fdatasync_args uap)
	4191	{
	4192	return (kern_fsync(uap->fd, false));
	4193	}
	4194
	4195	/*
	4196	* rename op.
	4197	*
	4198	* NOTE: error == 0 and nl_dvp is NULL indicates a mount point, operation
	4199	* disallowed. e.g. /var/cache where /var/cache is a null-mount, for
	4200	* example.
	4201	*/
	4202	int
	4203	kern_rename(struct nlookupdata fromnd, struct nlookupdata tond)
	4204	{
	4205	struct nchandle fnchd;
	4206	struct nchandle tnchd;
	4207	struct namecache *ncp;
	4208	struct vnode *fdvp;
	4209	struct vnode *tdvp;
	4210	struct mount *mp;
	4211	struct mount *userenlk;
	4212	int error;
	4213	u_int fncp_gen;
	4214	u_int tncp_gen;
	4215
	4216	bwillinode(1);
	4217	fromnd->nl_flags \|= NLC_REFDVP \| NLC_RENAME_SRC;
	4218	if ((error = nlookup(fromnd)) != 0)
	4219	return (error);
	4220
	4221	/*
	4222	* Attempt to rename a mount point (from or to)
	4223	*/
	4224	if (error == 0 && fromnd->nl_dvp == NULL)
	4225	return (EINVAL);
	4226
	4227	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
	4228	return (ENOENT);
	4229	fnchd.mount = fromnd->nl_nch.mount;
	4230	cache_hold(&fnchd);
	4231
	4232	/*
	4233	* unlock the source nch so we can lookup the target nch without
	4234	* deadlocking. The target may or may not exist so we do not check
	4235	* for a target vp like kern_mkdir() and other creation functions do.
	4236	*
	4237	* The source and target directories are ref'd and rechecked after
	4238	* everything is relocked to determine if the source or target file
	4239	* has been renamed.
	4240	*/
	4241	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	4242	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
	4243	fncp_gen = fromnd->nl_nch.ncp->nc_generation;
	4244
	4245	if (fromnd->nl_nch.ncp->nc_vp &&
	4246	fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
	4247	userenlk = fnchd.mount;
	4248	cache_unlock(&fromnd->nl_nch);
	4249	lockmgr(&userenlk->mnt_renlock, LK_EXCLUSIVE);
	4250	} else {
	4251	userenlk = NULL;
	4252	cache_unlock(&fromnd->nl_nch);
	4253	}
	4254
	4255	/*
	4256	* Lookup target
	4257	*/
	4258	tond->nl_flags \|= NLC_RENAME_DST \| NLC_REFDVP;
	4259	if ((error = nlookup(tond)) != 0) {
	4260	cache_drop(&fnchd);
	4261	goto done;
	4262	}
	4263	tncp_gen = tond->nl_nch.ncp->nc_generation;
	4264
	4265	/*
	4266	* Attempt to rename a mount point (from or to)
	4267	*/
	4268	if (error == 0 && tond->nl_dvp == NULL) {
	4269	cache_drop(&fnchd);
	4270	error = ENOENT;
	4271	goto done;
	4272	}
	4273
	4274	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
	4275	cache_drop(&fnchd);
	4276	error = ENOENT;
	4277	goto done;
	4278	}
	4279	tnchd.mount = tond->nl_nch.mount;
	4280	cache_hold(&tnchd);
	4281
	4282	/*
	4283	* If the source and target are the same there is nothing to do
	4284	*/
	4285	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
	4286	cache_drop(&fnchd);
	4287	cache_drop(&tnchd);
	4288	error = 0;
	4289	goto done;
	4290	}
	4291
	4292	/*
	4293	* Mount points cannot be renamed or overwritten
	4294	*/
	4295	if ((fromnd->nl_nch.ncp->nc_flag \| tond->nl_nch.ncp->nc_flag) &
	4296	NCF_ISMOUNTPT
	4297	) {
	4298	cache_drop(&fnchd);
	4299	cache_drop(&tnchd);
	4300	error = EINVAL;
	4301	goto done;
	4302	}
	4303
	4304	/*
	4305	* Lock all four namecache entries. tond is already locked.
	4306	*/
	4307	cache_lock4_tondlocked(&fnchd, &fromnd->nl_nch,
	4308	&tnchd, &tond->nl_nch,
	4309	fromnd->nl_cred, tond->nl_cred);
	4310	fromnd->nl_flags \|= NLC_NCPISLOCKED;
	4311
	4312	/*
	4313	* If the namecache generation changed for either fromnd or tond,
	4314	* we must retry.
	4315	*/
	4316	if (((fromnd->nl_nch.ncp->nc_generation - fncp_gen) & ~1) \|\|
	4317	((tond->nl_nch.ncp->nc_generation - tncp_gen) & ~1))
	4318	{
	4319	krateprintf(&krate_rename,
	4320	"kern_rename: retry due to race on: "
	4321	"\"%s\" -> \"%s\" (%d,%d)\n",
	4322	fromnd->nl_nch.ncp->nc_name,
	4323	tond->nl_nch.ncp->nc_name,
	4324	fromnd->nl_nch.ncp->nc_generation - fncp_gen,
	4325	tond->nl_nch.ncp->nc_generation - tncp_gen);
	4326	error = EAGAIN;
	4327	goto finish;
	4328	}
	4329
	4330	/*
	4331	* If either fromnd or tond are marked destroyed a ripout occured
	4332	* out from under us and we must retry.
	4333	*/
	4334	if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED \| NCF_UNRESOLVED)) \|\|
	4335	fromnd->nl_nch.ncp->nc_vp == NULL \|\|
	4336	(tond->nl_nch.ncp->nc_flag & (NCF_DESTROYED \| NCF_UNRESOLVED))) {
	4337	krateprintf(&krate_rename,
	4338	"kern_rename: retry due to ripout on: "
	4339	"\"%s\" -> \"%s\"\n",
	4340	fromnd->nl_nch.ncp->nc_name,
	4341	tond->nl_nch.ncp->nc_name);
	4342	error = EAGAIN;
	4343	goto finish;
	4344	}
	4345
	4346	/*
	4347	* Make sure the parent directories linkages are the same. We have
	4348	* already checked that fromnd and tond are not mount points so this
	4349	* should not loop forever on a cross-mount.
	4350	*/
	4351	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent \|\|
	4352	tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
	4353	error = EAGAIN;
	4354	goto finish;
	4355	}
	4356
	4357	/*
	4358	* Both the source and target must be within the same filesystem and
	4359	* in the same filesystem as their parent directories within the
	4360	* namecache topology.
	4361	*
	4362	* NOTE: fromnd's nc_mount or nc_vp could be NULL.
	4363	*/
	4364	mp = fnchd.mount;
	4365	if (mp != tnchd.mount \|\| mp != fromnd->nl_nch.mount \|\|
	4366	mp != tond->nl_nch.mount) {
	4367	error = EXDEV;
	4368	goto finish;
	4369	}
	4370
	4371	/*
	4372	* Make sure the mount point is writable
	4373	*/
	4374	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
	4375	goto finish;
	4376	}
	4377
	4378	/*
	4379	* If the target exists and either the source or target is a directory,
	4380	* then both must be directories.
	4381	*
	4382	* Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might h
	4383	* have become NULL.
	4384	*/
	4385	if (tond->nl_nch.ncp->nc_vp) {
	4386	if (fromnd->nl_nch.ncp->nc_vp == NULL) {
	4387	error = ENOENT;
	4388	} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
	4389	if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
	4390	error = ENOTDIR;
	4391	} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
	4392	error = EISDIR;
	4393	}
	4394	}
	4395
	4396	/*
	4397	* You cannot rename a source into itself or a subdirectory of itself.
	4398	* We check this by travsersing the target directory upwards looking
	4399	* for a match against the source.
	4400	*
	4401	* Only required when renaming a directory, in which case userenlk is
	4402	* non-NULL.
	4403	*/
	4404	if (__predict_false(userenlk && error == 0)) {
	4405	for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
	4406	if (fromnd->nl_nch.ncp == ncp) {
	4407	error = EINVAL;
	4408	break;
	4409	}
	4410	}
	4411	}
	4412
	4413	/*
	4414	* Even though the namespaces are different, they may still represent
	4415	* hardlinks to the same file. The filesystem might have a hard time
	4416	* with this so we issue a NREMOVE of the source instead of a NRENAME
	4417	* when we detect the situation.
	4418	*/
	4419	if (error == 0) {
	4420	fdvp = fromnd->nl_dvp;
	4421	tdvp = tond->nl_dvp;
	4422	if (fdvp == NULL \|\| tdvp == NULL) {
	4423	error = EPERM;
	4424	} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
	4425	error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
	4426	fromnd->nl_cred);
	4427	} else {
	4428	error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
	4429	fdvp, tdvp, tond->nl_cred);
	4430	}
	4431	}
	4432	finish:
	4433	cache_put(&tnchd);
	4434	cache_put(&fnchd);
	4435	done:
	4436	if (userenlk)
	4437	lockmgr(&userenlk->mnt_renlock, LK_RELEASE);
	4438	return (error);
	4439	}
	4440
	4441	/*
	4442	* rename_args(char from, char to)
	4443	*
	4444	* Rename files. Source and destination must either both be directories,
	4445	* or both not be directories. If target is a directory, it must be empty.
	4446	*/
	4447	int
	4448	sys_rename(struct sysmsg sysmsg, const struct rename_args uap)
	4449	{
	4450	struct nlookupdata fromnd, tond;
	4451	int error;
	4452
	4453	do {
	4454	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
	4455	if (error == 0) {
	4456	error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
	4457	if (error == 0)
	4458	error = kern_rename(&fromnd, &tond);
	4459	nlookup_done(&tond);
	4460	}
	4461	nlookup_done(&fromnd);
	4462	} while (error == EAGAIN);
	4463	return (error);
	4464	}
	4465
	4466	/*
	4467	* renameat_args(int oldfd, char old, int newfd, char new)
	4468	*
	4469	* Rename files using paths relative to the directories associated with
	4470	* oldfd and newfd. Source and destination must either both be directories,
	4471	* or both not be directories. If target is a directory, it must be empty.
	4472	*/
	4473	int
	4474	sys_renameat(struct sysmsg sysmsg, const struct renameat_args uap)
	4475	{
	4476	struct nlookupdata oldnd, newnd;
	4477	struct file oldfp, newfp;
	4478	int error;
	4479
	4480	do {
	4481	error = nlookup_init_at(&oldnd, &oldfp,
	4482	uap->oldfd, uap->old,
	4483	UIO_USERSPACE, 0);
	4484	if (error == 0) {
	4485	error = nlookup_init_at(&newnd, &newfp,
	4486	uap->newfd, uap->new,
	4487	UIO_USERSPACE, 0);
	4488	if (error == 0)
	4489	error = kern_rename(&oldnd, &newnd);
	4490	nlookup_done_at(&newnd, newfp);
	4491	}
	4492	nlookup_done_at(&oldnd, oldfp);
	4493	} while (error == EAGAIN);
	4494	return (error);
	4495	}
	4496
	4497	int
	4498	kern_mkdir(struct nlookupdata *nd, int mode)
	4499	{
	4500	struct thread *td = curthread;
	4501	struct proc *p = td->td_proc;
	4502	struct vnode *vp;
	4503	struct vattr vattr;
	4504	int error;
	4505
	4506	bwillinode(1);
	4507	nd->nl_flags \|= NLC_WILLBEDIR \| NLC_CREATE \| NLC_REFDVP;
	4508	if ((error = nlookup(nd)) != 0)
	4509	return (error);
	4510
	4511	if (nd->nl_nch.ncp->nc_vp)
	4512	return (EEXIST);
	4513	if (nd->nl_dvp == NULL)
	4514	return (EINVAL);
	4515	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	4516	return (error);
	4517	VATTR_NULL(&vattr);
	4518	vattr.va_type = VDIR;
	4519	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
	4520
	4521	vp = NULL;
	4522	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
	4523	if (error == 0)
	4524	vput(vp);
	4525	return (error);
	4526	}
	4527
	4528	/*
	4529	* mkdir_args(char *path, int mode)
	4530	*
	4531	* Make a directory file.
	4532	*/
	4533	int
	4534	sys_mkdir(struct sysmsg sysmsg, const struct mkdir_args uap)
	4535	{
	4536	struct nlookupdata nd;
	4537	int error;
	4538
	4539	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	4540	if (error == 0)
	4541	error = kern_mkdir(&nd, uap->mode);
	4542	nlookup_done(&nd);
	4543	return (error);
	4544	}
	4545
	4546	/*
	4547	* mkdirat_args(int fd, char *path, mode_t mode)
	4548	*
	4549	* Make a directory file. The path is relative to the directory associated
	4550	* with fd.
	4551	*/
	4552	int
	4553	sys_mkdirat(struct sysmsg sysmsg, const struct mkdirat_args uap)
	4554	{
	4555	struct nlookupdata nd;
	4556	struct file *fp;
	4557	int error;
	4558
	4559	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
	4560	if (error == 0)
	4561	error = kern_mkdir(&nd, uap->mode);
	4562	nlookup_done_at(&nd, fp);
	4563	return (error);
	4564	}
	4565
	4566	int
	4567	kern_rmdir(struct nlookupdata *nd)
	4568	{
	4569	int error;
	4570
	4571	bwillinode(1);
	4572	nd->nl_flags \|= NLC_DELETE \| NLC_REFDVP;
	4573	if ((error = nlookup(nd)) != 0)
	4574	return (error);
	4575
	4576	/*
	4577	* Do not allow directories representing mount points to be
	4578	* deleted, even if empty. Check write perms on mount point
	4579	* in case the vnode is aliased (aka nullfs).
	4580	*/
	4581	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
	4582	return (EBUSY);
	4583	if (nd->nl_dvp == NULL)
	4584	return (EINVAL);
	4585	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
	4586	return (error);
	4587	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
	4588	return (error);
	4589	}
	4590
	4591	/*
	4592	* rmdir_args(char *path)
	4593	*
	4594	* Remove a directory file.
	4595	*/
	4596	int
	4597	sys_rmdir(struct sysmsg sysmsg, const struct rmdir_args uap)
	4598	{
	4599	struct nlookupdata nd;
	4600	int error;
	4601
	4602	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
	4603	if (error == 0)
	4604	error = kern_rmdir(&nd);
	4605	nlookup_done(&nd);
	4606	return (error);
	4607	}
	4608
	4609	int
	4610	kern_getdirentries(int fd, char buf, u_int count, long basep, int *res,
	4611	enum uio_seg direction)
	4612	{
	4613	struct thread *td = curthread;
	4614	struct vnode *vp;
	4615	struct file *fp;
	4616	struct uio auio;
	4617	struct iovec aiov;
	4618	off_t loff;
	4619	int error, eofflag;
	4620
	4621	if ((error = holdvnode(td, fd, &fp)) != 0)
	4622	return (error);
	4623	if ((fp->f_flag & FREAD) == 0) {
	4624	error = EBADF;
	4625	goto done;
	4626	}
	4627	vp = (struct vnode *)fp->f_data;
	4628	if (vp->v_type != VDIR) {
	4629	error = EINVAL;
	4630	goto done;
	4631	}
	4632	aiov.iov_base = buf;
	4633	aiov.iov_len = count;
	4634	auio.uio_iov = &aiov;
	4635	auio.uio_iovcnt = 1;
	4636	auio.uio_rw = UIO_READ;
	4637	auio.uio_segflg = direction;
	4638	auio.uio_td = td;
	4639	auio.uio_resid = count;
	4640	loff = auio.uio_offset = fp->f_offset;
	4641	error = VOP_READDIR_FP(vp, &auio, fp->f_cred, &eofflag, NULL, NULL, fp);
	4642	fp->f_offset = auio.uio_offset;
	4643	if (error)
	4644	goto done;
	4645
	4646	/*
	4647	* WARNING! *basep may not be wide enough to accomodate the
	4648	* seek offset. XXX should we hack this to return the upper 32 bits
	4649	* for offsets greater then 4G?
	4650	*/
	4651	if (basep) {
	4652	*basep = (long)loff;
	4653	}
	4654	*res = count - auio.uio_resid;
	4655	done:
	4656	fdrop(fp);
	4657	return (error);
	4658	}
	4659
	4660	/*
	4661	* getdirentries_args(int fd, char buf, u_int conut, long basep)
	4662	*
	4663	* Read a block of directory entries in a file system independent format.
	4664	*/
	4665	int
	4666	sys_getdirentries(struct sysmsg sysmsg, const struct getdirentries_args uap)
	4667	{
	4668	long base;
	4669	int error;
	4670
	4671	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
	4672	&sysmsg->sysmsg_result, UIO_USERSPACE);
	4673
	4674	if (error == 0 && uap->basep)
	4675	error = copyout(&base, uap->basep, sizeof(*uap->basep));
	4676	return (error);
	4677	}
	4678
	4679	/*
	4680	* getdents_args(int fd, char *buf, size_t count)
	4681	*/
	4682	int
	4683	sys_getdents(struct sysmsg sysmsg, const struct getdents_args uap)
	4684	{
	4685	int error;
	4686
	4687	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
	4688	&sysmsg->sysmsg_result, UIO_USERSPACE);
	4689
	4690	return (error);
	4691	}
	4692
	4693	/*
	4694	* Set the mode mask for creation of filesystem nodes.
	4695	*
	4696	* umask(int newmask)
	4697	*/
	4698	int
	4699	sys_umask(struct sysmsg sysmsg, const struct umask_args uap)
	4700	{
	4701	struct thread *td = curthread;
	4702	struct proc *p = td->td_proc;
	4703	struct filedesc *fdp;
	4704
	4705	fdp = p->p_fd;
	4706	sysmsg->sysmsg_result = fdp->fd_cmask;
	4707	fdp->fd_cmask = uap->newmask & ALLPERMS;
	4708	return (0);
	4709	}
	4710
	4711	/*
	4712	* revoke(char *path)
	4713	*
	4714	* Void all references to file by ripping underlying filesystem
	4715	* away from vnode.
	4716	*/
	4717	int
	4718	sys_revoke(struct sysmsg sysmsg, const struct revoke_args uap)
	4719	{
	4720	struct nlookupdata nd;
	4721	struct vattr vattr;
	4722	struct vnode *vp;
	4723	struct ucred *cred;
	4724	int error;
	4725
	4726	vp = NULL;
	4727	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	4728	if (error == 0)
	4729	error = nlookup(&nd);
	4730	if (error == 0)
	4731	error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	4732	cred = crhold(nd.nl_cred);
	4733	nlookup_done(&nd);
	4734	if (error == 0) {
	4735	if (error == 0)
	4736	error = VOP_GETATTR(vp, &vattr);
	4737	if (error == 0 && cred->cr_uid != vattr.va_uid)
	4738	error = caps_priv_check(cred, SYSCAP_NOVFS_REVOKE);
	4739	if (error == 0 && (vp->v_type == VCHR \|\| vp->v_type == VBLK)) {
	4740	if (vcount(vp) > 0)
	4741	error = vrevoke(vp, cred);
	4742	} else if (error == 0) {
	4743	error = vrevoke(vp, cred);
	4744	}
	4745	vrele(vp);
	4746	}
	4747	if (cred)
	4748	crfree(cred);
	4749	return (error);
	4750	}
	4751
	4752	/*
	4753	* getfh_args(char fname, fhandle_t fhp)
	4754	*
	4755	* Get (NFS) file handle
	4756	*
	4757	* NOTE: We use the fsid of the covering mount, even if it is a nullfs
	4758	* mount. This allows nullfs mounts to be explicitly exported.
	4759	*
	4760	* WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
	4761	*
	4762	* nullfs mounts of subdirectories are not safe. That is, it will
	4763	* work, but you do not really have protection against access to
	4764	* the related parent directories.
	4765	*/
	4766	int
	4767	sys_getfh(struct sysmsg sysmsg, const struct getfh_args uap)
	4768	{
	4769	struct nlookupdata nd;
	4770	fhandle_t fh;
	4771	struct vnode *vp;
	4772	struct mount *mp;
	4773	int error;
	4774
	4775	/*
	4776	* Must be super user
	4777	*/
	4778	if ((error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) != 0)
	4779	return (error);
	4780
	4781	vp = NULL;
	4782	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	4783	if (error == 0)
	4784	error = nlookup(&nd);
	4785	if (error == 0)
	4786	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	4787	mp = nd.nl_nch.mount;
	4788	nlookup_done(&nd);
	4789	if (error == 0) {
	4790	bzero(&fh, sizeof(fh));
	4791	fh.fh_fsid = mp->mnt_stat.f_fsid;
	4792	error = VFS_VPTOFH(vp, &fh.fh_fid);
	4793	vput(vp);
	4794	if (error == 0)
	4795	error = copyout(&fh, uap->fhp, sizeof(fh));
	4796	}
	4797	return (error);
	4798	}
	4799
	4800	/*
	4801	* fhopen_args(const struct fhandle *u_fhp, int flags)
	4802	*
	4803	* syscall for the rpc.lockd to use to translate a NFS file handle into
	4804	* an open descriptor.
	4805	*
	4806	* WARNING: Do not remove the caps_priv_check() call or this becomes
	4807	* one giant security hole.
	4808	*/
	4809	int
	4810	sys_fhopen(struct sysmsg sysmsg, const struct fhopen_args uap)
	4811	{
	4812	struct thread *td = curthread;
	4813	struct filedesc *fdp = td->td_proc->p_fd;
	4814	struct mount *mp;
	4815	struct vnode *vp;
	4816	struct fhandle fhp;
	4817	struct vattr vat;
	4818	struct vattr *vap = &vat;
	4819	struct flock lf;
	4820	int fmode, mode, error = 0, type;
	4821	struct file *nfp;
	4822	struct file *fp;
	4823	int indx;
	4824
	4825	/*
	4826	* Must be super user
	4827	*/
	4828	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
	4829	if (error)
	4830	return (error);
	4831
	4832	fmode = FFLAGS(uap->flags);
	4833
	4834	/*
	4835	* Why not allow a non-read/write open for our lockd?
	4836	*/
	4837	if (((fmode & (FREAD \| FWRITE)) == 0) \|\| (fmode & O_CREAT))
	4838	return (EINVAL);
	4839	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
	4840	if (error)
	4841	return(error);
	4842
	4843	/*
	4844	* Find the mount point
	4845	*/
	4846	mp = vfs_getvfs(&fhp.fh_fsid);
	4847	if (mp == NULL) {
	4848	error = ESTALE;
	4849	goto done2;
	4850	}
	4851	/* now give me my vnode, it gets returned to me locked */
	4852	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
	4853	if (error)
	4854	goto done;
	4855	/*
	4856	* from now on we have to make sure not
	4857	* to forget about the vnode
	4858	* any error that causes an abort must vput(vp)
	4859	* just set error = err and 'goto bad;'.
	4860	*/
	4861
	4862	/*
	4863	* from vn_open
	4864	*/
	4865	if (vp->v_type == VLNK) {
	4866	error = EMLINK;
	4867	goto bad;
	4868	}
	4869	if (vp->v_type == VSOCK) {
	4870	error = EOPNOTSUPP;
	4871	goto bad;
	4872	}
	4873	mode = 0;
	4874	if (fmode & (FWRITE \| O_TRUNC)) {
	4875	if (vp->v_type == VDIR) {
	4876	error = EISDIR;
	4877	goto bad;
	4878	}
	4879	error = vn_writechk(vp);
	4880	if (error)
	4881	goto bad;
	4882	mode \|= VWRITE;
	4883	}
	4884	if (fmode & FREAD)
	4885	mode \|= VREAD;
	4886	if (mode) {
	4887	error = VOP_ACCESS(vp, mode, td->td_ucred);
	4888	if (error)
	4889	goto bad;
	4890	}
	4891	if (fmode & O_TRUNC) {
	4892	vn_unlock(vp); /* XXX */
	4893	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY); /* XXX */
	4894	VATTR_NULL(vap);
	4895	vap->va_size = 0;
	4896	error = VOP_SETATTR(vp, vap, td->td_ucred);
	4897	if (error)
	4898	goto bad;
	4899	}
	4900
	4901	/*
	4902	* VOP_OPEN needs the file pointer so it can potentially override
	4903	* it.
	4904	*
	4905	* WARNING! no f_nchandle will be associated when fhopen()ing a
	4906	* directory. XXX
	4907	*/
	4908	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
	4909	goto bad;
	4910	error = VOP_OPEN(vp, fmode, td->td_ucred, &nfp);
	4911	fp = nfp;
	4912
	4913	if (error) {
	4914	/*
	4915	* setting f_ops this way prevents VOP_CLOSE from being
	4916	* called or fdrop() releasing the vp from v_data. Since
	4917	* the VOP_OPEN failed we don't want to VOP_CLOSE.
	4918	*/
	4919	fp->f_ops = &badfileops;
	4920	fp->f_data = NULL;
	4921	goto bad_drop;
	4922	}
	4923
	4924	/*
	4925	* The fp is given its own reference, we still have our ref and lock.
	4926	*
	4927	* Assert that all regular files must be created with a VM object.
	4928	*/
	4929	if (vp->v_type == VREG && vp->v_object == NULL) {
	4930	kprintf("fhopen: regular file did not "
	4931	"have VM object: %p\n",
	4932	vp);
	4933	goto bad_drop;
	4934	}
	4935
	4936	/*
	4937	* The open was successful. Handle any locking requirements.
	4938	*/
	4939	if (fmode & (O_EXLOCK \| O_SHLOCK)) {
	4940	lf.l_whence = SEEK_SET;
	4941	lf.l_start = 0;
	4942	lf.l_len = 0;
	4943	if (fmode & O_EXLOCK)
	4944	lf.l_type = F_WRLCK;
	4945	else
	4946	lf.l_type = F_RDLCK;
	4947	if (fmode & FNONBLOCK)
	4948	type = 0;
	4949	else
	4950	type = F_WAIT;
	4951	vn_unlock(vp);
	4952	if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK,
	4953	&lf, type)) != 0) {
	4954	/*
	4955	* release our private reference.
	4956	*/
	4957	fsetfd(fdp, NULL, indx);
	4958	fdrop(fp);
	4959	vrele(vp);
	4960	goto done;
	4961	}
	4962	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	4963	atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
	4964	}
	4965
	4966	/*
	4967	* Clean up. Associate the file pointer with the previously
	4968	* reserved descriptor and return it.
	4969	*/
	4970	vput(vp);
	4971	if (uap->flags & O_CLOEXEC)
	4972	fdp->fd_files[indx].fileflags \|= UF_EXCLOSE;
	4973	fsetfd(fdp, fp, indx);
	4974	fdrop(fp);
	4975	sysmsg->sysmsg_result = indx;
	4976	mount_drop(mp);
	4977
	4978	return (error);
	4979
	4980	bad_drop:
	4981	fsetfd(fdp, NULL, indx);
	4982	fdrop(fp);
	4983	bad:
	4984	vput(vp);
	4985	done:
	4986	mount_drop(mp);
	4987	done2:
	4988	return (error);
	4989	}
	4990
	4991	/*
	4992	* fhstat_args(struct fhandle u_fhp, struct stat sb)
	4993	*/
	4994	int
	4995	sys_fhstat(struct sysmsg sysmsg, const struct fhstat_args uap)
	4996	{
	4997	struct thread *td = curthread;
	4998	struct stat sb;
	4999	fhandle_t fh;
	5000	struct mount *mp;
	5001	struct vnode *vp;
	5002	int error;
	5003
	5004	/*
	5005	* Must be super user
	5006	*/
	5007	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
	5008	if (error)
	5009	return (error);
	5010
	5011	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
	5012	if (error)
	5013	return (error);
	5014
	5015	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
	5016	error = ESTALE;
	5017	if (error == 0) {
	5018	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
	5019	error = vn_stat(vp, &sb, td->td_ucred);
	5020	vput(vp);
	5021	}
	5022	}
	5023	if (error == 0)
	5024	error = copyout(&sb, uap->sb, sizeof(sb));
	5025	if (mp)
	5026	mount_drop(mp);
	5027
	5028	return (error);
	5029	}
	5030
	5031	/*
	5032	* fhstatfs_args(struct fhandle u_fhp, struct statfs buf)
	5033	*/
	5034	int
	5035	sys_fhstatfs(struct sysmsg sysmsg, const struct fhstatfs_args uap)
	5036	{
	5037	struct thread *td = curthread;
	5038	struct proc *p = td->td_proc;
	5039	struct statfs *sp;
	5040	struct mount *mp;
	5041	struct vnode *vp;
	5042	struct statfs sb;
	5043	char fullpath, freepath;
	5044	fhandle_t fh;
	5045	int error;
	5046
	5047	/*
	5048	* Must be super user
	5049	*/
	5050	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
	5051	if (error)
	5052	return (error);
	5053
	5054	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
	5055	return (error);
	5056
	5057	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
	5058	error = ESTALE;
	5059	goto done;
	5060	}
	5061	if (p != NULL && !chroot_visible_mnt(mp, p)) {
	5062	error = ESTALE;
	5063	goto done;
	5064	}
	5065
	5066	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
	5067	goto done;
	5068	mp = vp->v_mount;
	5069	sp = &mp->mnt_stat;
	5070	vput(vp);
	5071	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
	5072	goto done;
	5073
	5074	error = mount_path(p, mp, &fullpath, &freepath);
	5075	if (error)
	5076	goto done;
	5077	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	5078	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	5079	kfree(freepath, M_TEMP);
	5080
	5081	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	5082	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) {
	5083	bcopy(sp, &sb, sizeof(sb));
	5084	sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
	5085	sp = &sb;
	5086	}
	5087	error = copyout(sp, uap->buf, sizeof(*sp));
	5088	done:
	5089	if (mp)
	5090	mount_drop(mp);
	5091
	5092	return (error);
	5093	}
	5094
	5095	/*
	5096	* fhstatvfs_args(struct fhandle u_fhp, struct statvfs buf)
	5097	*/
	5098	int
	5099	sys_fhstatvfs(struct sysmsg sysmsg, const struct fhstatvfs_args uap)
	5100	{
	5101	struct thread *td = curthread;
	5102	struct proc *p = td->td_proc;
	5103	struct statvfs *sp;
	5104	struct mount *mp;
	5105	struct vnode *vp;
	5106	fhandle_t fh;
	5107	int error;
	5108
	5109	/*
	5110	* Must be super user
	5111	*/
	5112	if ((error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)))
	5113	return (error);
	5114
	5115	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
	5116	return (error);
	5117
	5118	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
	5119	error = ESTALE;
	5120	goto done;
	5121	}
	5122	if (p != NULL && !chroot_visible_mnt(mp, p)) {
	5123	error = ESTALE;
	5124	goto done;
	5125	}
	5126
	5127	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
	5128	goto done;
	5129	mp = vp->v_mount;
	5130	sp = &mp->mnt_vstat;
	5131	vput(vp);
	5132	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
	5133	goto done;
	5134
	5135	sp->f_flag = 0;
	5136	if (mp->mnt_flag & MNT_RDONLY)
	5137	sp->f_flag \|= ST_RDONLY;
	5138	if (mp->mnt_flag & MNT_NOSUID)
	5139	sp->f_flag \|= ST_NOSUID;
	5140	error = copyout(sp, uap->buf, sizeof(*sp));
	5141	done:
	5142	if (mp)
	5143	mount_drop(mp);
	5144	return (error);
	5145	}
	5146
	5147
	5148	/*
	5149	* Syscall to push extended attribute configuration information into the
	5150	* VFS. Accepts a path, which it converts to a mountpoint, as well as
	5151	* a command (int cmd), and attribute name and misc data. For now, the
	5152	* attribute name is left in userspace for consumption by the VFS_op.
	5153	* It will probably be changed to be copied into sysspace by the
	5154	* syscall in the future, once issues with various consumers of the
	5155	* attribute code have raised their hands.
	5156	*
	5157	* Currently this is used only by UFS Extended Attributes.
	5158	*/
	5159	int
	5160	sys_extattrctl(struct sysmsg sysmsg, const struct extattrctl_args uap)
	5161	{
	5162	struct nlookupdata nd;
	5163	struct vnode *vp;
	5164	char attrname[EXTATTR_MAXNAMELEN];
	5165	int error;
	5166	size_t size;
	5167
	5168	attrname[0] = 0;
	5169	vp = NULL;
	5170	error = 0;
	5171
	5172	if (error == 0 && uap->filename) {
	5173	error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
	5174	NLC_FOLLOW);
	5175	if (error == 0)
	5176	error = nlookup(&nd);
	5177	if (error == 0)
	5178	error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	5179	nlookup_done(&nd);
	5180	}
	5181
	5182	if (error == 0 && uap->attrname) {
	5183	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
	5184	&size);
	5185	}
	5186
	5187	if (error == 0) {
	5188	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	5189	if (error == 0)
	5190	error = nlookup(&nd);
	5191	if (error == 0)
	5192	error = ncp_writechk(&nd.nl_nch);
	5193	if (error == 0) {
	5194	error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
	5195	uap->attrnamespace,
	5196	uap->attrname, nd.nl_cred);
	5197	}
	5198	nlookup_done(&nd);
	5199	}
	5200
	5201	return (error);
	5202	}
	5203
	5204	/*
	5205	* Syscall to get a named extended attribute on a file or directory.
	5206	*/
	5207	int
	5208	sys_extattr_set_file(struct sysmsg *sysmsg,
	5209	const struct extattr_set_file_args *uap)
	5210	{
	5211	char attrname[EXTATTR_MAXNAMELEN];
	5212	struct nlookupdata nd;
	5213	struct vnode *vp;
	5214	struct uio auio;
	5215	struct iovec aiov;
	5216	int error;
	5217
	5218	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	5219	if (error)
	5220	return (error);
	5221
	5222	vp = NULL;
	5223
	5224	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	5225	if (error == 0)
	5226	error = nlookup(&nd);
	5227	if (error == 0)
	5228	error = ncp_writechk(&nd.nl_nch);
	5229	if (error == 0)
	5230	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	5231	if (error) {
	5232	nlookup_done(&nd);
	5233	return (error);
	5234	}
	5235
	5236	bzero(&auio, sizeof(auio));
	5237	aiov.iov_base = uap->data;
	5238	aiov.iov_len = uap->nbytes;
	5239	auio.uio_iov = &aiov;
	5240	auio.uio_iovcnt = 1;
	5241	auio.uio_offset = 0;
	5242	auio.uio_resid = uap->nbytes;
	5243	auio.uio_rw = UIO_WRITE;
	5244	auio.uio_td = curthread;
	5245
	5246	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
	5247	&auio, nd.nl_cred);
	5248
	5249	vput(vp);
	5250	nlookup_done(&nd);
	5251	return (error);
	5252	}
	5253
	5254	/*
	5255	* Syscall to get a named extended attribute on a file or directory.
	5256	*/
	5257	int
	5258	sys_extattr_get_file(struct sysmsg *sysmsg,
	5259	const struct extattr_get_file_args *uap)
	5260	{
	5261	char attrname[EXTATTR_MAXNAMELEN];
	5262	struct nlookupdata nd;
	5263	struct uio auio;
	5264	struct iovec aiov;
	5265	struct vnode *vp;
	5266	int error;
	5267
	5268	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	5269	if (error)
	5270	return (error);
	5271
	5272	vp = NULL;
	5273
	5274	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	5275	if (error == 0)
	5276	error = nlookup(&nd);
	5277	if (error == 0)
	5278	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
	5279	if (error) {
	5280	nlookup_done(&nd);
	5281	return (error);
	5282	}
	5283
	5284	bzero(&auio, sizeof(auio));
	5285	aiov.iov_base = uap->data;
	5286	aiov.iov_len = uap->nbytes;
	5287	auio.uio_iov = &aiov;
	5288	auio.uio_iovcnt = 1;
	5289	auio.uio_offset = 0;
	5290	auio.uio_resid = uap->nbytes;
	5291	auio.uio_rw = UIO_READ;
	5292	auio.uio_td = curthread;
	5293
	5294	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
	5295	&auio, nd.nl_cred);
	5296	sysmsg->sysmsg_result = uap->nbytes - auio.uio_resid;
	5297
	5298	vput(vp);
	5299	nlookup_done(&nd);
	5300	return(error);
	5301	}
	5302
	5303	/*
	5304	* Syscall to delete a named extended attribute from a file or directory.
	5305	* Accepts attribute name. The real work happens in VOP_SETEXTATTR().
	5306	*/
	5307	int
	5308	sys_extattr_delete_file(struct sysmsg *sysmsg,
	5309	const struct extattr_delete_file_args *uap)
	5310	{
	5311	char attrname[EXTATTR_MAXNAMELEN];
	5312	struct nlookupdata nd;
	5313	struct vnode *vp;
	5314	int error;
	5315
	5316	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	5317	if (error)
	5318	return(error);
	5319
	5320	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	5321	if (error == 0)
	5322	error = nlookup(&nd);
	5323	if (error == 0)
	5324	error = ncp_writechk(&nd.nl_nch);
	5325	if (error == 0) {
	5326	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	5327	if (error == 0) {
	5328	error = VOP_SETEXTATTR(vp, uap->attrnamespace,
	5329	attrname, NULL, nd.nl_cred);
	5330	vput(vp);
	5331	}
	5332	}
	5333	nlookup_done(&nd);
	5334	return(error);
	5335	}
	5336
	5337	/*
	5338	* Determine if the mount is visible to the process.
	5339	*/
	5340	static int
	5341	chroot_visible_mnt(struct mount mp, struct proc p)
	5342	{
	5343	struct nchandle nch;
	5344
	5345	/*
	5346	* Traverse from the mount point upwards. If we hit the process
	5347	* root then the mount point is visible to the process.
	5348	*/
	5349	nch = mp->mnt_ncmountpt;
	5350	while (nch.ncp) {
	5351	if (nch.mount == p->p_fd->fd_nrdir.mount &&
	5352	nch.ncp == p->p_fd->fd_nrdir.ncp) {
	5353	return(1);
	5354	}
	5355	if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
	5356	nch = nch.mount->mnt_ncmounton;
	5357	} else {
	5358	nch.ncp = nch.ncp->nc_parent;
	5359	}
	5360	}
	5361
	5362	/*
	5363	* If the mount point is not visible to the process, but the
	5364	* process root is in a subdirectory of the mount, return
	5365	* TRUE anyway.
	5366	*/
	5367	if (p->p_fd->fd_nrdir.mount == mp)
	5368	return(1);
	5369
	5370	return(0);
	5371	}
	5372
	5373	/*
	5374	* Return the appropriate system capability restriction.
	5375	*/
	5376	static int
	5377	get_fscap(const char *fsname)
	5378	{
	5379
	5380	if (strncmp("null", fsname, 5) == 0) {
	5381	return SYSCAP_NOMOUNT_NULLFS;
	5382	} else if (strncmp(fsname, "devfs", 6) == 0) {
	5383	return SYSCAP_NOMOUNT_DEVFS;
	5384	} else if (strncmp(fsname, "procfs", 7) == 0) {
	5385	return SYSCAP_NOMOUNT_PROCFS;
	5386	} else if (strncmp(fsname, "tmpfs", 6) == 0) {
	5387	return SYSCAP_NOMOUNT_TMPFS;
	5388	} else if (strncmp(fsname, "fusefs", 7) == 0) {
	5389	return SYSCAP_NOMOUNT_FUSE;
	5390	}
	5391	return SYSCAP_RESTRICTEDROOT;
	5392	}
	5393
	5394	int
	5395	sys___realpath(struct sysmsg sysmsg, const struct __realpath_args uap)
	5396	{
	5397	struct nlookupdata nd;
	5398	char *rbuf;
	5399	char *fbuf;
	5400	ssize_t rlen;
	5401	int error;
	5402
	5403	/*
	5404	* Invalid length if less than 0. 0 is allowed
	5405	*/
	5406	if ((ssize_t)uap->len < 0)
	5407	return EINVAL;
	5408
	5409	rbuf = NULL;
	5410	fbuf = NULL;
	5411	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	5412	if (error)
	5413	goto done;
	5414
	5415	nd.nl_flags \|= NLC_SHAREDLOCK;
	5416	error = nlookup(&nd);
	5417	if (error)
	5418	goto done;
	5419
	5420	if (nd.nl_nch.ncp->nc_vp == NULL) {
	5421	error = ENOENT;
	5422	goto done;
	5423	}
	5424
	5425	/*
	5426	* Shortcut test for existence.
	5427	*/
	5428	if (uap->len == 0) {
	5429	error = ENAMETOOLONG;
	5430	goto done;
	5431	}
	5432
	5433	/*
	5434	* Obtain the path relative to the process root. The nch must not
	5435	* be locked for the cache_fullpath() call.
	5436	*/
	5437	if (nd.nl_flags & NLC_NCPISLOCKED) {
	5438	nd.nl_flags &= ~NLC_NCPISLOCKED;
	5439	cache_unlock(&nd.nl_nch);
	5440	}
	5441	error = cache_fullpath(curproc, &nd.nl_nch, NULL, &rbuf, &fbuf, 0);
	5442	if (error)
	5443	goto done;
	5444
	5445	rlen = (ssize_t)strlen(rbuf);
	5446	if (rlen >= uap->len) {
	5447	error = ENAMETOOLONG;
	5448	goto done;
	5449	}
	5450	error = copyout(rbuf, uap->buf, rlen + 1);
	5451	if (error == 0)
	5452	sysmsg->sysmsg_szresult = rlen;
	5453	done:
	5454	nlookup_done(&nd);
	5455	if (fbuf)
	5456	kfree(fbuf, M_TEMP);
	5457
	5458	return error;
	5459	}
	5460
	5461	int
	5462	sys_posix_fallocate(struct sysmsg sysmsg, const struct posix_fallocate_args uap)
	5463	{
	5464	return (kern_posix_fallocate(uap->fd, uap->offset, uap->len));
	5465	}
	5466
	5467	int
	5468	kern_posix_fallocate(int fd, off_t offset, off_t len)
	5469	{
	5470	struct thread *td = curthread;
	5471	struct vnode *vp;
	5472	struct file *fp;
	5473	int error;
	5474
	5475	if (offset < 0 \|\| len <= 0)
	5476	return (EINVAL);
	5477	/* Check for wrap. */
	5478	if (offset > OFF_MAX - len)
	5479	return (EFBIG);
	5480
	5481	fp = holdfp(td, fd, -1);
	5482	if (fp == NULL)
	5483	return (EBADF);
	5484
	5485	switch (fp->f_type) {
	5486	case DTYPE_VNODE:
	5487	break;
	5488	case DTYPE_PIPE:
	5489	case DTYPE_FIFO:
	5490	error = ESPIPE;
	5491	goto out;
	5492	default:
	5493	error = ENODEV;
	5494	goto out;
	5495	}
	5496
	5497	if ((fp->f_flag & FWRITE) == 0) {
	5498	error = EBADF;
	5499	goto out;
	5500	}
	5501
	5502	vp = fp->f_data;
	5503	if (vp->v_type != VREG) {
	5504	error = ENODEV;
	5505	goto out;
	5506	}
	5507
	5508	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	5509	error = VOP_ALLOCATE(vp, offset, len);
	5510	vn_unlock(vp);
	5511	out:
	5512	dropfp(td, fd, fp);
	5513	return (error);
	5514	}