HAMMER - Add live dedup sysctl and support
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include <sys/mplock2.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default = vop_defaultop,
        .vop_fsync = hammer_vop_fsync,
        .vop_getpages = vop_stdgetpages,
        .vop_putpages = vop_stdputpages,
        .vop_read = hammer_vop_read,
        .vop_write = hammer_vop_write,
        .vop_access = hammer_vop_access,
        .vop_advlock = hammer_vop_advlock,
        .vop_close = hammer_vop_close,
        .vop_ncreate = hammer_vop_ncreate,
        .vop_getattr = hammer_vop_getattr,
        .vop_inactive = hammer_vop_inactive,
        .vop_reclaim = hammer_vop_reclaim,
        .vop_nresolve = hammer_vop_nresolve,
        .vop_nlookupdotdot = hammer_vop_nlookupdotdot,
        .vop_nlink = hammer_vop_nlink,
        .vop_nmkdir = hammer_vop_nmkdir,
        .vop_nmknod = hammer_vop_nmknod,
        .vop_open = hammer_vop_open,
        .vop_pathconf = vop_stdpathconf,
        .vop_print = hammer_vop_print,
        .vop_readdir = hammer_vop_readdir,
        .vop_readlink = hammer_vop_readlink,
        .vop_nremove = hammer_vop_nremove,
        .vop_nrename = hammer_vop_nrename,
        .vop_nrmdir = hammer_vop_nrmdir,
        .vop_markatime = hammer_vop_markatime,
        .vop_setattr = hammer_vop_setattr,
        .vop_bmap = hammer_vop_bmap,
        .vop_strategy = hammer_vop_strategy,
        .vop_nsymlink = hammer_vop_nsymlink,
        .vop_nwhiteout = hammer_vop_nwhiteout,
        .vop_ioctl = hammer_vop_ioctl,
        .vop_mountctl = hammer_vop_mountctl,
        .vop_kqfilter = hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
        .vop_default = vop_defaultop,
        .vop_fsync = hammer_vop_fsync,
        .vop_read = vop_stdnoread,
        .vop_write = vop_stdnowrite,
        .vop_access = hammer_vop_access,
        .vop_close = hammer_vop_close,
        .vop_markatime = hammer_vop_markatime,
        .vop_getattr = hammer_vop_getattr,
        .vop_inactive = hammer_vop_inactive,
        .vop_reclaim = hammer_vop_reclaim,
        .vop_setattr = hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default = fifo_vnoperate,
        .vop_fsync = hammer_vop_fsync,
        .vop_read = hammer_vop_fiforead,
        .vop_write = hammer_vop_fifowrite,
        .vop_access = hammer_vop_access,
        .vop_close = hammer_vop_fifoclose,
        .vop_markatime = hammer_vop_markatime,
        .vop_getattr = hammer_vop_getattr,
        .vop_inactive = hammer_vop_inactive,
        .vop_reclaim = hammer_vop_reclaim,
        .vop_setattr = hammer_vop_setattr,
        .vop_kqfilter = hammer_vop_fifokqfilter
};

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                        struct vnode *dvp, struct ucred *cred,
                        int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log. A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it isn't
 *       here yet. In any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily. The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}
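
/*
 * Illustrative summary of the mode table handled above (a sketch, not
 * authoritative; the switch in hammer_vop_fsync() is the real behavior).
 * Assuming the hammer_fsync_mode global is exported as the sysctl
 * vfs.hammer.fsync_mode, an administrator would select a mode with:
 *
 *      sysctl vfs.hammer.fsync_mode=0  # no REDO, full synchronous flush
 *      sysctl vfs.hammer.fsync_mode=1  # no REDO, full asynchronous flush
 *      sysctl vfs.hammer.fsync_mode=2  # REDO, synchronous UNDO/REDO flush
 *      sysctl vfs.hammer.fsync_mode=3  # REDO, relaxed asynchronous flush
 *      sysctl vfs.hammer.fsync_mode=4  # fsync() is ignored entirely
 *
 * Modes 2 and 3 silently fall back to 0 and 1 on volumes older than
 * HAMMER_VOL_VERSION_FOUR, as coded above.
 */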

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (the cached-data path does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_fstoken;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
        ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;
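        /*
         * Worked example of the heuristic above (values assumed for
         * illustration): a 256KB read with a 16KB BKVASIZE yields
         * seqcount = (262144 + 16383) / 16384 = 16, so a large UIO can
         * request more read-ahead than the file descriptor's sequential
         * heuristic (ap->a_ioflag >> 16) alone would grant.
         */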

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DoS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);
        got_fstoken = 0;

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE. NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
                        break;

                /*
                 * MPSAFE
                 */
                bp = getcacheblk(ap->a_vp, base_offset);
                if (bp) {
                        error = 0;
                        goto skip;
                }

                /*
                 * MPUNSAFE
                 */
                if (got_fstoken == 0) {
                        lwkt_gettoken(&hmp->fs_token);
                        got_fstoken = 1;
                        hammer_start_transaction(&trans, ip->hmp);
                }

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
skip:
                if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
                        kprintf("doff %016jx read file %016jx@%016jx\n",
                                (intmax_t)bp->b_bio2.bio_offset,
                                (intmax_t)ip->obj_id,
                                (intmax_t)bp->b_loffset);
                }
                bp->b_flags &= ~B_IODEBUG;

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);

                /* data has a lower priority than meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
                hammer_stats_file_read += n;
        }

        /*
         * XXX only update the atime if we had to get the MP lock.
         * XXX hack hack hack, fixme.
         */
        if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
}
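
/*
 * Sketch of the variable block size rule the loop above relies on. This
 * mirrors (but is not a literal copy of) the real hammer_blocksize()/
 * hammer_blockoff() inlines: offsets below the 1GB demarcation
 * (HAMMER_XDEMARC) use small HAMMER_BUFSIZE (16KB) buffers and offsets
 * at or beyond it use large HAMMER_XBUFSIZE (64KB) buffers, which is
 * why cluster_read() above must be clipped at the demarc.
 *
 *      static __inline int
 *      sketch_blocksize(int64_t file_offset)
 *      {
 *              if (file_offset < HAMMER_XDEMARC)
 *                      return (HAMMER_BUFSIZE);        // 16KB
 *              return (HAMMER_XBUFSIZE);               // 64KB
 *      }
 */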

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;
        off_t base_offset;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets. Valid range is 0...2^63-1.
         *
         * NOTE: the base_offset assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DoS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache. Very
                 * large UIOs can lock out other processes due to
                 * bwillwrite() mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache. Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Control the number of pending records associated with
                 * this inode. If too many have accumulated start a
                 * flush. Try to maintain a pipeline with the flusher.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                }
                if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                        while (ip->rsv_recs >= hammer_limit_inode_recs) {
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                        }
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                }

#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records. Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
#endif

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
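                /*
                 * Worked example of the mask arithmetic above, assuming a
                 * 16KB block (blkmask 0x3fff): for uio_offset 20000,
                 * offset becomes 20000 & 0x3fff = 3616, base_offset
                 * becomes 16384, and n = 16384 - 3616 = 12768, so the
                 * write stops at the buffer boundary; endofblk is 0 only
                 * when the residual UIO ends before that boundary.
                 */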
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer. Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize,
                                    GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0)
                        error = uiomove(bp->b_data + offset, n, uio);

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO. This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked. The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid. HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads. So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING! blksize is variable. cluster_write() is
                 *          expected to not blow up if it encounters
                 *          buffers that do not match the passed blksize.
                 *
                 * NOTE! Hammer shouldn't need to bawrite()/cluster_write().
                 *       The ip->rsv_recs check should burst-flush the data.
                 *       If we queue it immediately the buf could be left
                 *       locked on the device queue for a very long time.
                 *
                 * NOTE! To avoid degenerate stalls due to mismatched block
                 *       sizes we only honor IO_DIRECT on the write which
                 *       abuts the end of the buffer. However, we must
                 *       honor IO_SYNC in case someone is silly enough to
                 *       configure a HAMMER file as swap, or when HAMMER
                 *       is serving NFS (for commits). Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else {
#if 0
                if (offset + n == blksize) {
                        if (hammer_cluster_enable == 0 ||
                            (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                                bawrite(bp);
                        } else {
                                cluster_write(bp, ip->ino_data.size,
                                              blksize, seqcount);
                        }
                } else {
#endif
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes. XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type. The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory. This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information. When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices). HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here. The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks. The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
         * or for MAX_TID is "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                        vap->va_size = 26;
                else
                        vap->va_size = 10;
        }
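        /*
         * The sizes above are just the strlen() of the expansion
         * formats: "@@0x%016llx:%05d" expands to 2 + 18 + 1 + 5 = 26
         * bytes for a slave (16 hex digits plus "0x"), while the
         * master form "@@-1:%05d" expands to 5 + 5 = 10 bytes.
         */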

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }
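        /*
         * Worked examples of the rounding above, assuming HAMMER_BUFSIZE
         * is 16KB and HAMMER_XBUFSIZE is 64KB: a 100 byte file reports
         * va_bytes = (100 + 15) & ~15 = 112; a 100000 byte file reports
         * (100000 + 16383) & ~16383 = 114688; files at or past the 1GB
         * demarc round up to a 64KB multiple instead.
         */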

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        int ispfs;
        int64_t obj_id;
        u_int32_t localization;
        u_int32_t max_iterations;

        /*
         * Misc initialization, plus handle as-of name extensions. Look for
         * the '@@' extension. Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        localization = dip->obj_localization;   /* for code consistency */
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
        hmp = dip->hmp;

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        error = hammer_str_to_tid(ncp->nc_name + i + 2,
                                                  &ispfs, &asof,
                                                  &localization);
                        if (error != 0) {
                                i = nlen;
                                break;
                        }
                        if (asof != HAMMER_MAX_TID)
                                flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If this is a PFS softlink we dive into the PFS
         */
        if (ispfs && nlen == 0) {
                ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
                                      asof, localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * If there is no path component the time extension is relative to dip.
         * e.g. "fubar/@@<snapshot>"
         *
         * "." is handled by the kernel, but ".@@<snapshot>" is not.
         * e.g. "fubar/.@@<snapshot>"
         *
         * ".." is handled by the kernel. We do not currently handle
         * "..@@<snapshot>".
         */
        if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
                ip = hammer_get_inode(&trans, dip, dip->obj_id,
                                      asof, dip->obj_localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan. This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
                                           &max_iterations);
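        /*
         * Illustrative reading of the key range (an interpretation based
         * on the comment above, not a literal dump of the hash
         * algorithm): the upper bits of namekey hash the name while the
         * low bits act as the collision chain, so scanning the inclusive
         * range [namekey, namekey + max_iterations] visits every
         * directory entry whose name collides on the same hash.
         */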

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg.localization = dip->obj_localization +
                                      hammer_dir_localization(dip);
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key += max_iterations;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;
        localization = HAMMER_DEF_LOCALIZATION;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len -
                                    HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name,
                                 nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                localization = cursor.data->entry.localization;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);

        /*
         * Lookup the obj_id. This should always succeed. If it does not
         * the filesystem may be damaged and we return a dummy inode.
         */
        if (error == 0) {
                ip = hammer_get_inode(&trans, dip, obj_id,
                                      asof, localization,
                                      flags, &error);
                if (error == ENOENT) {
                        kprintf("HAMMER: WARNING: Missing "
                                "inode for dirent \"%s\"\n"
                                "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
                                ncp->nc_name,
                                (long long)obj_id, (long long)asof,
                                localization);
                        error = 0;
                        ip = hammer_get_dummy_inode(&trans, dip, obj_id,
                                                    asof, localization,
                                                    flags, &error);
                }
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked. *vpp must be returned referenced and
 * locked. A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure. If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof. I'm not sure what this
 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        int64_t parent_obj_id;
        u_int32_t parent_obj_localization;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        hmp = dip->hmp;

        /*
         * Who is our parent? This could be the root of a pseudo-filesystem
         * whose parent is in another localization domain.
         */
        lwkt_gettoken(&hmp->fs_token);
        parent_obj_id = dip->ino_data.parent_obj_id;
        if (dip->obj_id == HAMMER_OBJID_ROOT)
                parent_obj_localization =
                        dip->ino_data.ext.obj.parent_obj_localization;
        else
                parent_obj_localization = dip->obj_localization;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                    asof != hmp->asof) {
                        parent_obj_id = dip->obj_id;
                        asof = hmp->asof;
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                  (long long)dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
                        lwkt_reltoken(&hmp->fs_token);
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        ip = hammer_get_inode(&trans, dip, parent_obj_id,
                              asof, parent_obj_localization,
                              dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
                return(EXDEV);

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);
        hmp = dip->hmp;

        if (dip->obj_localization != ip->obj_localization)
                return(EXDEV);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Add the filesystem object to the directory. Note that neither
         * dip nor ip are referenced or locked, but their vnodes are
         * referenced. This function will bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        ip);

        /*
         * Finish up.
         */
        if (error == 0) {
                cache_setunresolved(nch);
                cache_setvp(nch, ap->a_vp);
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, NOTE_LINK);
        hammer_knote(ap->a_dvp, NOTE_WRITE);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type. The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
        /*
         * Add the new filesystem object to the directory. This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_mkdir (add) error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type. The
         * returned inode will be referenced but not locked.
         *
         * If mknod specifies a directory a pseudo-fs is created.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory. This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
        hammer_inode_t ip;

        ++hammer_stats_file_iopsr;
        ip = VTOI(ap->a_vp);

        if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
                return (EROFS);
        return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        hammer_base_elm_t base;
        int error;
        int cookie_index;
        int ncookies;
        off_t *cookies;
        off_t saveoff;
        int r;
        int dtype;

        ++hammer_stats_file_iopsr;
        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;
        hmp = ip->hmp;

        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
                cookie_index = 0;
        } else {
                ncookies = -1;
                cookies = NULL;
                cookie_index = 0;
        }

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);

        /*
         * Handle artificial entries
         *
         * It should be noted that the minimum value for a directory
         * hash key on-media is 0x0000000100000000, so we can use anything
         * less than that to represent our 'special' key space.
         */
        error = 0;
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
        if (saveoff == 1) {
                if (ip->ino_data.parent_obj_id) {
                        r = vop_write_dirent(&error, uio,
                                             ip->ino_data.parent_obj_id,
                                             DT_DIR, 2, "..");
                } else {
                        r = vop_write_dirent(&error, uio,
                                             ip->obj_id, DT_DIR, 2, "..");
                }
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        /*
         * Key range (begin and end inclusive) to scan. Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
        cursor.key_beg.localization = ip->obj_localization +
                                      hammer_dir_localization(ip);
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = saveoff;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor);

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                base = &cursor.leaf->base;
                saveoff = base->key;
                KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

                if (base->obj_id != ip->obj_id)
                        panic("readdir: bad record at %p", cursor.node);

                /*
                 * Convert pseudo-filesystems into softlinks
                 */
                dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
                r = vop_write_dirent(
                        &error, uio, cursor.data->entry.obj_id,
                        dtype,
                        cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
                        (void *)cursor.data->entry.name);
                if (r)
                        break;
                ++saveoff;
                if (cookies)
                        cookies[cookie_index] = base->key;
                ++cookie_index;
                if (cookie_index == ncookies)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);

done:
        hammer_done_transaction(&trans);

        if (ap->a_eofflag)
                *ap->a_eofflag = (error == ENOENT);
        uio->uio_offset = saveoff;
        if (error && cookie_index == 0) {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        lwkt_reltoken(&hmp->fs_token);
        return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        char buf[32];
        u_int32_t localization;
        hammer_pseudofs_inmem_t pfsm;
        int error;

        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Shortcut if the symlink data was stuffed into ino_data.
         *
         * Also expand special "@@PFS%05d" softlinks (expansion only
         * occurs for non-historical (current) accesses made from the
         * primary filesystem).
         */
        if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
                char *ptr;
                int bytes;

                ptr = ip->ino_data.ext.symlink;
                bytes = (int)ip->ino_data.size;
                if (bytes == 10 &&
                    ip->obj_asof == HAMMER_MAX_TID &&
                    ip->obj_localization == 0 &&
                    strncmp(ptr, "@@PFS", 5) == 0) {
                        hammer_simple_transaction(&trans, hmp);
                        bcopy(ptr + 5, buf, 5);
                        buf[5] = 0;
                        localization = strtoul(buf, NULL, 10) << 16;
                        pfsm = hammer_load_pseudofs(&trans, localization,
                                                    &error);
                        if (error == 0) {
                                if (pfsm->pfsd.mirror_flags &
                                    HAMMER_PFSD_SLAVE) {
                                        /* vap->va_size == 26 */
                                        ksnprintf(buf, sizeof(buf),
                                                  "@@0x%016llx:%05d",
                                                  (long long)pfsm->pfsd.sync_end_tid,
                                                  localization >> 16);
                                } else {
                                        /* vap->va_size == 10 */
                                        ksnprintf(buf, sizeof(buf),
                                                  "@@-1:%05d",
                                                  localization >> 16);
#if 0
                                        ksnprintf(buf, sizeof(buf),
                                                  "@@0x%016llx:%05d",
                                                  (long long)HAMMER_MAX_TID,
                                                  localization >> 16);
#endif
                                }
                                ptr = buf;
                                bytes = strlen(buf);
                        }
                        if (pfsm)
                                hammer_rel_pseudofs(hmp, pfsm);
                        hammer_done_transaction(&trans);
                }
                error = uiomove(ptr, bytes, ap->a_uio);
                lwkt_reltoken(&hmp->fs_token);
                return(error);
        }
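        /*
         * Example of the expansion above (illustrative values): a
         * softlink body of "@@PFS00003" yields localization 3 << 16;
         * a master PFS then reads back as "@@-1:00003" while a slave
         * reads back as "@@0x<sync_end_tid>:00003", matching the 10
         * and 26 byte sizes faked by hammer_vop_getattr().
         */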

        /*
         * Long version
         */
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

        /*
         * Key range (begin and end inclusive) to scan. Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        cursor.key_beg.localization = ip->obj_localization +
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor);
        if (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error == 0) {
                        KKASSERT(cursor.leaf->data_len >=
                                 HAMMER_SYMLINK_NAME_OFF);
                        error = uiomove(cursor.data->symlink.name,
                                        cursor.leaf->data_len -
                                        HAMMER_SYMLINK_NAME_OFF,
                                        ap->a_uio);
                }
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        hammer_mount_t hmp;
        int error;

        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
                return (error);
        }

        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}
1896
1897/*
1898 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1899 */
1900static
1901int
1902hammer_vop_nrename(struct vop_nrename_args *ap)
1903{
1904 struct hammer_transaction trans;
1905 struct namecache *fncp;
1906 struct namecache *tncp;
1907 struct hammer_inode *fdip;
1908 struct hammer_inode *tdip;
1909 struct hammer_inode *ip;
1910 hammer_mount_t hmp;
1911 struct hammer_cursor cursor;
1912 int64_t namekey;
1913 u_int32_t max_iterations;
1914 int nlen, error;
1915
1916 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1917 return(EXDEV);
1918 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1919 return(EXDEV);
1920
1921 fdip = VTOI(ap->a_fdvp);
1922 tdip = VTOI(ap->a_tdvp);
1923 fncp = ap->a_fnch->ncp;
1924 tncp = ap->a_tnch->ncp;
1925 ip = VTOI(fncp->nc_vp);
1926 KKASSERT(ip != NULL);
1927
1928 hmp = ip->hmp;
1929
1930 if (fdip->obj_localization != tdip->obj_localization)
1931 return(EXDEV);
1932 if (fdip->obj_localization != ip->obj_localization)
1933 return(EXDEV);
1934
1935 if (fdip->flags & HAMMER_INODE_RO)
1936 return (EROFS);
1937 if (tdip->flags & HAMMER_INODE_RO)
1938 return (EROFS);
1939 if (ip->flags & HAMMER_INODE_RO)
1940 return (EROFS);
1941 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1942 return (error);
1943
1944 lwkt_gettoken(&hmp->fs_token);
1945 hammer_start_transaction(&trans, hmp);
1946 ++hammer_stats_file_iopsw;
1947
1948 /*
1949 * Remove tncp from the target directory and then link ip as
1950 * tncp. XXX pass trans to dounlink
1951 *
1952 * Force the inode sync-time to match the transaction so it is
1953 * in-sync with the creation of the target directory entry.
1954 */
1955 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1956 ap->a_cred, 0, -1);
1957 if (error == 0 || error == ENOENT) {
1958 error = hammer_ip_add_directory(&trans, tdip,
1959 tncp->nc_name, tncp->nc_nlen,
1960 ip);
1961 if (error == 0) {
1962 ip->ino_data.parent_obj_id = tdip->obj_id;
1963 ip->ino_data.ctime = trans.time;
1964 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1965 }
1966 }
1967 if (error)
1968 goto failed; /* XXX */
1969
1970 /*
1971 * Locate the record in the originating directory and remove it.
1972 *
1973 * Calculate the namekey and setup the key range for the scan. This
1974 * works kinda like a chained hash table where the lower 32 bits
1975 * of the namekey synthesize the chain.
1976 *
1977 * The key range is inclusive of both key_beg and key_end.
1978 */
1979 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1980 &max_iterations);
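	/*
	 * Illustrative sketch with hypothetical values: if the name hashes
	 * to namekey 0x1234567800000000 with max_iterations 0x00FFFFFF,
	 * the inclusive scan below covers keys
	 *
	 *	0x1234567800000000 - 0x1234567800FFFFFF
	 *
	 * i.e. every directory entry whose upper hash bits match, with
	 * the low bits acting as the collision chain.
	 */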
1981retry:
1982 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1983 cursor.key_beg.localization = fdip->obj_localization +
1984 hammer_dir_localization(fdip);
1985 cursor.key_beg.obj_id = fdip->obj_id;
1986 cursor.key_beg.key = namekey;
1987 cursor.key_beg.create_tid = 0;
1988 cursor.key_beg.delete_tid = 0;
1989 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1990 cursor.key_beg.obj_type = 0;
1991
1992 cursor.key_end = cursor.key_beg;
1993 cursor.key_end.key += max_iterations;
1994 cursor.asof = fdip->obj_asof;
1995 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1996
1997 /*
1998 * Scan all matching records (the chain), locate the one matching
1999 * the requested path component.
2000 *
2001 * The hammer_ip_*() functions merge in-memory records with on-disk
2002 * records for the purposes of the search.
2003 */
2004 error = hammer_ip_first(&cursor);
2005 while (error == 0) {
2006 if (hammer_ip_resolve_data(&cursor) != 0)
2007 break;
2008 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2009 KKASSERT(nlen > 0);
2010 if (fncp->nc_nlen == nlen &&
2011 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2012 break;
2013 }
2014 error = hammer_ip_next(&cursor);
2015 }
2016
2017 /*
2018 * If all is ok we have to get the inode so we can adjust nlinks.
2019 *
2020 * WARNING: hammer_ip_del_directory() may have to terminate the
2021 * cursor to avoid a recursion. It's ok to call hammer_done_cursor()
2022 * twice.
2023 */
2024 if (error == 0)
2025 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2026
2027 /*
2028	 * XXX A deadlock here will break rename's atomicity for the purposes
2029 * of crash recovery.
2030 */
2031 if (error == EDEADLK) {
2032 hammer_done_cursor(&cursor);
2033 goto retry;
2034 }
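	/*
	 * Editorial note: the retry path rebuilds the cursor from scratch
	 * and rescans the directory chain from namekey; the dounlink and
	 * add_directory steps above are not repeated.
	 */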
2035
2036 /*
2037 * Cleanup and tell the kernel that the rename succeeded.
2038 *
2039 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2040 * without formally acquiring the vp since the vp might
2041 * have zero refs on it, or in the middle of a reclaim,
2042 * etc.
2043 */
2044 hammer_done_cursor(&cursor);
2045 if (error == 0) {
2046 cache_rename(ap->a_fnch, ap->a_tnch);
2047 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2048 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2049 while (ip->vp) {
2050 struct vnode *vp;
2051
2052 error = hammer_get_vnode(ip, &vp);
2053 if (error == 0 && vp) {
2054 vn_unlock(vp);
2055 hammer_knote(ip->vp, NOTE_RENAME);
2056 vrele(vp);
2057 break;
2058 }
2059 kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2060 }
2061 }
2062
2063failed:
2064 hammer_done_transaction(&trans);
2065 lwkt_reltoken(&hmp->fs_token);
2066 return (error);
2067}
2068
2069/*
2070 * hammer_vop_nrmdir { nch, dvp, cred }
2071 */
2072static
2073int
2074hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2075{
2076 struct hammer_transaction trans;
2077 struct hammer_inode *dip;
2078 hammer_mount_t hmp;
2079 int error;
2080
2081 dip = VTOI(ap->a_dvp);
2082 hmp = dip->hmp;
2083
2084 if (hammer_nohistory(dip) == 0 &&
2085 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2086 return (error);
2087 }
2088
2089 lwkt_gettoken(&hmp->fs_token);
2090 hammer_start_transaction(&trans, hmp);
2091 ++hammer_stats_file_iopsw;
2092 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2093 hammer_done_transaction(&trans);
2094 if (error == 0)
2095 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2096 lwkt_reltoken(&hmp->fs_token);
2097 return (error);
2098}
2099
2100/*
2101 * hammer_vop_markatime { vp, cred }
2102 */
2103static
2104int
2105hammer_vop_markatime(struct vop_markatime_args *ap)
2106{
2107 struct hammer_transaction trans;
2108 struct hammer_inode *ip;
2109 hammer_mount_t hmp;
2110
2111 ip = VTOI(ap->a_vp);
2112 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2113 return (EROFS);
2114 if (ip->flags & HAMMER_INODE_RO)
2115 return (EROFS);
2116 hmp = ip->hmp;
2117 if (hmp->mp->mnt_flag & MNT_NOATIME)
2118 return (0);
2119 lwkt_gettoken(&hmp->fs_token);
2120 hammer_start_transaction(&trans, hmp);
2121 ++hammer_stats_file_iopsw;
2122
2123 ip->ino_data.atime = trans.time;
2124 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2125 hammer_done_transaction(&trans);
2126 hammer_knote(ap->a_vp, NOTE_ATTRIB);
2127 lwkt_reltoken(&hmp->fs_token);
2128 return (0);
2129}
2130
2131/*
2132 * hammer_vop_setattr { vp, vap, cred }
2133 */
2134static
2135int
2136hammer_vop_setattr(struct vop_setattr_args *ap)
2137{
2138 struct hammer_transaction trans;
2139 struct hammer_inode *ip;
2140 struct vattr *vap;
2141 hammer_mount_t hmp;
2142 int modflags;
2143 int error;
2144 int truncating;
2145 int blksize;
2146 int kflags;
2147#if 0
2148 int64_t aligned_size;
2149#endif
2150 u_int32_t flags;
2151
2152 vap = ap->a_vap;
2153 ip = ap->a_vp->v_data;
2154 modflags = 0;
2155 kflags = 0;
2156 hmp = ip->hmp;
2157
2158 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2159 return(EROFS);
2160 if (ip->flags & HAMMER_INODE_RO)
2161 return (EROFS);
2162 if (hammer_nohistory(ip) == 0 &&
2163 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2164 return (error);
2165 }
2166
2167 lwkt_gettoken(&hmp->fs_token);
2168 hammer_start_transaction(&trans, hmp);
2169 ++hammer_stats_file_iopsw;
2170 error = 0;
2171
2172 if (vap->va_flags != VNOVAL) {
2173 flags = ip->ino_data.uflags;
2174 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2175 hammer_to_unix_xid(&ip->ino_data.uid),
2176 ap->a_cred);
2177 if (error == 0) {
2178 if (ip->ino_data.uflags != flags) {
2179 ip->ino_data.uflags = flags;
2180 ip->ino_data.ctime = trans.time;
2181 modflags |= HAMMER_INODE_DDIRTY;
2182 kflags |= NOTE_ATTRIB;
2183 }
2184 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2185 error = 0;
2186 goto done;
2187 }
2188 }
2189 goto done;
2190 }
2191 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2192 error = EPERM;
2193 goto done;
2194 }
2195 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2196 mode_t cur_mode = ip->ino_data.mode;
2197 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2198 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2199 uuid_t uuid_uid;
2200 uuid_t uuid_gid;
2201
2202 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2203 ap->a_cred,
2204 &cur_uid, &cur_gid, &cur_mode);
2205 if (error == 0) {
2206 hammer_guid_to_uuid(&uuid_uid, cur_uid);
2207 hammer_guid_to_uuid(&uuid_gid, cur_gid);
2208 if (bcmp(&uuid_uid, &ip->ino_data.uid,
2209 sizeof(uuid_uid)) ||
2210 bcmp(&uuid_gid, &ip->ino_data.gid,
2211 sizeof(uuid_gid)) ||
2212 ip->ino_data.mode != cur_mode
2213 ) {
2214 ip->ino_data.uid = uuid_uid;
2215 ip->ino_data.gid = uuid_gid;
2216 ip->ino_data.mode = cur_mode;
2217 ip->ino_data.ctime = trans.time;
2218 modflags |= HAMMER_INODE_DDIRTY;
2219 }
2220 kflags |= NOTE_ATTRIB;
2221 }
2222 }
2223 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2224 switch(ap->a_vp->v_type) {
2225 case VREG:
2226 if (vap->va_size == ip->ino_data.size)
2227 break;
2228
2229 /*
2230 * Log the operation if in fast-fsync mode or if
2231 * there are unterminated redo write records present.
2232 *
2233			 * The second check is needed so the recovery code
2234			 * properly truncates write redos even if nominal
2235			 * REDO operation has been turned off due to excessive
2236			 * writes, because the related records might be
2237			 * destroyed and never lay down a TERM_WRITE.
2238 */
2239 if ((ip->flags & HAMMER_INODE_REDO) ||
2240 (ip->flags & HAMMER_INODE_RDIRTY)) {
2241 error = hammer_generate_redo(&trans, ip,
2242 vap->va_size,
2243 HAMMER_REDO_TRUNC,
2244 NULL, 0);
2245 }
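			/*
			 * Editorial note: the REDO_TRUNC entry records the
			 * new EOF (vap->va_size) so crash recovery can
			 * re-apply the truncation even if the truncated
			 * data records never lay down a TERM_WRITE.
			 */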
2246 blksize = hammer_blocksize(vap->va_size);
2247
2248 /*
2249			 * XXX break atomicity, we can deadlock the backend
2250 * if we do not release the lock. Probably not a
2251 * big deal here.
2252 */
2253 if (vap->va_size < ip->ino_data.size) {
2254 nvtruncbuf(ap->a_vp, vap->va_size,
2255 blksize,
2256 hammer_blockoff(vap->va_size));
2257 truncating = 1;
2258 kflags |= NOTE_WRITE;
2259 } else {
2260 nvextendbuf(ap->a_vp,
2261 ip->ino_data.size,
2262 vap->va_size,
2263 hammer_blocksize(ip->ino_data.size),
2264 hammer_blocksize(vap->va_size),
2265 hammer_blockoff(ip->ino_data.size),
2266 hammer_blockoff(vap->va_size),
2267 0);
2268 truncating = 0;
2269 kflags |= NOTE_WRITE | NOTE_EXTEND;
2270 }
2271 ip->ino_data.size = vap->va_size;
2272 ip->ino_data.mtime = trans.time;
2273 /* XXX safe to use SDIRTY instead of DDIRTY here? */
2274 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2275
2276 /*
2277 * On-media truncation is cached in the inode until
2278 * the inode is synchronized. We must immediately
2279 * handle any frontend records.
2280 */
2281 if (truncating) {
2282 hammer_ip_frontend_trunc(ip, vap->va_size);
2283#ifdef DEBUG_TRUNCATE
2284 if (HammerTruncIp == NULL)
2285 HammerTruncIp = ip;
2286#endif
2287 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2288 ip->flags |= HAMMER_INODE_TRUNCATED;
2289 ip->trunc_off = vap->va_size;
2290#ifdef DEBUG_TRUNCATE
2291 if (ip == HammerTruncIp)
2292 kprintf("truncate1 %016llx\n",
2293 (long long)ip->trunc_off);
2294#endif
2295 } else if (ip->trunc_off > vap->va_size) {
2296 ip->trunc_off = vap->va_size;
2297#ifdef DEBUG_TRUNCATE
2298 if (ip == HammerTruncIp)
2299 kprintf("truncate2 %016llx\n",
2300 (long long)ip->trunc_off);
2301#endif
2302 } else {
2303#ifdef DEBUG_TRUNCATE
2304 if (ip == HammerTruncIp)
2305 kprintf("truncate3 %016llx (ignored)\n",
2306 (long long)vap->va_size);
2307#endif
2308 }
2309 }
2310
2311#if 0
2312 /*
2313 * When truncating, nvtruncbuf() may have cleaned out
2314 * a portion of the last block on-disk in the buffer
2315 * cache. We must clean out any frontend records
2316 * for blocks beyond the new last block.
2317 */
2318 aligned_size = (vap->va_size + (blksize - 1)) &
2319 ~(int64_t)(blksize - 1);
2320 if (truncating && vap->va_size < aligned_size) {
2321 aligned_size -= blksize;
2322 hammer_ip_frontend_trunc(ip, aligned_size);
2323 }
2324#endif
2325 break;
2326 case VDATABASE:
2327 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2328 ip->flags |= HAMMER_INODE_TRUNCATED;
2329 ip->trunc_off = vap->va_size;
2330 } else if (ip->trunc_off > vap->va_size) {
2331 ip->trunc_off = vap->va_size;
2332 }
2333 hammer_ip_frontend_trunc(ip, vap->va_size);
2334 ip->ino_data.size = vap->va_size;
2335 ip->ino_data.mtime = trans.time;
2336 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2337 kflags |= NOTE_ATTRIB;
2338 break;
2339 default:
2340 error = EINVAL;
2341 goto done;
2342 }
2343 break;
2344 }
2345 if (vap->va_atime.tv_sec != VNOVAL) {
2346 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2347 modflags |= HAMMER_INODE_ATIME;
2348 kflags |= NOTE_ATTRIB;
2349 }
2350 if (vap->va_mtime.tv_sec != VNOVAL) {
2351 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2352 modflags |= HAMMER_INODE_MTIME;
2353 kflags |= NOTE_ATTRIB;
2354 }
2355 if (vap->va_mode != (mode_t)VNOVAL) {
2356 mode_t cur_mode = ip->ino_data.mode;
2357 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2358 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2359
2360 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2361 cur_uid, cur_gid, &cur_mode);
2362 if (error == 0 && ip->ino_data.mode != cur_mode) {
2363 ip->ino_data.mode = cur_mode;
2364 ip->ino_data.ctime = trans.time;
2365 modflags |= HAMMER_INODE_DDIRTY;
2366 kflags |= NOTE_ATTRIB;
2367 }
2368 }
2369done:
2370 if (error == 0)
2371 hammer_modify_inode(&trans, ip, modflags);
2372 hammer_done_transaction(&trans);
2373 hammer_knote(ap->a_vp, kflags);
2374 lwkt_reltoken(&hmp->fs_token);
2375 return (error);
2376}
2377
2378/*
2379 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2380 */
2381static
2382int
2383hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2384{
2385 struct hammer_transaction trans;
2386 struct hammer_inode *dip;
2387 struct hammer_inode *nip;
2388 hammer_record_t record;
2389 struct nchandle *nch;
2390 hammer_mount_t hmp;
2391 int error;
2392 int bytes;
2393
2394 ap->a_vap->va_type = VLNK;
2395
2396 nch = ap->a_nch;
2397 dip = VTOI(ap->a_dvp);
2398 hmp = dip->hmp;
2399
2400 if (dip->flags & HAMMER_INODE_RO)
2401 return (EROFS);
2402 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2403 return (error);
2404
2405 /*
2406 * Create a transaction to cover the operations we perform.
2407 */
2408 lwkt_gettoken(&hmp->fs_token);
2409 hammer_start_transaction(&trans, hmp);
2410 ++hammer_stats_file_iopsw;
2411
2412 /*
2413 * Create a new filesystem object of the requested type. The
2414 * returned inode will be referenced but not locked.
2415 */
2416
2417 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2418 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2419 NULL, &nip);
2420 if (error) {
2421 hammer_done_transaction(&trans);
2422 *ap->a_vpp = NULL;
2423 lwkt_reltoken(&hmp->fs_token);
2424 return (error);
2425 }
2426
2427 /*
2428 * Add a record representing the symlink. symlink stores the link
2429	 * as pure data, not a string, and is not \0-terminated.
2430 */
2431 if (error == 0) {
2432 bytes = strlen(ap->a_target);
2433
2434 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2435 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2436 } else {
2437 record = hammer_alloc_mem_record(nip, bytes);
2438 record->type = HAMMER_MEM_RECORD_GENERAL;
2439
2440 record->leaf.base.localization = nip->obj_localization +
2441 HAMMER_LOCALIZE_MISC;
2442 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2443 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2444 record->leaf.data_len = bytes;
2445 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2446 bcopy(ap->a_target, record->data->symlink.name, bytes);
2447 error = hammer_ip_add_record(&trans, record);
2448 }
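		/*
		 * Editorial summary of the two cases above: targets of up
		 * to HAMMER_INODE_BASESYMLEN bytes are stored inline in
		 * ino_data.ext.symlink, while longer targets get their own
		 * HAMMER_RECTYPE_FIX record keyed HAMMER_FIXKEY_SYMLINK,
		 * which hammer_vop_readlink() later looks up.
		 */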
2449
2450 /*
2451 * Set the file size to the length of the link.
2452 */
2453 if (error == 0) {
2454 nip->ino_data.size = bytes;
2455 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2456 }
2457 }
2458 if (error == 0)
2459 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2460 nch->ncp->nc_nlen, nip);
2461
2462 /*
2463 * Finish up.
2464 */
2465 if (error) {
2466 hammer_rel_inode(nip, 0);
2467 *ap->a_vpp = NULL;
2468 } else {
2469 error = hammer_get_vnode(nip, ap->a_vpp);
2470 hammer_rel_inode(nip, 0);
2471 if (error == 0) {
2472 cache_setunresolved(ap->a_nch);
2473 cache_setvp(ap->a_nch, *ap->a_vpp);
2474 hammer_knote(ap->a_dvp, NOTE_WRITE);
2475 }
2476 }
2477 hammer_done_transaction(&trans);
2478 lwkt_reltoken(&hmp->fs_token);
2479 return (error);
2480}
2481
2482/*
2483 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2484 */
2485static
2486int
2487hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2488{
2489 struct hammer_transaction trans;
2490 struct hammer_inode *dip;
2491 hammer_mount_t hmp;
2492 int error;
2493
2494 dip = VTOI(ap->a_dvp);
2495 hmp = dip->hmp;
2496
2497 if (hammer_nohistory(dip) == 0 &&
2498 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2499 return (error);
2500 }
2501
2502 lwkt_gettoken(&hmp->fs_token);
2503 hammer_start_transaction(&trans, hmp);
2504 ++hammer_stats_file_iopsw;
2505 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2506 ap->a_cred, ap->a_flags, -1);
2507 hammer_done_transaction(&trans);
2508 lwkt_reltoken(&hmp->fs_token);
2509
2510 return (error);
2511}
2512
2513/*
2514 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2515 */
2516static
2517int
2518hammer_vop_ioctl(struct vop_ioctl_args *ap)
2519{
2520 struct hammer_inode *ip = ap->a_vp->v_data;
2521 hammer_mount_t hmp = ip->hmp;
2522 int error;
2523
2524 ++hammer_stats_file_iopsr;
2525 lwkt_gettoken(&hmp->fs_token);
2526 error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2527 ap->a_fflag, ap->a_cred);
2528 lwkt_reltoken(&hmp->fs_token);
2529 return (error);
2530}
2531
2532static
2533int
2534hammer_vop_mountctl(struct vop_mountctl_args *ap)
2535{
2536 static const struct mountctl_opt extraopt[] = {
2537 { HMNT_NOHISTORY, "nohistory" },
2538 { HMNT_MASTERID, "master" },
2539		{ 0, NULL }
2541	};
2542 struct hammer_mount *hmp;
2543 struct mount *mp;
2544 int usedbytes;
2545 int error;
2546
2547 error = 0;
2548 usedbytes = 0;
2549 mp = ap->a_head.a_ops->head.vv_mount;
2550 KKASSERT(mp->mnt_data != NULL);
2551 hmp = (struct hammer_mount *)mp->mnt_data;
2552
2553 lwkt_gettoken(&hmp->fs_token);
2554
2555 switch(ap->a_op) {
2556 case MOUNTCTL_SET_EXPORT:
2557 if (ap->a_ctllen != sizeof(struct export_args))
2558 error = EINVAL;
2559 else
2560 error = hammer_vfs_export(mp, ap->a_op,
2561 (const struct export_args *)ap->a_ctl);
2562 break;
2563 case MOUNTCTL_MOUNTFLAGS:
2564 {
2565 /*
2566 * Call standard mountctl VOP function
2567 * so we get user mount flags.
2568 */
2569 error = vop_stdmountctl(ap);
2570 if (error)
2571 break;
2572
2573 usedbytes = *ap->a_res;
2574
2575 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2576 usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2577 ap->a_buf,
2578 ap->a_buflen - usedbytes,
2579 &error);
2580 }
2581
2582 *ap->a_res += usedbytes;
2583 break;
2584 }
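	/*
	 * Example (editorial): with HMNT_NOHISTORY set, the extraopt
	 * table above causes "nohistory" to be appended to the standard
	 * flag string produced by vop_stdmountctl().
	 */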
2585 default:
2586 error = vop_stdmountctl(ap);
2587 break;
2588 }
2589 lwkt_reltoken(&hmp->fs_token);
2590 return(error);
2591}
2592
2593/*
2594 * hammer_vop_strategy { vp, bio }
2595 *
2596 * Strategy call, used for regular file read & write only. Note that the
2597 * bp may represent a cluster.
2598 *
2599 * To simplify operation and allow better optimizations in the future,
2600 * this code does not make any assumptions with regard to buffer alignment
2601 * or size.
2602 */
2603static
2604int
2605hammer_vop_strategy(struct vop_strategy_args *ap)
2606{
2607 struct buf *bp;
2608 int error;
2609
2610 bp = ap->a_bio->bio_buf;
2611
2612 switch(bp->b_cmd) {
2613 case BUF_CMD_READ:
2614 error = hammer_vop_strategy_read(ap);
2615 break;
2616 case BUF_CMD_WRITE:
2617 error = hammer_vop_strategy_write(ap);
2618 break;
2619 default:
2620 bp->b_error = error = EINVAL;
2621 bp->b_flags |= B_ERROR;
2622 biodone(ap->a_bio);
2623 break;
2624 }
2625
2626 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2627
2628 return (error);
2629}
2630
2631/*
2632 * Read from a regular file. Iterate the related records and fill in the
2633 * BIO/BUF. Gaps are zero-filled.
2634 *
2635 * The support code in hammer_object.c should be used to deal with mixed
2636 * in-memory and on-disk records.
2637 *
2638 * NOTE: Can be called from the cluster code with an oversized buf.
2639 *
2640 * XXX atime update
2641 */
2642static
2643int
2644hammer_vop_strategy_read(struct vop_strategy_args *ap)
2645{
2646 struct hammer_transaction trans;
2647 struct hammer_inode *ip;
2648 struct hammer_inode *dip;
2649 hammer_mount_t hmp;
2650 struct hammer_cursor cursor;
2651 hammer_base_elm_t base;
2652 hammer_off_t disk_offset;
2653 struct bio *bio;
2654 struct bio *nbio;
2655 struct buf *bp;
2656 int64_t rec_offset;
2657 int64_t ran_end;
2658 int64_t tmp64;
2659 int error;
2660 int boff;
2661 int roff;
2662 int n;
2663
2664 bio = ap->a_bio;
2665 bp = bio->bio_buf;
2666 ip = ap->a_vp->v_data;
2667 hmp = ip->hmp;
2668
2669 /*
2670 * The zone-2 disk offset may have been set by the cluster code via
2671 * a BMAP operation, or else should be NOOFFSET.
2672 *
2673 * Checking the high bits for a match against zone-2 should suffice.
2674 */
2675 nbio = push_bio(bio);
2676 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2677 HAMMER_ZONE_LARGE_DATA) {
2678 lwkt_gettoken(&hmp->fs_token);
2679 error = hammer_io_direct_read(hmp, nbio, NULL);
2680 lwkt_reltoken(&hmp->fs_token);
2681 return (error);
2682 }
2683
2684 /*
2685 * Well, that sucked. Do it the hard way. If all the stars are
2686 * aligned we may still be able to issue a direct-read.
2687 */
2688 lwkt_gettoken(&hmp->fs_token);
2689 hammer_simple_transaction(&trans, hmp);
2690 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2691
2692 /*
2693	 * Key range (begin and end inclusive) to scan.  Note that the keys
2694 * stored in the actual records represent BASE+LEN, not BASE. The
2695 * first record containing bio_offset will have a key > bio_offset.
2696 */
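	/*
	 * Hypothetical example of the BASE+LEN keying: a data record
	 * covering file offsets [0x4000, 0x8000) is keyed at 0x8000, so
	 * searching from bio_offset + 1 (0x4001 for a bio at 0x4000)
	 * positions the cursor on the first record overlapping the bio.
	 */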
2697 cursor.key_beg.localization = ip->obj_localization +
2698 HAMMER_LOCALIZE_MISC;
2699 cursor.key_beg.obj_id = ip->obj_id;
2700 cursor.key_beg.create_tid = 0;
2701 cursor.key_beg.delete_tid = 0;
2702 cursor.key_beg.obj_type = 0;
2703 cursor.key_beg.key = bio->bio_offset + 1;
2704 cursor.asof = ip->obj_asof;
2705 cursor.flags |= HAMMER_CURSOR_ASOF;
2706
2707 cursor.key_end = cursor.key_beg;
2708 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2709#if 0
2710 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2711 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2712 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2713 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2714 } else
2715#endif
2716 {
2717 ran_end = bio->bio_offset + bp->b_bufsize;
2718 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2719 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2720 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2721 if (tmp64 < ran_end)
2722 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2723 else
2724 cursor.key_end.key = ran_end + MAXPHYS + 1;
2725 }
2726 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2727
2728 error = hammer_ip_first(&cursor);
2729 boff = 0;
2730
2731 while (error == 0) {
2732 /*
2733 * Get the base file offset of the record. The key for
2734		 * data records is (base + bytes) rather than (base).
2735 */
2736 base = &cursor.leaf->base;
2737 rec_offset = base->key - cursor.leaf->data_len;
2738
2739 /*
2740 * Calculate the gap, if any, and zero-fill it.
2741 *
2742		 * n is the offset of the start of the record versus our
2743 * current seek offset in the bio.
2744 */
2745 n = (int)(rec_offset - (bio->bio_offset + boff));
2746 if (n > 0) {
2747 if (n > bp->b_bufsize - boff)
2748 n = bp->b_bufsize - boff;
2749 bzero((char *)bp->b_data + boff, n);
2750 boff += n;
2751 n = 0;
2752 }
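		/*
		 * Worked example (hypothetical values): with bio_offset
		 * 0x0000, boff 0 and the first record starting at
		 * rec_offset 0x2000, n is 0x2000 and that many bytes of
		 * the buffer are zero-filled before any record data is
		 * copied.
		 */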
2753
2754 /*
2755 * Calculate the data offset in the record and the number
2756 * of bytes we can copy.
2757 *
2758 * There are two degenerate cases. First, boff may already
2759		 * be at bp->b_bufsize.  Second, the data offset within
2760 * the record may exceed the record's size.
2761 */
2762 roff = -n;
2763 rec_offset += roff;
2764 n = cursor.leaf->data_len - roff;
2765 if (n <= 0) {
2766 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2767 n = 0;
2768 } else if (n > bp->b_bufsize - boff) {
2769 n = bp->b_bufsize - boff;
2770 }
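		/*
		 * Worked example (hypothetical values): a record covering
		 * [0x0000, 0x4000) visited with bio_offset 0x1000 and
		 * boff 0 yields n = -0x1000 above, hence roff = 0x1000
		 * and a copy window of data_len - roff = 0x3000 bytes,
		 * clipped to the remaining buffer space.
		 */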
2771
2772 /*
2773 * Deal with cached truncations. This cool bit of code
2774 * allows truncate()/ftruncate() to avoid having to sync
2775 * the file.
2776 *
2777 * If the frontend is truncated then all backend records are
2778 * subject to the frontend's truncation.
2779 *
2780 * If the backend is truncated then backend records on-disk
2781 * (but not in-memory) are subject to the backend's
2782 * truncation. In-memory records owned by the backend
2783 * represent data written after the truncation point on the
2784 * backend and must not be truncated.
2785 *
2786 * Truncate operations deal with frontend buffer cache
2787 * buffers and frontend-owned in-memory records synchronously.
2788 */
2789 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2790 if (hammer_cursor_ondisk(&cursor)/* ||
2791 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2792 if (ip->trunc_off <= rec_offset)
2793 n = 0;
2794 else if (ip->trunc_off < rec_offset + n)
2795 n = (int)(ip->trunc_off - rec_offset);
2796 }
2797 }
2798 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2799 if (hammer_cursor_ondisk(&cursor)) {
2800 if (ip->sync_trunc_off <= rec_offset)
2801 n = 0;
2802 else if (ip->sync_trunc_off < rec_offset + n)
2803 n = (int)(ip->sync_trunc_off - rec_offset);
2804 }
2805 }
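		/*
		 * Example (hypothetical values): with a cached frontend
		 * truncation at trunc_off 0x1000 and an on-disk record at
		 * rec_offset 0x0000 with n 0x4000, n is clipped to 0x1000
		 * so stale post-truncation data is never copied out.
		 */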
2806
2807 /*
2808 * Try to issue a direct read into our bio if possible,
2809 * otherwise resolve the element data into a hammer_buffer
2810 * and copy.
2811 *
2812		 * The buffer on-disk should be zeroed past any real
2813 * truncation point, but may not be for any synthesized
2814 * truncation point from above.
2815 */
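		/*
		 * Editorial summary: the direct read below requires all of
		 * the bio starting exactly at this record (boff == 0), the
		 * record covering the entire buffer (n == bp->b_bufsize),
		 * the record being on-media, and a buffer-aligned
		 * disk_offset.  Otherwise the data is resolved into a
		 * hammer_buffer and copied.
		 */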
2816 disk_offset = cursor.leaf->data_offset + roff;
2817 if (boff == 0 && n == bp->b_bufsize &&
2818 hammer_cursor_ondisk(&cursor) &&
2819 (disk_offset & HAMMER_BUFMASK) == 0) {
2820 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2821 HAMMER_ZONE_LARGE_DATA);
2822 nbio->bio_offset = disk_offset;
2823 error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2824 if (hammer_live_dedup)
2825 hammer_dedup_cache_add(ip, cursor.leaf);
2826 goto done;
2827 } else if (n) {
2828 error = hammer_ip_resolve_data(&cursor);
2829 if (error == 0) {
2830 bcopy((char *)cursor.data + roff,
2831 (char *)bp->b_data + boff, n);
2832 }
2833 }
2834 if (error)
2835 break;
2836
2837 /*
2838 * We have to be sure that the only elements added to the
2839 * dedup cache are those which are already on-media.
2840 */
2841 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2842 hammer_dedup_cache_add(ip, cursor.leaf);
2843
2844 /*
2845 * Iterate until we have filled the request.
2846 */
2847 boff += n;
2848 if (boff == bp->b_bufsize)
2849 break;
2850 error = hammer_ip_next(&cursor);
2851 }
2852
2853 /*
2854 * There may have been a gap after the last record
2855 */
2856 if (error == ENOENT)
2857 error = 0;
2858 if (error == 0 && boff != bp->b_bufsize) {
2859 KKASSERT(boff < bp->b_bufsize);
2860 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2861 /* boff = bp->b_bufsize; */
2862 }
2863 bp->b_resid = 0;
2864 bp->b_error = error;
2865 if (error)
2866 bp->b_flags |= B_ERROR;
2867 biodone(ap->a_bio);
2868
2869done:
2870 /*
2871 * Cache the b-tree node for the last data read in cache[1].
2872 *
2873 * If we hit the file EOF then also cache the node in the
2874	 * governing directory's cache[3]; it will be used to initialize
2875 * the inode's cache[1] for any inodes looked up via the directory.
2876 *
2877 * This doesn't reduce disk accesses since the B-Tree chain is
2878 * likely cached, but it does reduce cpu overhead when looking
2879 * up file offsets for cpdup/tar/cpio style iterations.
2880 */
2881 if (cursor.node)
2882 hammer_cache_node(&ip->cache[1], cursor.node);
2883 if (ran_end >= ip->ino_data.size) {
2884 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2885 ip->obj_asof, ip->obj_localization);
2886 if (dip) {
2887 hammer_cache_node(&dip->cache[3], cursor.node);
2888 hammer_rel_inode(dip, 0);
2889 }
2890 }
2891 hammer_done_cursor(&cursor);
2892 hammer_done_transaction(&trans);
2893 lwkt_reltoken(&hmp->fs_token);
2894 return(error);
2895}
2896
2897/*
2898 * BMAP operation - used to support cluster_read() only.
2899 *
2900 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2901 *
2902 * This routine may return EOPNOTSUPP if the operation is not supported for
2903 * the specified offset.  The contents of the pointer arguments do not
2904 * need to be initialized in that case.
2905 *
2906 * If a disk address is available and properly aligned return 0 with
2907 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2908 * to the run-length relative to that offset.  Callers may assume that
2909 * *doffsetp is valid if 0 is returned, so if the run is not sufficiently
2910 * large we return EOPNOTSUPP rather than a short run.
2911 */
2912static
2913int
2914hammer_vop_bmap(struct vop_bmap_args *ap)
2915{
2916 struct hammer_transaction trans;
2917 struct hammer_inode *ip;
2918 hammer_mount_t hmp;
2919 struct hammer_cursor cursor;
2920 hammer_base_elm_t base;
2921 int64_t rec_offset;
2922 int64_t ran_end;
2923 int64_t tmp64;
2924 int64_t base_offset;
2925 int64_t base_disk_offset;
2926 int64_t last_offset;
2927 hammer_off_t last_disk_offset;
2928 hammer_off_t disk_offset;
2929 int rec_len;
2930 int error;
2931 int blksize;
2932
2933 ++hammer_stats_file_iopsr;
2934 ip = ap->a_vp->v_data;
2935 hmp = ip->hmp;
2936
2937 /*
2938 * We can only BMAP regular files. We can't BMAP database files,
2939 * directories, etc.
2940 */
2941 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2942 return(EOPNOTSUPP);
2943
2944 /*
2945 * bmap is typically called with runp/runb both NULL when used
2946 * for writing. We do not support BMAP for writing atm.
2947 */
2948 if (ap->a_cmd != BUF_CMD_READ)
2949 return(EOPNOTSUPP);
2950
2951 /*
2952 * Scan the B-Tree to acquire blockmap addresses, then translate
2953 * to raw addresses.
2954 */
2955 lwkt_gettoken(&hmp->fs_token);
2956 hammer_simple_transaction(&trans, hmp);
2957#if 0
2958 kprintf("bmap_beg %016llx ip->cache %p\n",
2959 (long long)ap->a_loffset, ip->cache[1]);
2960#endif
2961 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2962
2963 /*
2964	 * Key range (begin and end inclusive) to scan.  Note that the keys
2965 * stored in the actual records represent BASE+LEN, not BASE. The
2966 * first record containing bio_offset will have a key > bio_offset.
2967 */
2968 cursor.key_beg.localization = ip->obj_localization +
2969 HAMMER_LOCALIZE_MISC;
2970 cursor.key_beg.obj_id = ip->obj_id;
2971 cursor.key_beg.create_tid = 0;
2972 cursor.key_beg.delete_tid = 0;
2973 cursor.key_beg.obj_type = 0;
2974 if (ap->a_runb)
2975 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2976 else
2977 cursor.key_beg.key = ap->a_loffset + 1;
2978 if (cursor.key_beg.key < 0)
2979 cursor.key_beg.key = 0;
2980 cursor.asof = ip->obj_asof;
2981 cursor.flags |= HAMMER_CURSOR_ASOF;
2982
2983 cursor.key_end = cursor.key_beg;
2984 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2985
2986 ran_end = ap->a_loffset + MAXPHYS;
2987 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2988 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2989 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2990 if (tmp64 < ran_end)
2991 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2992 else
2993 cursor.key_end.key = ran_end + MAXPHYS + 1;
2994
2995 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2996
2997 error = hammer_ip_first(&cursor);
2998 base_offset = last_offset = 0;
2999 base_disk_offset = last_disk_offset = 0;
3000
3001 while (error == 0) {
3002 /*
3003 * Get the base file offset of the record. The key for
3004		 * data records is (base + bytes) rather than (base).
3005 *
3006 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3007 * The extra bytes should be zero on-disk and the BMAP op
3008 * should still be ok.
3009 */
3010 base = &cursor.leaf->base;
3011 rec_offset = base->key - cursor.leaf->data_len;
3012 rec_len = cursor.leaf->data_len;
3013
3014 /*
3015 * Incorporate any cached truncation.
3016 *
3017 * NOTE: Modifications to rec_len based on synthesized
3018 * truncation points remove the guarantee that any extended
3019 * data on disk is zero (since the truncations may not have
3020 * taken place on-media yet).
3021 */
3022 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3023 if (hammer_cursor_ondisk(&cursor) ||
3024 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3025 if (ip->trunc_off <= rec_offset)
3026 rec_len = 0;
3027 else if (ip->trunc_off < rec_offset + rec_len)
3028 rec_len = (int)(ip->trunc_off - rec_offset);
3029 }
3030 }
3031 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3032 if (hammer_cursor_ondisk(&cursor)) {
3033 if (ip->sync_trunc_off <= rec_offset)
3034 rec_len = 0;
3035 else if (ip->sync_trunc_off < rec_offset + rec_len)
3036 rec_len = (int)(ip->sync_trunc_off - rec_offset);
3037 }
3038 }
3039
3040 /*
3041 * Accumulate information. If we have hit a discontiguous
3042 * block reset base_offset unless we are already beyond the
3043 * requested offset. If we are, that's it, we stop.
3044 */
3045 if (error)
3046 break;
3047 if (hammer_cursor_ondisk(&cursor)) {
3048 disk_offset = cursor.leaf->data_offset;
3049 if (rec_offset != last_offset ||
3050 disk_offset != last_disk_offset) {
3051 if (rec_offset > ap->a_loffset)
3052 break;
3053 base_offset = rec_offset;
3054 base_disk_offset = disk_offset;
3055 }
3056 last_offset = rec_offset + rec_len;
3057 last_disk_offset = disk_offset + rec_len;
3058
3059 if (hammer_live_dedup)
3060 hammer_dedup_cache_add(ip, cursor.leaf);
3061 }
3062
3063 error = hammer_ip_next(&cursor);
3064 }
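	/*
	 * Hypothetical example of run accumulation: two 16K records at
	 * file offsets 0x0000 and 0x4000 stored back-to-back on media
	 * extend a single run (last_offset and last_disk_offset simply
	 * advance), while a discontiguous disk_offset restarts the run
	 * at base_offset/base_disk_offset.
	 */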
3065
3066#if 0
3067 kprintf("BMAP %016llx: %016llx - %016llx\n",
3068 (long long)ap->a_loffset,
3069 (long long)base_offset,
3070 (long long)last_offset);
3071 kprintf("BMAP %16s: %016llx - %016llx\n", "",
3072 (long long)base_disk_offset,
3073 (long long)last_disk_offset);
3074#endif
3075
3076 if (cursor.node) {
3077 hammer_cache_node(&ip->cache[1], cursor.node);
3078#if 0
3079 kprintf("bmap_end2 %016llx ip->cache %p\n",
3080 (long long)ap->a_loffset, ip->cache[1]);
3081#endif
3082 }
3083 hammer_done_cursor(&cursor);
3084 hammer_done_transaction(&trans);
3085 lwkt_reltoken(&hmp->fs_token);
3086
3087 /*
3088 * If we couldn't find any records or the records we did find were
3089 * all behind the requested offset, return failure. A forward
3090 * truncation can leave a hole w/ no on-disk records.
3091 */
3092 if (last_offset == 0 || last_offset < ap->a_loffset)
3093 return (EOPNOTSUPP);
3094
3095 /*
3096 * Figure out the block size at the requested offset and adjust
3097 * our limits so the cluster_read() does not create inappropriately
3098 * sized buffer cache buffers.
3099 */
3100 blksize = hammer_blocksize(ap->a_loffset);
3101 if (hammer_blocksize(base_offset) != blksize) {
3102 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3103 }
3104 if (last_offset != ap->a_loffset &&
3105 hammer_blocksize(last_offset - 1) != blksize) {
3106 last_offset = hammer_blockdemarc(ap->a_loffset,
3107 last_offset - 1);
3108 }
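	/*
	 * Editorial note: HAMMER can use different block sizes within a
	 * single file (16K below a demarcation point and larger above it
	 * in the usual configuration; an assumption here, see
	 * hammer_blocksize()), so the run is trimmed to whichever block
	 * size governs the requested offset.
	 */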
3109
3110 /*
3111 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3112	 * from occurring.
3113 */
3114 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3115
3116 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3117 /*
3118 * Only large-data zones can be direct-IOd
3119 */
3120 error = EOPNOTSUPP;
3121 } else if ((disk_offset & HAMMER_BUFMASK) ||
3122 (last_offset - ap->a_loffset) < blksize) {
3123 /*
3124 * doffsetp is not aligned or the forward run size does
3125 * not cover a whole buffer, disallow the direct I/O.
3126 */
3127 error = EOPNOTSUPP;
3128 } else {
3129 /*
3130 * We're good.
3131 */
3132 *ap->a_doffsetp = disk_offset;
3133 if (ap->a_runb) {
3134 *ap->a_runb = ap->a_loffset - base_offset;
3135 KKASSERT(*ap->a_runb >= 0);
3136 }
3137 if (ap->a_runp) {
3138 *ap->a_runp = last_offset - ap->a_loffset;
3139 KKASSERT(*ap->a_runp >= 0);
3140 }
3141 error = 0;
3142 }
3143 return(error);
3144}
3145
3146/*
3147 * Write to a regular file. Because this is a strategy call the OS is
3148 * trying to actually get data onto the media.
3149 */
3150static
3151int
3152hammer_vop_strategy_write(struct vop_strategy_args *ap)
3153{
3154 hammer_record_t record;
3155 hammer_mount_t hmp;
3156 hammer_inode_t ip;
3157 struct bio *bio;
3158 struct buf *bp;
3159 int blksize;
3160 int bytes;
3161 int error;
3162
3163 bio = ap->a_bio;
3164 bp = bio->bio_buf;
3165 ip = ap->a_vp->v_data;
3166 hmp = ip->hmp;
3167
3168 blksize = hammer_blocksize(bio->bio_offset);
3169 KKASSERT(bp->b_bufsize == blksize);
3170
3171 if (ip->flags & HAMMER_INODE_RO) {
3172 bp->b_error = EROFS;
3173 bp->b_flags |= B_ERROR;
3174 biodone(ap->a_bio);
3175 return(EROFS);
3176 }
3177
3178 lwkt_gettoken(&hmp->fs_token);
3179
3180 /*
3181 * Interlock with inode destruction (no in-kernel or directory
3182 * topology visibility). If we queue new IO while trying to
3183 * destroy the inode we can deadlock the vtrunc call in
3184 * hammer_inode_unloadable_check().
3185 *
3186 * Besides, there's no point flushing a bp associated with an
3187 * inode that is being destroyed on-media and has no kernel
3188 * references.
3189 */
3190 if ((ip->flags | ip->sync_flags) &
3191 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3192 bp->b_resid = 0;
3193 biodone(ap->a_bio);
3194 lwkt_reltoken(&hmp->fs_token);
3195 return(0);
3196 }
3197
3198 /*
3199 * Reserve space and issue a direct-write from the front-end.
3200 * NOTE: The direct_io code will hammer_bread/bcopy smaller
3201 * allocations.
3202 *
3203 * An in-memory record will be installed to reference the storage
3204 * until the flusher can get to it.
3205 *
3206 * Since we own the high level bio the front-end will not try to
3207 * do a direct-read until the write completes.
3208 *
3209	 * NOTE: The only time we do not reserve a full-sized buffer's
3210 * worth of data is if the file is small. We do not try to
3211 * allocate a fragment (from the small-data zone) at the end of
3212 * an otherwise large file as this can lead to wildly separated
3213 * data.
3214 */
3215 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3216 KKASSERT(bio->bio_offset < ip->ino_data.size);
3217 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3218 bytes = bp->b_bufsize;
3219 else
3220 bytes = ((int)ip->ino_data.size + 15) & ~15;
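	/*
	 * Worked example: a 1000-byte file with a bio at offset 0
	 * reserves (1000 + 15) & ~15 = 1008 bytes from the small-data
	 * zone rather than a full buffer.
	 */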
3221
3222 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3223 bytes, &error);
3224
3225 /*
3226 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3227 * in hammer_vop_write(). We must flag the record so the proper
3228 * REDO_TERM_WRITE entry is generated during the flush.
3229 */
3230 if (record) {
3231 if (bp->b_flags & B_VFSFLAG1) {
3232 record->flags |= HAMMER_RECF_REDO;
3233 bp->b_flags &= ~B_VFSFLAG1;
3234 }
3235 if (record->flags & HAMMER_RECF_DEDUPED) {
3236 bp->b_resid = 0;
3237 hammer_ip_replace_bulk(hmp, record);
3238 biodone(ap->a_bio);
3239 } else {
3240 hammer_io_direct_write(hmp, bio, record);
3241 }
3242 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3243 hammer_flush_inode(ip, 0);
3244 } else {
3245 bp->b_bio2.bio_offset = NOOFFSET;
3246 bp->b_error = error;
3247 bp->b_flags |= B_ERROR;
3248 biodone(ap->a_bio);
3249 }
3250 lwkt_reltoken(&hmp->fs_token);
3251 return(error);
3252}
3253
3254/*
3255 * dounlink - disconnect a directory entry
3256 *
3257 * XXX whiteout support not really in yet
3258 */
3259static int
3260hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3261 struct vnode *dvp, struct ucred *cred,
3262 int flags, int isdir)
3263{
3264 struct namecache *ncp;
3265 hammer_inode_t dip;
3266 hammer_inode_t ip;
3267 hammer_mount_t hmp;
3268 struct hammer_cursor cursor;
3269 int64_t namekey;
3270 u_int32_t max_iterations;
3271 int nlen, error;
3272
3273 /*
3274 * Calculate the namekey and setup the key range for the scan. This
3275 * works kinda like a chained hash table where the lower 32 bits
3276 * of the namekey synthesize the chain.
3277 *
3278 * The key range is inclusive of both key_beg and key_end.
3279 */
3280 dip = VTOI(dvp);
3281 ncp = nch->ncp;
3282 hmp = dip->hmp;
3283
3284 if (dip->flags & HAMMER_INODE_RO)
3285 return (EROFS);
3286
3287 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3288 &max_iterations);
3289retry:
3290 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3291 cursor.key_beg.localization = dip->obj_localization +
3292 hammer_dir_localization(dip);
3293 cursor.key_beg.obj_id = dip->obj_id;
3294 cursor.key_beg.key = namekey;
3295 cursor.key_beg.create_tid = 0;
3296 cursor.key_beg.delete_tid = 0;
3297 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3298 cursor.key_beg.obj_type = 0;
3299
3300 cursor.key_end = cursor.key_beg;
3301 cursor.key_end.key += max_iterations;
3302 cursor.asof = dip->obj_asof;
3303 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3304
3305 /*
3306 * Scan all matching records (the chain), locate the one matching
3307	 * the requested path component.  The error code on search
3308	 * termination could be 0, ENOENT, or something else.
3310 *
3311 * The hammer_ip_*() functions merge in-memory records with on-disk
3312 * records for the purposes of the search.
3313 */
3314 error = hammer_ip_first(&cursor);
3315
3316 while (error == 0) {
3317 error = hammer_ip_resolve_data(&cursor);
3318 if (error)
3319 break;
3320 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3321 KKASSERT(nlen > 0);
3322 if (ncp->nc_nlen == nlen &&
3323 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3324 break;
3325 }
3326 error = hammer_ip_next(&cursor);
3327 }
3328
3329 /*
3330 * If all is ok we have to get the inode so we can adjust nlinks.
3331 * To avoid a deadlock with the flusher we must release the inode
3332 * lock on the directory when acquiring the inode for the entry.
3333 *
3334 * If the target is a directory, it must be empty.
3335 */
3336 if (error == 0) {
3337 hammer_unlock(&cursor.ip->lock);
3338 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3339 hmp->asof,
3340 cursor.data->entry.localization,
3341 0, &error);
3342 hammer_lock_sh(&cursor.ip->lock);
3343 if (error == ENOENT) {
3344 kprintf("HAMMER: WARNING: Removing "
3345 "dirent w/missing inode \"%s\"\n"
3346 "\tobj_id = %016llx\n",
3347 ncp->nc_name,
3348 (long long)cursor.data->entry.obj_id);
3349 error = 0;
3350 }
3351
3352 /*
3353 * If isdir >= 0 we validate that the entry is or is not a
3354 * directory. If isdir < 0 we don't care.
3355 */
3356 if (error == 0 && isdir >= 0 && ip) {
3357 if (isdir &&
3358 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3359 error = ENOTDIR;
3360 } else if (isdir == 0 &&
3361 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3362 error = EISDIR;
3363 }
3364 }
3365
3366 /*
3367 * If we are trying to remove a directory the directory must
3368 * be empty.
3369 *
3370 * The check directory code can loop and deadlock/retry. Our
3371 * own cursor's node locks must be released to avoid a 3-way
3372 * deadlock with the flusher if the check directory code
3373 * blocks.
3374 *
3375 * If any changes whatsoever have been made to the cursor
3376 * set EDEADLK and retry.
3377 *
3378 * WARNING: See warnings in hammer_unlock_cursor()
3379 * function.
3380 */
3381 if (error == 0 && ip && ip->ino_data.obj_type ==
3382 HAMMER_OBJTYPE_DIRECTORY) {
3383 hammer_unlock_cursor(&cursor);
3384 error = hammer_ip_check_directory_empty(trans, ip);
3385 hammer_lock_cursor(&cursor);
3386 if (cursor.flags & HAMMER_CURSOR_RETEST) {
3387 kprintf("HAMMER: Warning: avoided deadlock "
3388 "on rmdir '%s'\n",
3389 ncp->nc_name);
3390 error = EDEADLK;
3391 }
3392 }
3393
3394 /*
3395 * Delete the directory entry.
3396 *
3397 * WARNING: hammer_ip_del_directory() may have to terminate
3398 * the cursor to avoid a deadlock. It is ok to call
3399 * hammer_done_cursor() twice.
3400 */
3401 if (error == 0) {
3402 error = hammer_ip_del_directory(trans, &cursor,
3403 dip, ip);
3404 }
3405 hammer_done_cursor(&cursor);
3406 if (error == 0) {
3407 cache_setunresolved(nch);
3408 cache_setvp(nch, NULL);
3409
3410 /*
3411 * NOTE: ip->vp, if non-NULL, cannot be directly
3412 * referenced without formally acquiring the
3413 * vp since the vp might have zero refs on it,
3414 * or in the middle of a reclaim, etc.
3415 *
3416 * NOTE: The cache_setunresolved() can rip the vp
3417 * out from under us since the vp may not have
3418 * any refs, in which case ip->vp will be NULL
3419 * from the outset.
3420 */
3421 while (ip && ip->vp) {
3422 struct vnode *vp;
3423
3424 error = hammer_get_vnode(ip, &vp);
3425 if (error == 0 && vp) {
3426 vn_unlock(vp);
3427 hammer_knote(ip->vp, NOTE_DELETE);
3428 cache_inval_vp(ip->vp, CINV_DESTROY);
3429 vrele(vp);
3430 break;
3431 }
3432 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3433 }
3434 }
3435 if (ip)
3436 hammer_rel_inode(ip, 0);
3437 } else {
3438 hammer_done_cursor(&cursor);
3439 }
3440 if (error == EDEADLK)
3441 goto retry;
3442
3443 return (error);
3444}
3445
3446/************************************************************************
3447 * FIFO AND SPECFS OPS *
3448 ************************************************************************
3449 *
3450 */
3451static int
3452hammer_vop_fifoclose (struct vop_close_args *ap)
3453{
3454 /* XXX update itimes */
3455 return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3456}
3457
3458static int
3459hammer_vop_fiforead (struct vop_read_args *ap)
3460{
3461 int error;
3462
3463 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3464 /* XXX update access time */
3465 return (error);
3466}
3467
3468static int
3469hammer_vop_fifowrite (struct vop_write_args *ap)
3470{
3471 int error;
3472
3473 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3474 /* XXX update access time */
3475 return (error);
3476}
3477
3478static
3479int
3480hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3481{
3482 int error;
3483
3484 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3485 if (error)
3486 error = hammer_vop_kqfilter(ap);
3487 return(error);
3488}
3489
3490/************************************************************************
3491 * KQFILTER OPS *
3492 ************************************************************************
3493 *
3494 */
3495static void filt_hammerdetach(struct knote *kn);
3496static int filt_hammerread(struct knote *kn, long hint);
3497static int filt_hammerwrite(struct knote *kn, long hint);
3498static int filt_hammervnode(struct knote *kn, long hint);
3499
3500static struct filterops hammerread_filtops =
3501 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
3502static struct filterops hammerwrite_filtops =
3503 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
3504static struct filterops hammervnode_filtops =
3505 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
3506
3507static
3508int
3509hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3510{
3511 struct vnode *vp = ap->a_vp;
3512 struct knote *kn = ap->a_kn;
3513
3514 switch (kn->kn_filter) {
3515 case EVFILT_READ:
3516 kn->kn_fop = &hammerread_filtops;
3517 break;
3518 case EVFILT_WRITE:
3519 kn->kn_fop = &hammerwrite_filtops;
3520 break;
3521 case EVFILT_VNODE:
3522 kn->kn_fop = &hammervnode_filtops;
3523 break;
3524 default:
3525 return (EOPNOTSUPP);
3526 }
3527
3528 kn->kn_hook = (caddr_t)vp;
3529
3530 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3531
3532 return(0);
3533}
3534
3535static void
3536filt_hammerdetach(struct knote *kn)
3537{
3538 struct vnode *vp = (void *)kn->kn_hook;
3539
3540 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3541}
3542
3543static int
3544filt_hammerread(struct knote *kn, long hint)
3545{
3546 struct vnode *vp = (void *)kn->kn_hook;
3547 hammer_inode_t ip = VTOI(vp);
3548 hammer_mount_t hmp = ip->hmp;
3549 off_t off;
3550
3551 if (hint == NOTE_REVOKE) {
3552 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3553 return(1);
3554 }
3555 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */
3556 off = ip->ino_data.size - kn->kn_fp->f_offset;
3557 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3558 lwkt_reltoken(&hmp->fs_token);
3559 if (kn->kn_sfflags & NOTE_OLDAPI)
3560 return(1);
3561 return (kn->kn_data != 0);
3562}
3563
3564static int
3565filt_hammerwrite(struct knote *kn, long hint)
3566{
3567 if (hint == NOTE_REVOKE)
3568 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3569 kn->kn_data = 0;
3570 return (1);
3571}
3572
3573static int
3574filt_hammervnode(struct knote *kn, long hint)
3575{
3576 if (kn->kn_sfflags & hint)
3577 kn->kn_fflags |= hint;
3578 if (hint == NOTE_REVOKE) {
3579 kn->kn_flags |= EV_EOF;
3580 return (1);
3581 }
3582 return (kn->kn_fflags != 0);
3583}
3584