gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
	35	*/
	36
	37	#include <sys/param.h>
	38	#include <sys/systm.h>
	39	#include <sys/kernel.h>
	40	#include <sys/fcntl.h>
	41	#include <sys/namecache.h>
	42	#include <sys/vnode.h>
	43	#include <sys/lockf.h>
	44	#include <sys/event.h>
	45	#include <sys/stat.h>
	46	#include <sys/dirent.h>
	47	#include <sys/file.h>
	48	#include <vm/vm_extern.h>
	49	#include <vfs/fifofs/fifo.h>
	50
	51	#include "hammer.h"
	52
	53	/*
	54	* USERFS VNOPS
	55	*/
	56	/static int hammer_vop_vnoperate(struct vop_generic_args );*/
	57	static int hammer_vop_fsync(struct vop_fsync_args *);
	58	static int hammer_vop_read(struct vop_read_args *);
	59	static int hammer_vop_write(struct vop_write_args *);
	60	static int hammer_vop_access(struct vop_access_args *);
	61	static int hammer_vop_advlock(struct vop_advlock_args *);
	62	static int hammer_vop_close(struct vop_close_args *);
	63	static int hammer_vop_ncreate(struct vop_ncreate_args *);
	64	static int hammer_vop_getattr(struct vop_getattr_args *);
	65	static int hammer_vop_nresolve(struct vop_nresolve_args *);
	66	static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
	67	static int hammer_vop_nlink(struct vop_nlink_args *);
	68	static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
	69	static int hammer_vop_nmknod(struct vop_nmknod_args *);
	70	static int hammer_vop_open(struct vop_open_args *);
	71	static int hammer_vop_print(struct vop_print_args *);
	72	static int hammer_vop_readdir(struct vop_readdir_args *);
	73	static int hammer_vop_readlink(struct vop_readlink_args *);
	74	static int hammer_vop_nremove(struct vop_nremove_args *);
	75	static int hammer_vop_nrename(struct vop_nrename_args *);
	76	static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
	77	static int hammer_vop_markatime(struct vop_markatime_args *);
	78	static int hammer_vop_setattr(struct vop_setattr_args *);
	79	static int hammer_vop_strategy(struct vop_strategy_args *);
	80	static int hammer_vop_bmap(struct vop_bmap_args *ap);
	81	static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
	82	static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
	83	static int hammer_vop_ioctl(struct vop_ioctl_args *);
	84	static int hammer_vop_mountctl(struct vop_mountctl_args *);
	85	static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
	86
	87	static int hammer_vop_fifoclose (struct vop_close_args *);
	88	static int hammer_vop_fiforead (struct vop_read_args *);
	89	static int hammer_vop_fifowrite (struct vop_write_args *);
	90	static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
	91
	92	struct vop_ops hammer_vnode_vops = {
	93	.vop_default = vop_defaultop,
	94	.vop_fsync = hammer_vop_fsync,
	95	.vop_getpages = vop_stdgetpages,
	96	.vop_putpages = vop_stdputpages,
	97	.vop_read = hammer_vop_read,
	98	.vop_write = hammer_vop_write,
	99	.vop_access = hammer_vop_access,
	100	.vop_advlock = hammer_vop_advlock,
	101	.vop_close = hammer_vop_close,
	102	.vop_ncreate = hammer_vop_ncreate,
	103	.vop_getattr = hammer_vop_getattr,
	104	.vop_inactive = hammer_vop_inactive,
	105	.vop_reclaim = hammer_vop_reclaim,
	106	.vop_nresolve = hammer_vop_nresolve,
	107	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	108	.vop_nlink = hammer_vop_nlink,
	109	.vop_nmkdir = hammer_vop_nmkdir,
	110	.vop_nmknod = hammer_vop_nmknod,
	111	.vop_open = hammer_vop_open,
	112	.vop_pathconf = vop_stdpathconf,
	113	.vop_print = hammer_vop_print,
	114	.vop_readdir = hammer_vop_readdir,
	115	.vop_readlink = hammer_vop_readlink,
	116	.vop_nremove = hammer_vop_nremove,
	117	.vop_nrename = hammer_vop_nrename,
	118	.vop_nrmdir = hammer_vop_nrmdir,
	119	.vop_markatime = hammer_vop_markatime,
	120	.vop_setattr = hammer_vop_setattr,
	121	.vop_bmap = hammer_vop_bmap,
	122	.vop_strategy = hammer_vop_strategy,
	123	.vop_nsymlink = hammer_vop_nsymlink,
	124	.vop_nwhiteout = hammer_vop_nwhiteout,
	125	.vop_ioctl = hammer_vop_ioctl,
	126	.vop_mountctl = hammer_vop_mountctl,
	127	.vop_kqfilter = hammer_vop_kqfilter
	128	};
	129
	130	struct vop_ops hammer_spec_vops = {
	131	.vop_default = vop_defaultop,
	132	.vop_fsync = hammer_vop_fsync,
	133	.vop_read = vop_stdnoread,
	134	.vop_write = vop_stdnowrite,
	135	.vop_access = hammer_vop_access,
	136	.vop_close = hammer_vop_close,
	137	.vop_markatime = hammer_vop_markatime,
	138	.vop_getattr = hammer_vop_getattr,
	139	.vop_inactive = hammer_vop_inactive,
	140	.vop_reclaim = hammer_vop_reclaim,
	141	.vop_setattr = hammer_vop_setattr
	142	};
	143
	144	struct vop_ops hammer_fifo_vops = {
	145	.vop_default = fifo_vnoperate,
	146	.vop_fsync = hammer_vop_fsync,
	147	.vop_read = hammer_vop_fiforead,
	148	.vop_write = hammer_vop_fifowrite,
	149	.vop_access = hammer_vop_access,
	150	.vop_close = hammer_vop_fifoclose,
	151	.vop_markatime = hammer_vop_markatime,
	152	.vop_getattr = hammer_vop_getattr,
	153	.vop_inactive = hammer_vop_inactive,
	154	.vop_reclaim = hammer_vop_reclaim,
	155	.vop_setattr = hammer_vop_setattr,
	156	.vop_kqfilter = hammer_vop_fifokqfilter
	157	};
	158
	159	static __inline
	160	void
	161	hammer_knote(struct vnode *vp, int flags)
	162	{
	163	if (flags)
	164	KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
	165	}
	166
	167	#ifdef DEBUG_TRUNCATE
	168	struct hammer_inode *HammerTruncIp;
	169	#endif
	170
	171	static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
	172	struct vnode dvp, struct ucred cred,
	173	int flags, int isdir);
	174	static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
	175	static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
	176
	177	#if 0
	178	static
	179	int
	180	hammer_vop_vnoperate(struct vop_generic_args *)
	181	{
	182	return (VOCALL(&hammer_vnode_vops, ap));
	183	}
	184	#endif
	185
	186	/*
	187	* hammer_vop_fsync { vp, waitfor }
	188	*
	189	* fsync() an inode to disk and wait for it to be completely committed
	190	* such that the information would not be undone if a crash occured after
	191	* return.
	192	*
	193	* NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
	194	* a REDO log. A sysctl is provided to relax HAMMER's fsync()
	195	* operation.
	196	*
	197	* Ultimately the combination of a REDO log and use of fast storage
	198	* to front-end cluster caches will make fsync fast, but it aint
	199	* here yet. And, in anycase, we need real transactional
	200	* all-or-nothing features which are not restricted to a single file.
	201	*/
	202	static
	203	int
	204	hammer_vop_fsync(struct vop_fsync_args *ap)
	205	{
	206	hammer_inode_t ip = VTOI(ap->a_vp);
	207	hammer_mount_t hmp = ip->hmp;
	208	int waitfor = ap->a_waitfor;
	209	int mode;
	210
	211	lwkt_gettoken(&hmp->fs_token);
	212
	213	/*
	214	* Fsync rule relaxation (default is either full synchronous flush
	215	* or REDO semantics with synchronous flush).
	216	*/
	217	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
	218	switch(hammer_fsync_mode) {
	219	case 0:
	220	mode0:
	221	/* no REDO, full synchronous flush */
	222	goto skip;
	223	case 1:
	224	mode1:
	225	/* no REDO, full asynchronous flush */
	226	if (waitfor == MNT_WAIT)
	227	waitfor = MNT_NOWAIT;
	228	goto skip;
	229	case 2:
	230	/* REDO semantics, synchronous flush */
	231	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
	232	goto mode0;
	233	mode = HAMMER_FLUSH_UNDOS_AUTO;
	234	break;
	235	case 3:
	236	/* REDO semantics, relaxed asynchronous flush */
	237	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
	238	goto mode1;
	239	mode = HAMMER_FLUSH_UNDOS_RELAXED;
	240	if (waitfor == MNT_WAIT)
	241	waitfor = MNT_NOWAIT;
	242	break;
	243	case 4:
	244	/* ignore the fsync() system call */
	245	lwkt_reltoken(&hmp->fs_token);
	246	return(0);
	247	default:
	248	/* we have to do something */
	249	mode = HAMMER_FLUSH_UNDOS_RELAXED;
	250	if (waitfor == MNT_WAIT)
	251	waitfor = MNT_NOWAIT;
	252	break;
	253	}
	254
	255	/*
	256	* Fast fsync only needs to flush the UNDO/REDO fifo if
	257	* HAMMER_INODE_REDO is non-zero and the only modifications
	258	* made to the file are write or write-extends.
	259	*/
	260	if ((ip->flags & HAMMER_INODE_REDO) &&
	261	(ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
	262	) {
	263	++hammer_count_fsyncs;
	264	hammer_flusher_flush_undos(hmp, mode);
	265	ip->redo_count = 0;
	266	lwkt_reltoken(&hmp->fs_token);
	267	return(0);
	268	}
	269
	270	/*
	271	* REDO is enabled by fsync(), the idea being we really only
	272	* want to lay down REDO records when programs are using
	273	* fsync() heavily. The first fsync() on the file starts
	274	* the gravy train going and later fsync()s keep it hot by
	275	* resetting the redo_count.
	276	*
	277	* We weren't running REDOs before now so we have to fall
	278	* through and do a full fsync of what we have.
	279	*/
	280	if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
	281	(hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
	282	ip->flags \|= HAMMER_INODE_REDO;
	283	ip->redo_count = 0;
	284	}
	285	}
	286	skip:
	287
	288	/*
	289	* Do a full flush sequence.
	290	*/
	291	++hammer_count_fsyncs;
	292	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	293	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	294	if (waitfor == MNT_WAIT) {
	295	vn_unlock(ap->a_vp);
	296	hammer_wait_inode(ip);
	297	vn_lock(ap->a_vp, LK_EXCLUSIVE \| LK_RETRY);
	298	}
	299	lwkt_reltoken(&hmp->fs_token);
	300	return (ip->error);
	301	}
	302
	303	/*
	304	* hammer_vop_read { vp, uio, ioflag, cred }
	305	*
	306	* MPSAFE (for the cache safe does not require fs_token)
	307	*/
	308	static
	309	int
	310	hammer_vop_read(struct vop_read_args *ap)
	311	{
	312	struct hammer_transaction trans;
	313	hammer_inode_t ip;
	314	hammer_mount_t hmp;
	315	off_t offset;
	316	struct buf *bp;
	317	struct uio *uio;
	318	int error;
	319	int n;
	320	int seqcount;
	321	int ioseqcount;
	322	int blksize;
	323	int bigread;
	324	int got_fstoken;
	325
	326	if (ap->a_vp->v_type != VREG)
	327	return (EINVAL);
	328	ip = VTOI(ap->a_vp);
	329	hmp = ip->hmp;
	330	error = 0;
	331	uio = ap->a_uio;
	332
	333	/*
	334	* Allow the UIO's size to override the sequential heuristic.
	335	*/
	336	blksize = hammer_blocksize(uio->uio_offset);
	337	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	338	ioseqcount = (ap->a_ioflag >> 16);
	339	if (seqcount < ioseqcount)
	340	seqcount = ioseqcount;
	341
	342	/*
	343	* If reading or writing a huge amount of data we have to break
	344	* atomicy and allow the operation to be interrupted by a signal
	345	* or it can DOS the machine.
	346	*/
	347	bigread = (uio->uio_resid > 100 * 1024 * 1024);
	348	got_fstoken = 0;
	349
	350	/*
	351	* Access the data typically in HAMMER_BUFSIZE blocks via the
	352	* buffer cache, but HAMMER may use a variable block size based
	353	* on the offset.
	354	*
	355	* XXX Temporary hack, delay the start transaction while we remain
	356	* MPSAFE. NOTE: ino_data.size cannot change while vnode is
	357	* locked-shared.
	358	*/
	359	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
	360	int64_t base_offset;
	361	int64_t file_limit;
	362
	363	blksize = hammer_blocksize(uio->uio_offset);
	364	offset = (int)uio->uio_offset & (blksize - 1);
	365	base_offset = uio->uio_offset - offset;
	366
	367	if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
	368	break;
	369
	370	/*
	371	* MPSAFE
	372	*/
	373	bp = getcacheblk(ap->a_vp, base_offset, blksize);
	374	if (bp) {
	375	error = 0;
	376	goto skip;
	377	} else {
	378	if (ap->a_ioflag & IO_NRDELAY)
	379	return (EWOULDBLOCK);
	380	}
	381
	382	/*
	383	* MPUNSAFE
	384	*/
	385	if (got_fstoken == 0) {
	386	lwkt_gettoken(&hmp->fs_token);
	387	got_fstoken = 1;
	388	hammer_start_transaction(&trans, ip->hmp);
	389	}
	390
	391	if (hammer_cluster_enable) {
	392	/*
	393	* Use file_limit to prevent cluster_read() from
	394	* creating buffers of the wrong block size past
	395	* the demarc.
	396	*/
	397	file_limit = ip->ino_data.size;
	398	if (base_offset < HAMMER_XDEMARC &&
	399	file_limit > HAMMER_XDEMARC) {
	400	file_limit = HAMMER_XDEMARC;
	401	}
	402	error = cluster_read(ap->a_vp,
	403	file_limit, base_offset,
	404	blksize, uio->uio_resid,
	405	seqcount * BKVASIZE, &bp);
	406	} else {
	407	error = bread(ap->a_vp, base_offset, blksize, &bp);
	408	}
	409	if (error) {
	410	brelse(bp);
	411	break;
	412	}
	413	skip:
	414	if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
	415	kprintf("doff %016jx read file %016jx@%016jx\n",
	416	(intmax_t)bp->b_bio2.bio_offset,
	417	(intmax_t)ip->obj_id,
	418	(intmax_t)bp->b_loffset);
	419	}
	420	bp->b_flags &= ~B_IODEBUG;
	421
	422	/* bp->b_flags \|= B_CLUSTEROK; temporarily disabled */
	423	n = blksize - offset;
	424	if (n > uio->uio_resid)
	425	n = uio->uio_resid;
	426	if (n > ip->ino_data.size - uio->uio_offset)
	427	n = (int)(ip->ino_data.size - uio->uio_offset);
	428	if (got_fstoken)
	429	lwkt_reltoken(&hmp->fs_token);
	430
	431	/*
	432	* Set B_AGE, data has a lower priority than meta-data.
	433	*
	434	* Use a hold/unlock/drop sequence to run the uiomove
	435	* with the buffer unlocked, avoiding deadlocks against
	436	* read()s on mmap()'d spaces.
	437	*/
	438	bp->b_flags \|= B_AGE;
	439	bqhold(bp);
	440	bqrelse(bp);
	441	error = uiomove((char *)bp->b_data + offset, n, uio);
	442	bqdrop(bp);
	443
	444	if (got_fstoken)
	445	lwkt_gettoken(&hmp->fs_token);
	446
	447	if (error)
	448	break;
	449	hammer_stats_file_read += n;
	450	}
	451
	452	/*
	453	* XXX only update the atime if we had to get the MP lock.
	454	* XXX hack hack hack, fixme.
	455	*/
	456	if (got_fstoken) {
	457	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	458	(ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
	459	ip->ino_data.atime = trans.time;
	460	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	461	}
	462	hammer_done_transaction(&trans);
	463	lwkt_reltoken(&hmp->fs_token);
	464	}
	465	return (error);
	466	}
	467
	468	/*
	469	* hammer_vop_write { vp, uio, ioflag, cred }
	470	*/
	471	static
	472	int
	473	hammer_vop_write(struct vop_write_args *ap)
	474	{
	475	struct hammer_transaction trans;
	476	struct hammer_inode *ip;
	477	hammer_mount_t hmp;
	478	struct uio *uio;
	479	int offset;
	480	off_t base_offset;
	481	struct buf *bp;
	482	int kflags;
	483	int error;
	484	int n;
	485	int flags;
	486	int seqcount;
	487	int bigwrite;
	488
	489	if (ap->a_vp->v_type != VREG)
	490	return (EINVAL);
	491	ip = VTOI(ap->a_vp);
	492	hmp = ip->hmp;
	493	error = 0;
	494	kflags = 0;
	495	seqcount = ap->a_ioflag >> 16;
	496
	497	if (ip->flags & HAMMER_INODE_RO)
	498	return (EROFS);
	499
	500	/*
	501	* Create a transaction to cover the operations we perform.
	502	*/
	503	lwkt_gettoken(&hmp->fs_token);
	504	hammer_start_transaction(&trans, hmp);
	505	uio = ap->a_uio;
	506
	507	/*
	508	* Check append mode
	509	*/
	510	if (ap->a_ioflag & IO_APPEND)
	511	uio->uio_offset = ip->ino_data.size;
	512
	513	/*
	514	* Check for illegal write offsets. Valid range is 0...2^63-1.
	515	*
	516	* NOTE: the base_off assignment is required to work around what
	517	* I consider to be a GCC-4 optimization bug.
	518	*/
	519	if (uio->uio_offset < 0) {
	520	hammer_done_transaction(&trans);
	521	lwkt_reltoken(&hmp->fs_token);
	522	return (EFBIG);
	523	}
	524	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	525	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
	526	hammer_done_transaction(&trans);
	527	lwkt_reltoken(&hmp->fs_token);
	528	return (EFBIG);
	529	}
	530
	531	/*
	532	* If reading or writing a huge amount of data we have to break
	533	* atomicy and allow the operation to be interrupted by a signal
	534	* or it can DOS the machine.
	535	*
	536	* Preset redo_count so we stop generating REDOs earlier if the
	537	* limit is exceeded.
	538	*/
	539	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	540	if ((ip->flags & HAMMER_INODE_REDO) &&
	541	ip->redo_count < hammer_limit_redo) {
	542	ip->redo_count += uio->uio_resid;
	543	}
	544
	545	/*
	546	* Access the data typically in HAMMER_BUFSIZE blocks via the
	547	* buffer cache, but HAMMER may use a variable block size based
	548	* on the offset.
	549	*/
	550	while (uio->uio_resid > 0) {
	551	int fixsize = 0;
	552	int blksize;
	553	int blkmask;
	554	int trivial;
	555	int endofblk;
	556	off_t nsize;
	557
	558	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
	559	break;
	560	if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
	561	break;
	562
	563	blksize = hammer_blocksize(uio->uio_offset);
	564
	565	/*
	566	* Do not allow HAMMER to blow out the buffer cache. Very
	567	* large UIOs can lockout other processes due to bwillwrite()
	568	* mechanics.
	569	*
	570	* The hammer inode is not locked during these operations.
	571	* The vnode is locked which can interfere with the pageout
	572	* daemon for non-UIO_NOCOPY writes but should not interfere
	573	* with the buffer cache. Even so, we cannot afford to
	574	* allow the pageout daemon to build up too many dirty buffer
	575	* cache buffers.
	576	*
	577	* Only call this if we aren't being recursively called from
	578	* a virtual disk device (vn), else we may deadlock.
	579	*/
	580	if ((ap->a_ioflag & IO_RECURSE) == 0)
	581	bwillwrite(blksize);
	582
	583	/*
	584	* Control the number of pending records associated with
	585	* this inode. If too many have accumulated start a
	586	* flush. Try to maintain a pipeline with the flusher.
	587	*
	588	* NOTE: It is possible for other sources to grow the
	589	* records but not necessarily issue another flush,
	590	* so use a timeout and ensure that a re-flush occurs.
	591	*/
	592	if (ip->rsv_recs >= hammer_limit_inode_recs) {
	593	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	594	while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
	595	ip->flags \|= HAMMER_INODE_RECSW;
	596	tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
	597	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	598	}
	599	}
	600
	601	#if 0
	602	/*
	603	* Do not allow HAMMER to blow out system memory by
	604	* accumulating too many records. Records are so well
	605	* decoupled from the buffer cache that it is possible
	606	* for userland to push data out to the media via
	607	* direct-write, but build up the records queued to the
	608	* backend faster then the backend can flush them out.
	609	* HAMMER has hit its write limit but the frontend has
	610	* no pushback to slow it down.
	611	*/
	612	if (hmp->rsv_recs > hammer_limit_recs / 2) {
	613	/*
	614	* Get the inode on the flush list
	615	*/
	616	if (ip->rsv_recs >= 64)
	617	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	618	else if (ip->rsv_recs >= 16)
	619	hammer_flush_inode(ip, 0);
	620
	621	/*
	622	* Keep the flusher going if the system keeps
	623	* queueing records.
	624	*/
	625	delta = hmp->count_newrecords -
	626	hmp->last_newrecords;
	627	if (delta < 0 \|\| delta > hammer_limit_recs / 2) {
	628	hmp->last_newrecords = hmp->count_newrecords;
	629	hammer_sync_hmp(hmp, MNT_NOWAIT);
	630	}
	631
	632	/*
	633	* If we have gotten behind start slowing
	634	* down the writers.
	635	*/
	636	delta = (hmp->rsv_recs - hammer_limit_recs) *
	637	hz / hammer_limit_recs;
	638	if (delta > 0)
	639	tsleep(&trans, 0, "hmrslo", delta);
	640	}
	641	#endif
	642
	643	/*
	644	* Calculate the blocksize at the current offset and figure
	645	* out how much we can actually write.
	646	*/
	647	blkmask = blksize - 1;
	648	offset = (int)uio->uio_offset & blkmask;
	649	base_offset = uio->uio_offset & ~(int64_t)blkmask;
	650	n = blksize - offset;
	651	if (n > uio->uio_resid) {
	652	n = uio->uio_resid;
	653	endofblk = 0;
	654	} else {
	655	endofblk = 1;
	656	}
	657	nsize = uio->uio_offset + n;
	658	if (nsize > ip->ino_data.size) {
	659	if (uio->uio_offset > ip->ino_data.size)
	660	trivial = 0;
	661	else
	662	trivial = 1;
	663	nvextendbuf(ap->a_vp,
	664	ip->ino_data.size,
	665	nsize,
	666	hammer_blocksize(ip->ino_data.size),
	667	hammer_blocksize(nsize),
	668	hammer_blockoff(ip->ino_data.size),
	669	hammer_blockoff(nsize),
	670	trivial);
	671	fixsize = 1;
	672	kflags \|= NOTE_EXTEND;
	673	}
	674
	675	if (uio->uio_segflg == UIO_NOCOPY) {
	676	/*
	677	* Issuing a write with the same data backing the
	678	* buffer. Instantiate the buffer to collect the
	679	* backing vm pages, then read-in any missing bits.
	680	*
	681	* This case is used by vop_stdputpages().
	682	*/
	683	bp = getblk(ap->a_vp, base_offset,
	684	blksize, GETBLK_BHEAVY, 0);
	685	if ((bp->b_flags & B_CACHE) == 0) {
	686	bqrelse(bp);
	687	error = bread(ap->a_vp, base_offset,
	688	blksize, &bp);
	689	}
	690	} else if (offset == 0 && uio->uio_resid >= blksize) {
	691	/*
	692	* Even though we are entirely overwriting the buffer
	693	* we may still have to zero it out to avoid a
	694	* mmap/write visibility issue.
	695	*/
	696	bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
	697	if ((bp->b_flags & B_CACHE) == 0)
	698	vfs_bio_clrbuf(bp);
	699	} else if (base_offset >= ip->ino_data.size) {
	700	/*
	701	* If the base offset of the buffer is beyond the
	702	* file EOF, we don't have to issue a read.
	703	*/
	704	bp = getblk(ap->a_vp, base_offset,
	705	blksize, GETBLK_BHEAVY, 0);
	706	vfs_bio_clrbuf(bp);
	707	} else {
	708	/*
	709	* Partial overwrite, read in any missing bits then
	710	* replace the portion being written.
	711	*/
	712	error = bread(ap->a_vp, base_offset, blksize, &bp);
	713	if (error == 0)
	714	bheavy(bp);
	715	}
	716	if (error == 0) {
	717	lwkt_reltoken(&hmp->fs_token);
	718	error = uiomove(bp->b_data + offset, n, uio);
	719	lwkt_gettoken(&hmp->fs_token);
	720	}
	721
	722	/*
	723	* Generate REDO records if enabled and redo_count will not
	724	* exceeded the limit.
	725	*
	726	* If redo_count exceeds the limit we stop generating records
	727	* and clear HAMMER_INODE_REDO. This will cause the next
	728	* fsync() to do a full meta-data sync instead of just an
	729	* UNDO/REDO fifo update.
	730	*
	731	* When clearing HAMMER_INODE_REDO any pre-existing REDOs
	732	* will still be tracked. The tracks will be terminated
	733	* when the related meta-data (including possible data
	734	* modifications which are not tracked via REDO) is
	735	* flushed.
	736	*/
	737	if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
	738	if (ip->redo_count < hammer_limit_redo) {
	739	bp->b_flags \|= B_VFSFLAG1;
	740	error = hammer_generate_redo(&trans, ip,
	741	base_offset + offset,
	742	HAMMER_REDO_WRITE,
	743	bp->b_data + offset,
	744	(size_t)n);
	745	} else {
	746	ip->flags &= ~HAMMER_INODE_REDO;
	747	}
	748	}
	749
	750	/*
	751	* If we screwed up we have to undo any VM size changes we
	752	* made.
	753	*/
	754	if (error) {
	755	brelse(bp);
	756	if (fixsize) {
	757	nvtruncbuf(ap->a_vp, ip->ino_data.size,
	758	hammer_blocksize(ip->ino_data.size),
	759	hammer_blockoff(ip->ino_data.size));
	760	}
	761	break;
	762	}
	763	kflags \|= NOTE_WRITE;
	764	hammer_stats_file_write += n;
	765	/* bp->b_flags \|= B_CLUSTEROK; temporarily disabled */
	766	if (ip->ino_data.size < uio->uio_offset) {
	767	ip->ino_data.size = uio->uio_offset;
	768	flags = HAMMER_INODE_SDIRTY;
	769	} else {
	770	flags = 0;
	771	}
	772	ip->ino_data.mtime = trans.time;
	773	flags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_BUFS;
	774	hammer_modify_inode(&trans, ip, flags);
	775
	776	/*
	777	* Once we dirty the buffer any cached zone-X offset
	778	* becomes invalid. HAMMER NOTE: no-history mode cannot
	779	* allow overwriting over the same data sector unless
	780	* we provide UNDOs for the old data, which we don't.
	781	*/
	782	bp->b_bio2.bio_offset = NOOFFSET;
	783
	784	/*
	785	* Final buffer disposition.
	786	*
	787	* Because meta-data updates are deferred, HAMMER is
	788	* especially sensitive to excessive bdwrite()s because
	789	* the I/O stream is not broken up by disk reads. So the
	790	* buffer cache simply cannot keep up.
	791	*
	792	* WARNING! blksize is variable. cluster_write() is
	793	* expected to not blow up if it encounters
	794	* buffers that do not match the passed blksize.
	795	*
	796	* NOTE! Hammer shouldn't need to bawrite()/cluster_write().
	797	* The ip->rsv_recs check should burst-flush the data.
	798	* If we queue it immediately the buf could be left
	799	* locked on the device queue for a very long time.
	800	*
	801	* NOTE! To avoid degenerate stalls due to mismatched block
	802	* sizes we only honor IO_DIRECT on the write which
	803	* abuts the end of the buffer. However, we must
	804	* honor IO_SYNC in case someone is silly enough to
	805	* configure a HAMMER file as swap, or when HAMMER
	806	* is serving NFS (for commits). Ick ick.
	807	*/
	808	bp->b_flags \|= B_AGE;
	809	if (ap->a_ioflag & IO_SYNC) {
	810	bwrite(bp);
	811	} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
	812	bawrite(bp);
	813	} else {
	814	#if 0
	815	if (offset + n == blksize) {
	816	if (hammer_cluster_enable == 0 \|\|
	817	(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
	818	bawrite(bp);
	819	} else {
	820	cluster_write(bp, ip->ino_data.size,
	821	blksize, seqcount);
	822	}
	823	} else {
	824	#endif
	825	bdwrite(bp);
	826	}
	827	}
	828	hammer_done_transaction(&trans);
	829	hammer_knote(ap->a_vp, kflags);
	830	lwkt_reltoken(&hmp->fs_token);
	831	return (error);
	832	}
	833
	834	/*
	835	* hammer_vop_access { vp, mode, cred }
	836	*
	837	* MPSAFE - does not require fs_token
	838	*/
	839	static
	840	int
	841	hammer_vop_access(struct vop_access_args *ap)
	842	{
	843	struct hammer_inode *ip = VTOI(ap->a_vp);
	844	uid_t uid;
	845	gid_t gid;
	846	int error;
	847
	848	++hammer_stats_file_iopsr;
	849	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	850	gid = hammer_to_unix_xid(&ip->ino_data.gid);
	851
	852	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
	853	ip->ino_data.uflags);
	854	return (error);
	855	}
	856
	857	/*
	858	* hammer_vop_advlock { vp, id, op, fl, flags }
	859	*
	860	* MPSAFE - does not require fs_token
	861	*/
	862	static
	863	int
	864	hammer_vop_advlock(struct vop_advlock_args *ap)
	865	{
	866	hammer_inode_t ip = VTOI(ap->a_vp);
	867
	868	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
	869	}
	870
	871	/*
	872	* hammer_vop_close { vp, fflag }
	873	*
	874	* We can only sync-on-close for normal closes. XXX disabled for now.
	875	*/
	876	static
	877	int
	878	hammer_vop_close(struct vop_close_args *ap)
	879	{
	880	#if 0
	881	struct vnode *vp = ap->a_vp;
	882	hammer_inode_t ip = VTOI(vp);
	883	int waitfor;
	884	if (ip->flags & (HAMMER_INODE_CLOSESYNC\|HAMMER_INODE_CLOSEASYNC)) {
	885	if (vn_islocked(vp) == LK_EXCLUSIVE &&
	886	(vp->v_flag & (VINACTIVE\|VRECLAIMED)) == 0) {
	887	if (ip->flags & HAMMER_INODE_CLOSESYNC)
	888	waitfor = MNT_WAIT;
	889	else
	890	waitfor = MNT_NOWAIT;
	891	ip->flags &= ~(HAMMER_INODE_CLOSESYNC \|
	892	HAMMER_INODE_CLOSEASYNC);
	893	VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
	894	}
	895	}
	896	#endif
	897	return (vop_stdclose(ap));
	898	}
	899
	900	/*
	901	* hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
	902	*
	903	* The operating system has already ensured that the directory entry
	904	* does not exist and done all appropriate namespace locking.
	905	*/
	906	static
	907	int
	908	hammer_vop_ncreate(struct vop_ncreate_args *ap)
	909	{
	910	struct hammer_transaction trans;
	911	struct hammer_inode *dip;
	912	struct hammer_inode *nip;
	913	struct nchandle *nch;
	914	hammer_mount_t hmp;
	915	int error;
	916
	917	nch = ap->a_nch;
	918	dip = VTOI(ap->a_dvp);
	919	hmp = dip->hmp;
	920
	921	if (dip->flags & HAMMER_INODE_RO)
	922	return (EROFS);
	923	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	924	return (error);
	925
	926	/*
	927	* Create a transaction to cover the operations we perform.
	928	*/
	929	lwkt_gettoken(&hmp->fs_token);
	930	hammer_start_transaction(&trans, hmp);
	931	++hammer_stats_file_iopsw;
	932
	933	/*
	934	* Create a new filesystem object of the requested type. The
	935	* returned inode will be referenced and shared-locked to prevent
	936	* it from being moved to the flusher.
	937	*/
	938	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	939	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	940	NULL, &nip);
	941	if (error) {
	942	hkprintf("hammer_create_inode error %d\n", error);
	943	hammer_done_transaction(&trans);
	944	*ap->a_vpp = NULL;
	945	lwkt_reltoken(&hmp->fs_token);
	946	return (error);
	947	}
	948
	949	/*
	950	* Add the new filesystem object to the directory. This will also
	951	* bump the inode's link count.
	952	*/
	953	error = hammer_ip_add_directory(&trans, dip,
	954	nch->ncp->nc_name, nch->ncp->nc_nlen,
	955	nip);
	956	if (error)
	957	hkprintf("hammer_ip_add_directory error %d\n", error);
	958
	959	/*
	960	* Finish up.
	961	*/
	962	if (error) {
	963	hammer_rel_inode(nip, 0);
	964	hammer_done_transaction(&trans);
	965	*ap->a_vpp = NULL;
	966	} else {
	967	error = hammer_get_vnode(nip, ap->a_vpp);
	968	hammer_done_transaction(&trans);
	969	hammer_rel_inode(nip, 0);
	970	if (error == 0) {
	971	cache_setunresolved(ap->a_nch);
	972	cache_setvp(ap->a_nch, *ap->a_vpp);
	973	}
	974	hammer_knote(ap->a_dvp, NOTE_WRITE);
	975	}
	976	lwkt_reltoken(&hmp->fs_token);
	977	return (error);
	978	}
	979
	980	/*
	981	* hammer_vop_getattr { vp, vap }
	982	*
	983	* Retrieve an inode's attribute information. When accessing inodes
	984	* historically we fake the atime field to ensure consistent results.
	985	* The atime field is stored in the B-Tree element and allowed to be
	986	* updated without cycling the element.
	987	*
	988	* MPSAFE - does not require fs_token
	989	*/
	990	static
	991	int
	992	hammer_vop_getattr(struct vop_getattr_args *ap)
	993	{
	994	struct hammer_inode *ip = VTOI(ap->a_vp);
	995	struct vattr *vap = ap->a_vap;
	996
	997	/*
	998	* We want the fsid to be different when accessing a filesystem
	999	* with different as-of's so programs like diff don't think
	1000	* the files are the same.
	1001	*
	1002	* We also want the fsid to be the same when comparing snapshots,
	1003	* or when comparing mirrors (which might be backed by different
	1004	* physical devices). HAMMER fsids are based on the PFS's
	1005	* shared_uuid field.
	1006	*
	1007	* XXX there is a chance of collision here. The va_fsid reported
	1008	* by stat is different from the more involved fsid used in the
	1009	* mount structure.
	1010	*/
	1011	++hammer_stats_file_iopsr;
	1012	hammer_lock_sh(&ip->lock);
	1013	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
	1014	(u_int32_t)(ip->obj_asof >> 32);
	1015
	1016	vap->va_fileid = ip->ino_leaf.base.obj_id;
	1017	vap->va_mode = ip->ino_data.mode;
	1018	vap->va_nlink = ip->ino_data.nlinks;
	1019	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	1020	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	1021	vap->va_rmajor = 0;
	1022	vap->va_rminor = 0;
	1023	vap->va_size = ip->ino_data.size;
	1024
	1025	/*
	1026	* Special case for @@PFS softlinks. The actual size of the
	1027	* expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	1028	* or for MAX_TID is "@@-1:%05d" == 10 bytes.
	1029	*/
	1030	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	1031	ip->ino_data.size == 10 &&
	1032	ip->obj_asof == HAMMER_MAX_TID &&
	1033	ip->obj_localization == 0 &&
	1034	strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
	1035	if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
	1036	vap->va_size = 26;
	1037	else
	1038	vap->va_size = 10;
	1039	}
	1040
	1041	/*
	1042	* We must provide a consistent atime and mtime for snapshots
	1043	* so people can do a 'tar cf - ... \| md5' on them and get
	1044	* consistent results.
	1045	*/
	1046	if (ip->flags & HAMMER_INODE_RO) {
	1047	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
	1048	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	1049	} else {
	1050	hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
	1051	hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	1052	}
	1053	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	1054	vap->va_flags = ip->ino_data.uflags;
	1055	vap->va_gen = 1; /* hammer inums are unique for all time */
	1056	vap->va_blocksize = HAMMER_BUFSIZE;
	1057	if (ip->ino_data.size >= HAMMER_XDEMARC) {
	1058	vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
	1059	~HAMMER_XBUFMASK64;
	1060	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
	1061	vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
	1062	~HAMMER_BUFMASK64;
	1063	} else {
	1064	vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	1065	}
	1066
	1067	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	1068	vap->va_filerev = 0; /* XXX */
	1069	vap->va_uid_uuid = ip->ino_data.uid;
	1070	vap->va_gid_uuid = ip->ino_data.gid;
	1071	vap->va_fsid_uuid = ip->hmp->fsid;
	1072	vap->va_vaflags = VA_UID_UUID_VALID \| VA_GID_UUID_VALID \|
	1073	VA_FSID_UUID_VALID;
	1074
	1075	switch (ip->ino_data.obj_type) {
	1076	case HAMMER_OBJTYPE_CDEV:
	1077	case HAMMER_OBJTYPE_BDEV:
	1078	vap->va_rmajor = ip->ino_data.rmajor;
	1079	vap->va_rminor = ip->ino_data.rminor;
	1080	break;
	1081	default:
	1082	break;
	1083	}
	1084	hammer_unlock(&ip->lock);
	1085	return(0);
	1086	}
	1087
	1088	/*
	1089	* hammer_vop_nresolve { nch, dvp, cred }
	1090	*
	1091	* Locate the requested directory entry.
	1092	*/
	1093	static
	1094	int
	1095	hammer_vop_nresolve(struct vop_nresolve_args *ap)
	1096	{
	1097	struct hammer_transaction trans;
	1098	struct namecache *ncp;
	1099	hammer_mount_t hmp;
	1100	hammer_inode_t dip;
	1101	hammer_inode_t ip;
	1102	hammer_tid_t asof;
	1103	struct hammer_cursor cursor;
	1104	struct vnode *vp;
	1105	int64_t namekey;
	1106	int error;
	1107	int i;
	1108	int nlen;
	1109	int flags;
	1110	int ispfs;
	1111	int64_t obj_id;
	1112	u_int32_t localization;
	1113	u_int32_t max_iterations;
	1114
	1115	/*
	1116	* Misc initialization, plus handle as-of name extensions. Look for
	1117	* the '@@' extension. Note that as-of files and directories cannot
	1118	* be modified.
	1119	*/
	1120	dip = VTOI(ap->a_dvp);
	1121	ncp = ap->a_nch->ncp;
	1122	asof = dip->obj_asof;
	1123	localization = dip->obj_localization; /* for code consistency */
	1124	nlen = ncp->nc_nlen;
	1125	flags = dip->flags & HAMMER_INODE_RO;
	1126	ispfs = 0;
	1127	hmp = dip->hmp;
	1128
	1129	lwkt_gettoken(&hmp->fs_token);
	1130	hammer_simple_transaction(&trans, hmp);
	1131	++hammer_stats_file_iopsr;
	1132
	1133	for (i = 0; i < nlen; ++i) {
	1134	if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
	1135	error = hammer_str_to_tid(ncp->nc_name + i + 2,
	1136	&ispfs, &asof, &localization);
	1137	if (error != 0) {
	1138	i = nlen;
	1139	break;
	1140	}
	1141	if (asof != HAMMER_MAX_TID)
	1142	flags \|= HAMMER_INODE_RO;
	1143	break;
	1144	}
	1145	}
	1146	nlen = i;
	1147
	1148	/*
	1149	* If this is a PFS softlink we dive into the PFS
	1150	*/
	1151	if (ispfs && nlen == 0) {
	1152	ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
	1153	asof, localization,
	1154	flags, &error);
	1155	if (error == 0) {
	1156	error = hammer_get_vnode(ip, &vp);
	1157	hammer_rel_inode(ip, 0);
	1158	} else {
	1159	vp = NULL;
	1160	}
	1161	if (error == 0) {
	1162	vn_unlock(vp);
	1163	cache_setvp(ap->a_nch, vp);
	1164	vrele(vp);
	1165	}
	1166	goto done;
	1167	}
	1168
	1169	/*
	1170	* If there is no path component the time extension is relative to dip.
	1171	* e.g. "fubar/@@<snapshot>"
	1172	*
	1173	* "." is handled by the kernel, but ".@@<snapshot>" is not.
	1174	* e.g. "fubar/.@@<snapshot>"
	1175	*
	1176	* ".." is handled by the kernel. We do not currently handle
	1177	* "..@<snapshot>".
	1178	*/
	1179	if (nlen == 0 \|\| (nlen == 1 && ncp->nc_name[0] == '.')) {
	1180	ip = hammer_get_inode(&trans, dip, dip->obj_id,
	1181	asof, dip->obj_localization,
	1182	flags, &error);
	1183	if (error == 0) {
	1184	error = hammer_get_vnode(ip, &vp);
	1185	hammer_rel_inode(ip, 0);
	1186	} else {
	1187	vp = NULL;
	1188	}
	1189	if (error == 0) {
	1190	vn_unlock(vp);
	1191	cache_setvp(ap->a_nch, vp);
	1192	vrele(vp);
	1193	}
	1194	goto done;
	1195	}
	1196
	1197	/*
	1198	* Calculate the namekey and setup the key range for the scan. This
	1199	* works kinda like a chained hash table where the lower 32 bits
	1200	* of the namekey synthesize the chain.
	1201	*
	1202	* The key range is inclusive of both key_beg and key_end.
	1203	*/
	1204	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
	1205	&max_iterations);
	1206
	1207	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	1208	cursor.key_beg.localization = dip->obj_localization +
	1209	hammer_dir_localization(dip);
	1210	cursor.key_beg.obj_id = dip->obj_id;
	1211	cursor.key_beg.key = namekey;
	1212	cursor.key_beg.create_tid = 0;
	1213	cursor.key_beg.delete_tid = 0;
	1214	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	1215	cursor.key_beg.obj_type = 0;
	1216
	1217	cursor.key_end = cursor.key_beg;
	1218	cursor.key_end.key += max_iterations;
	1219	cursor.asof = asof;
	1220	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	1221
	1222	/*
	1223	* Scan all matching records (the chain), locate the one matching
	1224	* the requested path component.
	1225	*
	1226	* The hammer_ip_*() functions merge in-memory records with on-disk
	1227	* records for the purposes of the search.
	1228	*/
	1229	obj_id = 0;
	1230	localization = HAMMER_DEF_LOCALIZATION;
	1231
	1232	if (error == 0) {
	1233	error = hammer_ip_first(&cursor);
	1234	while (error == 0) {
	1235	error = hammer_ip_resolve_data(&cursor);
	1236	if (error)
	1237	break;
	1238	if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
	1239	bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	1240	obj_id = cursor.data->entry.obj_id;
	1241	localization = cursor.data->entry.localization;
	1242	break;
	1243	}
	1244	error = hammer_ip_next(&cursor);
	1245	}
	1246	}
	1247	hammer_done_cursor(&cursor);
	1248
	1249	/*
	1250	* Lookup the obj_id. This should always succeed. If it does not
	1251	* the filesystem may be damaged and we return a dummy inode.
	1252	*/
	1253	if (error == 0) {
	1254	ip = hammer_get_inode(&trans, dip, obj_id,
	1255	asof, localization,
	1256	flags, &error);
	1257	if (error == ENOENT) {
	1258	kprintf("HAMMER: WARNING: Missing "
	1259	"inode for dirent \"%s\"\n"
	1260	"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
	1261	ncp->nc_name,
	1262	(long long)obj_id, (long long)asof,
	1263	localization);
	1264	error = 0;
	1265	ip = hammer_get_dummy_inode(&trans, dip, obj_id,
	1266	asof, localization,
	1267	flags, &error);
	1268	}
	1269	if (error == 0) {
	1270	error = hammer_get_vnode(ip, &vp);
	1271	hammer_rel_inode(ip, 0);
	1272	} else {
	1273	vp = NULL;
	1274	}
	1275	if (error == 0) {
	1276	vn_unlock(vp);
	1277	cache_setvp(ap->a_nch, vp);
	1278	vrele(vp);
	1279	}
	1280	} else if (error == ENOENT) {
	1281	cache_setvp(ap->a_nch, NULL);
	1282	}
	1283	done:
	1284	hammer_done_transaction(&trans);
	1285	lwkt_reltoken(&hmp->fs_token);
	1286	return (error);
	1287	}
	1288
	1289	/*
	1290	* hammer_vop_nlookupdotdot { dvp, vpp, cred }
	1291	*
	1292	* Locate the parent directory of a directory vnode.
	1293	*
	1294	* dvp is referenced but not locked. *vpp must be returned referenced and
	1295	* locked. A parent_obj_id of 0 does not necessarily indicate that we are
	1296	* at the root, instead it could indicate that the directory we were in was
	1297	* removed.
	1298	*
	1299	* NOTE: as-of sequences are not linked into the directory structure. If
	1300	* we are at the root with a different asof then the mount point, reload
	1301	* the same directory with the mount point's asof. I'm not sure what this
	1302	* will do to NFS. We encode ASOF stamps in NFS file handles so it might not
	1303	* get confused, but it hasn't been tested.
	1304	*/
	1305	static
	1306	int
	1307	hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
	1308	{
	1309	struct hammer_transaction trans;
	1310	struct hammer_inode *dip;
	1311	struct hammer_inode *ip;
	1312	hammer_mount_t hmp;
	1313	int64_t parent_obj_id;
	1314	u_int32_t parent_obj_localization;
	1315	hammer_tid_t asof;
	1316	int error;
	1317
	1318	dip = VTOI(ap->a_dvp);
	1319	asof = dip->obj_asof;
	1320	hmp = dip->hmp;
	1321
	1322	/*
	1323	* Whos are parent? This could be the root of a pseudo-filesystem
	1324	* whos parent is in another localization domain.
	1325	*/
	1326	lwkt_gettoken(&hmp->fs_token);
	1327	parent_obj_id = dip->ino_data.parent_obj_id;
	1328	if (dip->obj_id == HAMMER_OBJID_ROOT)
	1329	parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	1330	else
	1331	parent_obj_localization = dip->obj_localization;
	1332
	1333	if (parent_obj_id == 0) {
	1334	if (dip->obj_id == HAMMER_OBJID_ROOT &&
	1335	asof != hmp->asof) {
	1336	parent_obj_id = dip->obj_id;
	1337	asof = hmp->asof;
	1338	*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
	1339	ksnprintf(*ap->a_fakename, 19, "0x%016llx",
	1340	(long long)dip->obj_asof);
	1341	} else {
	1342	*ap->a_vpp = NULL;
	1343	lwkt_reltoken(&hmp->fs_token);
	1344	return ENOENT;
	1345	}
	1346	}
	1347
	1348	hammer_simple_transaction(&trans, hmp);
	1349	++hammer_stats_file_iopsr;
	1350
	1351	ip = hammer_get_inode(&trans, dip, parent_obj_id,
	1352	asof, parent_obj_localization,
	1353	dip->flags, &error);
	1354	if (ip) {
	1355	error = hammer_get_vnode(ip, ap->a_vpp);
	1356	hammer_rel_inode(ip, 0);
	1357	} else {
	1358	*ap->a_vpp = NULL;
	1359	}
	1360	hammer_done_transaction(&trans);
	1361	lwkt_reltoken(&hmp->fs_token);
	1362	return (error);
	1363	}
	1364
	1365	/*
	1366	* hammer_vop_nlink { nch, dvp, vp, cred }
	1367	*/
	1368	static
	1369	int
	1370	hammer_vop_nlink(struct vop_nlink_args *ap)
	1371	{
	1372	struct hammer_transaction trans;
	1373	struct hammer_inode *dip;
	1374	struct hammer_inode *ip;
	1375	struct nchandle *nch;
	1376	hammer_mount_t hmp;
	1377	int error;
	1378
	1379	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
	1380	return(EXDEV);
	1381
	1382	nch = ap->a_nch;
	1383	dip = VTOI(ap->a_dvp);
	1384	ip = VTOI(ap->a_vp);
	1385	hmp = dip->hmp;
	1386
	1387	if (dip->obj_localization != ip->obj_localization)
	1388	return(EXDEV);
	1389
	1390	if (dip->flags & HAMMER_INODE_RO)
	1391	return (EROFS);
	1392	if (ip->flags & HAMMER_INODE_RO)
	1393	return (EROFS);
	1394	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1395	return (error);
	1396
	1397	/*
	1398	* Create a transaction to cover the operations we perform.
	1399	*/
	1400	lwkt_gettoken(&hmp->fs_token);
	1401	hammer_start_transaction(&trans, hmp);
	1402	++hammer_stats_file_iopsw;
	1403
	1404	/*
	1405	* Add the filesystem object to the directory. Note that neither
	1406	* dip nor ip are referenced or locked, but their vnodes are
	1407	* referenced. This function will bump the inode's link count.
	1408	*/
	1409	error = hammer_ip_add_directory(&trans, dip,
	1410	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1411	ip);
	1412
	1413	/*
	1414	* Finish up.
	1415	*/
	1416	if (error == 0) {
	1417	cache_setunresolved(nch);
	1418	cache_setvp(nch, ap->a_vp);
	1419	}
	1420	hammer_done_transaction(&trans);
	1421	hammer_knote(ap->a_vp, NOTE_LINK);
	1422	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1423	lwkt_reltoken(&hmp->fs_token);
	1424	return (error);
	1425	}
	1426
	1427	/*
	1428	* hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
	1429	*
	1430	* The operating system has already ensured that the directory entry
	1431	* does not exist and done all appropriate namespace locking.
	1432	*/
	1433	static
	1434	int
	1435	hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
	1436	{
	1437	struct hammer_transaction trans;
	1438	struct hammer_inode *dip;
	1439	struct hammer_inode *nip;
	1440	struct nchandle *nch;
	1441	hammer_mount_t hmp;
	1442	int error;
	1443
	1444	nch = ap->a_nch;
	1445	dip = VTOI(ap->a_dvp);
	1446	hmp = dip->hmp;
	1447
	1448	if (dip->flags & HAMMER_INODE_RO)
	1449	return (EROFS);
	1450	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1451	return (error);
	1452
	1453	/*
	1454	* Create a transaction to cover the operations we perform.
	1455	*/
	1456	lwkt_gettoken(&hmp->fs_token);
	1457	hammer_start_transaction(&trans, hmp);
	1458	++hammer_stats_file_iopsw;
	1459
	1460	/*
	1461	* Create a new filesystem object of the requested type. The
	1462	* returned inode will be referenced but not locked.
	1463	*/
	1464	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	1465	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	1466	NULL, &nip);
	1467	if (error) {
	1468	hkprintf("hammer_mkdir error %d\n", error);
	1469	hammer_done_transaction(&trans);
	1470	*ap->a_vpp = NULL;
	1471	lwkt_reltoken(&hmp->fs_token);
	1472	return (error);
	1473	}
	1474	/*
	1475	* Add the new filesystem object to the directory. This will also
	1476	* bump the inode's link count.
	1477	*/
	1478	error = hammer_ip_add_directory(&trans, dip,
	1479	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1480	nip);
	1481	if (error)
	1482	hkprintf("hammer_mkdir (add) error %d\n", error);
	1483
	1484	/*
	1485	* Finish up.
	1486	*/
	1487	if (error) {
	1488	hammer_rel_inode(nip, 0);
	1489	*ap->a_vpp = NULL;
	1490	} else {
	1491	error = hammer_get_vnode(nip, ap->a_vpp);
	1492	hammer_rel_inode(nip, 0);
	1493	if (error == 0) {
	1494	cache_setunresolved(ap->a_nch);
	1495	cache_setvp(ap->a_nch, *ap->a_vpp);
	1496	}
	1497	}
	1498	hammer_done_transaction(&trans);
	1499	if (error == 0)
	1500	hammer_knote(ap->a_dvp, NOTE_WRITE \| NOTE_LINK);
	1501	lwkt_reltoken(&hmp->fs_token);
	1502	return (error);
	1503	}
	1504
	1505	/*
	1506	* hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
	1507	*
	1508	* The operating system has already ensured that the directory entry
	1509	* does not exist and done all appropriate namespace locking.
	1510	*/
	1511	static
	1512	int
	1513	hammer_vop_nmknod(struct vop_nmknod_args *ap)
	1514	{
	1515	struct hammer_transaction trans;
	1516	struct hammer_inode *dip;
	1517	struct hammer_inode *nip;
	1518	struct nchandle *nch;
	1519	hammer_mount_t hmp;
	1520	int error;
	1521
	1522	nch = ap->a_nch;
	1523	dip = VTOI(ap->a_dvp);
	1524	hmp = dip->hmp;
	1525
	1526	if (dip->flags & HAMMER_INODE_RO)
	1527	return (EROFS);
	1528	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1529	return (error);
	1530
	1531	/*
	1532	* Create a transaction to cover the operations we perform.
	1533	*/
	1534	lwkt_gettoken(&hmp->fs_token);
	1535	hammer_start_transaction(&trans, hmp);
	1536	++hammer_stats_file_iopsw;
	1537
	1538	/*
	1539	* Create a new filesystem object of the requested type. The
	1540	* returned inode will be referenced but not locked.
	1541	*
	1542	* If mknod specifies a directory a pseudo-fs is created.
	1543	*/
	1544	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	1545	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	1546	NULL, &nip);
	1547	if (error) {
	1548	hammer_done_transaction(&trans);
	1549	*ap->a_vpp = NULL;
	1550	lwkt_reltoken(&hmp->fs_token);
	1551	return (error);
	1552	}
	1553
	1554	/*
	1555	* Add the new filesystem object to the directory. This will also
	1556	* bump the inode's link count.
	1557	*/
	1558	error = hammer_ip_add_directory(&trans, dip,
	1559	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1560	nip);
	1561
	1562	/*
	1563	* Finish up.
	1564	*/
	1565	if (error) {
	1566	hammer_rel_inode(nip, 0);
	1567	*ap->a_vpp = NULL;
	1568	} else {
	1569	error = hammer_get_vnode(nip, ap->a_vpp);
	1570	hammer_rel_inode(nip, 0);
	1571	if (error == 0) {
	1572	cache_setunresolved(ap->a_nch);
	1573	cache_setvp(ap->a_nch, *ap->a_vpp);
	1574	}
	1575	}
	1576	hammer_done_transaction(&trans);
	1577	if (error == 0)
	1578	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1579	lwkt_reltoken(&hmp->fs_token);
	1580	return (error);
	1581	}
	1582
	1583	/*
	1584	* hammer_vop_open { vp, mode, cred, fp }
	1585	*
	1586	* MPSAFE (does not require fs_token)
	1587	*/
	1588	static
	1589	int
	1590	hammer_vop_open(struct vop_open_args *ap)
	1591	{
	1592	hammer_inode_t ip;
	1593
	1594	++hammer_stats_file_iopsr;
	1595	ip = VTOI(ap->a_vp);
	1596
	1597	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
	1598	return (EROFS);
	1599	return(vop_stdopen(ap));
	1600	}
	1601
	1602	/*
	1603	* hammer_vop_print { vp }
	1604	*/
	1605	static
	1606	int
	1607	hammer_vop_print(struct vop_print_args *ap)
	1608	{
	1609	return EOPNOTSUPP;
	1610	}
	1611
	1612	/*
	1613	* hammer_vop_readdir { vp, uio, cred, eofflag, ncookies, off_t **cookies }
	1614	*/
	1615	static
	1616	int
	1617	hammer_vop_readdir(struct vop_readdir_args *ap)
	1618	{
	1619	struct hammer_transaction trans;
	1620	struct hammer_cursor cursor;
	1621	struct hammer_inode *ip;
	1622	hammer_mount_t hmp;
	1623	struct uio *uio;
	1624	hammer_base_elm_t base;
	1625	int error;
	1626	int cookie_index;
	1627	int ncookies;
	1628	off_t *cookies;
	1629	off_t saveoff;
	1630	int r;
	1631	int dtype;
	1632
	1633	++hammer_stats_file_iopsr;
	1634	ip = VTOI(ap->a_vp);
	1635	uio = ap->a_uio;
	1636	saveoff = uio->uio_offset;
	1637	hmp = ip->hmp;
	1638
	1639	if (ap->a_ncookies) {
	1640	ncookies = uio->uio_resid / 16 + 1;
	1641	if (ncookies > 1024)
	1642	ncookies = 1024;
	1643	cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
	1644	cookie_index = 0;
	1645	} else {
	1646	ncookies = -1;
	1647	cookies = NULL;
	1648	cookie_index = 0;
	1649	}
	1650
	1651	lwkt_gettoken(&hmp->fs_token);
	1652	hammer_simple_transaction(&trans, hmp);
	1653
	1654	/*
	1655	* Handle artificial entries
	1656	*
	1657	* It should be noted that the minimum value for a directory
	1658	* hash key on-media is 0x0000000100000000, so we can use anything
	1659	* less then that to represent our 'special' key space.
	1660	*/
	1661	error = 0;
	1662	if (saveoff == 0) {
	1663	r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
	1664	if (r)
	1665	goto done;
	1666	if (cookies)
	1667	cookies[cookie_index] = saveoff;
	1668	++saveoff;
	1669	++cookie_index;
	1670	if (cookie_index == ncookies)
	1671	goto done;
	1672	}
	1673	if (saveoff == 1) {
	1674	if (ip->ino_data.parent_obj_id) {
	1675	r = vop_write_dirent(&error, uio,
	1676	ip->ino_data.parent_obj_id,
	1677	DT_DIR, 2, "..");
	1678	} else {
	1679	r = vop_write_dirent(&error, uio,
	1680	ip->obj_id, DT_DIR, 2, "..");
	1681	}
	1682	if (r)
	1683	goto done;
	1684	if (cookies)
	1685	cookies[cookie_index] = saveoff;
	1686	++saveoff;
	1687	++cookie_index;
	1688	if (cookie_index == ncookies)
	1689	goto done;
	1690	}
	1691
	1692	/*
	1693	* Key range (begin and end inclusive) to scan. Directory keys
	1694	* directly translate to a 64 bit 'seek' position.
	1695	*/
	1696	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	1697	cursor.key_beg.localization = ip->obj_localization +
	1698	hammer_dir_localization(ip);
	1699	cursor.key_beg.obj_id = ip->obj_id;
	1700	cursor.key_beg.create_tid = 0;
	1701	cursor.key_beg.delete_tid = 0;
	1702	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	1703	cursor.key_beg.obj_type = 0;
	1704	cursor.key_beg.key = saveoff;
	1705
	1706	cursor.key_end = cursor.key_beg;
	1707	cursor.key_end.key = HAMMER_MAX_KEY;
	1708	cursor.asof = ip->obj_asof;
	1709	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	1710
	1711	error = hammer_ip_first(&cursor);
	1712
	1713	while (error == 0) {
	1714	error = hammer_ip_resolve_data(&cursor);
	1715	if (error)
	1716	break;
	1717	base = &cursor.leaf->base;
	1718	saveoff = base->key;
	1719	KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
	1720
	1721	if (base->obj_id != ip->obj_id)
	1722	panic("readdir: bad record at %p", cursor.node);
	1723
	1724	/*
	1725	* Convert pseudo-filesystems into softlinks
	1726	*/
	1727	dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
	1728	r = vop_write_dirent(
	1729	&error, uio, cursor.data->entry.obj_id,
	1730	dtype,
	1731	cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
	1732	(void *)cursor.data->entry.name);
	1733	if (r)
	1734	break;
	1735	++saveoff;
	1736	if (cookies)
	1737	cookies[cookie_index] = base->key;
	1738	++cookie_index;
	1739	if (cookie_index == ncookies)
	1740	break;
	1741	error = hammer_ip_next(&cursor);
	1742	}
	1743	hammer_done_cursor(&cursor);
	1744
	1745	done:
	1746	hammer_done_transaction(&trans);
	1747
	1748	if (ap->a_eofflag)
	1749	*ap->a_eofflag = (error == ENOENT);
	1750	uio->uio_offset = saveoff;
	1751	if (error && cookie_index == 0) {
	1752	if (error == ENOENT)
	1753	error = 0;
	1754	if (cookies) {
	1755	kfree(cookies, M_TEMP);
	1756	*ap->a_ncookies = 0;
	1757	*ap->a_cookies = NULL;
	1758	}
	1759	} else {
	1760	if (error == ENOENT)
	1761	error = 0;
	1762	if (cookies) {
	1763	*ap->a_ncookies = cookie_index;
	1764	*ap->a_cookies = cookies;
	1765	}
	1766	}
	1767	lwkt_reltoken(&hmp->fs_token);
	1768	return(error);
	1769	}
	1770
	1771	/*
	1772	* hammer_vop_readlink { vp, uio, cred }
	1773	*/
	1774	static
	1775	int
	1776	hammer_vop_readlink(struct vop_readlink_args *ap)
	1777	{
	1778	struct hammer_transaction trans;
	1779	struct hammer_cursor cursor;
	1780	struct hammer_inode *ip;
	1781	hammer_mount_t hmp;
	1782	char buf[32];
	1783	u_int32_t localization;
	1784	hammer_pseudofs_inmem_t pfsm;
	1785	int error;
	1786
	1787	ip = VTOI(ap->a_vp);
	1788	hmp = ip->hmp;
	1789
	1790	lwkt_gettoken(&hmp->fs_token);
	1791
	1792	/*
	1793	* Shortcut if the symlink data was stuffed into ino_data.
	1794	*
	1795	* Also expand special "@@PFS%05d" softlinks (expansion only
	1796	* occurs for non-historical (current) accesses made from the
	1797	* primary filesystem).
	1798	*/
	1799	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
	1800	char *ptr;
	1801	int bytes;
	1802
	1803	ptr = ip->ino_data.ext.symlink;
	1804	bytes = (int)ip->ino_data.size;
	1805	if (bytes == 10 &&
	1806	ip->obj_asof == HAMMER_MAX_TID &&
	1807	ip->obj_localization == 0 &&
	1808	strncmp(ptr, "@@PFS", 5) == 0) {
	1809	hammer_simple_transaction(&trans, hmp);
	1810	bcopy(ptr + 5, buf, 5);
	1811	buf[5] = 0;
	1812	localization = strtoul(buf, NULL, 10) << 16;
	1813	pfsm = hammer_load_pseudofs(&trans, localization,
	1814	&error);
	1815	if (error == 0) {
	1816	if (pfsm->pfsd.mirror_flags &
	1817	HAMMER_PFSD_SLAVE) {
	1818	/* vap->va_size == 26 */
	1819	ksnprintf(buf, sizeof(buf),
	1820	"@@0x%016llx:%05d",
	1821	(long long)pfsm->pfsd.sync_end_tid,
	1822	localization >> 16);
	1823	} else {
	1824	/* vap->va_size == 10 */
	1825	ksnprintf(buf, sizeof(buf),
	1826	"@@-1:%05d",
	1827	localization >> 16);
	1828	#if 0
	1829	ksnprintf(buf, sizeof(buf),
	1830	"@@0x%016llx:%05d",
	1831	(long long)HAMMER_MAX_TID,
	1832	localization >> 16);
	1833	#endif
	1834	}
	1835	ptr = buf;
	1836	bytes = strlen(buf);
	1837	}
	1838	if (pfsm)
	1839	hammer_rel_pseudofs(hmp, pfsm);
	1840	hammer_done_transaction(&trans);
	1841	}
	1842	error = uiomove(ptr, bytes, ap->a_uio);
	1843	lwkt_reltoken(&hmp->fs_token);
	1844	return(error);
	1845	}
	1846
	1847	/*
	1848	* Long version
	1849	*/
	1850	hammer_simple_transaction(&trans, hmp);
	1851	++hammer_stats_file_iopsr;
	1852	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	1853
	1854	/*
	1855	* Key range (begin and end inclusive) to scan. Directory keys
	1856	* directly translate to a 64 bit 'seek' position.
	1857	*/
	1858	cursor.key_beg.localization = ip->obj_localization +
	1859	HAMMER_LOCALIZE_MISC;
	1860	cursor.key_beg.obj_id = ip->obj_id;
	1861	cursor.key_beg.create_tid = 0;
	1862	cursor.key_beg.delete_tid = 0;
	1863	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	1864	cursor.key_beg.obj_type = 0;
	1865	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	1866	cursor.asof = ip->obj_asof;
	1867	cursor.flags \|= HAMMER_CURSOR_ASOF;
	1868
	1869	error = hammer_ip_lookup(&cursor);
	1870	if (error == 0) {
	1871	error = hammer_ip_resolve_data(&cursor);
	1872	if (error == 0) {
	1873	KKASSERT(cursor.leaf->data_len >=
	1874	HAMMER_SYMLINK_NAME_OFF);
	1875	error = uiomove(cursor.data->symlink.name,
	1876	cursor.leaf->data_len -
	1877	HAMMER_SYMLINK_NAME_OFF,
	1878	ap->a_uio);
	1879	}
	1880	}
	1881	hammer_done_cursor(&cursor);
	1882	hammer_done_transaction(&trans);
	1883	lwkt_reltoken(&hmp->fs_token);
	1884	return(error);
	1885	}
	1886
	1887	/*
	1888	* hammer_vop_nremove { nch, dvp, cred }
	1889	*/
	1890	static
	1891	int
	1892	hammer_vop_nremove(struct vop_nremove_args *ap)
	1893	{
	1894	struct hammer_transaction trans;
	1895	struct hammer_inode *dip;
	1896	hammer_mount_t hmp;
	1897	int error;
	1898
	1899	dip = VTOI(ap->a_dvp);
	1900	hmp = dip->hmp;
	1901
	1902	if (hammer_nohistory(dip) == 0 &&
	1903	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	1904	return (error);
	1905	}
	1906
	1907	lwkt_gettoken(&hmp->fs_token);
	1908	hammer_start_transaction(&trans, hmp);
	1909	++hammer_stats_file_iopsw;
	1910	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	1911	hammer_done_transaction(&trans);
	1912	if (error == 0)
	1913	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1914	lwkt_reltoken(&hmp->fs_token);
	1915	return (error);
	1916	}
	1917
	1918	/*
	1919	* hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
	1920	*/
	1921	static
	1922	int
	1923	hammer_vop_nrename(struct vop_nrename_args *ap)
	1924	{
	1925	struct hammer_transaction trans;
	1926	struct namecache *fncp;
	1927	struct namecache *tncp;
	1928	struct hammer_inode *fdip;
	1929	struct hammer_inode *tdip;
	1930	struct hammer_inode *ip;
	1931	hammer_mount_t hmp;
	1932	struct hammer_cursor cursor;
	1933	int64_t namekey;
	1934	u_int32_t max_iterations;
	1935	int nlen, error;
	1936
	1937	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
	1938	return(EXDEV);
	1939	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
	1940	return(EXDEV);
	1941
	1942	fdip = VTOI(ap->a_fdvp);
	1943	tdip = VTOI(ap->a_tdvp);
	1944	fncp = ap->a_fnch->ncp;
	1945	tncp = ap->a_tnch->ncp;
	1946	ip = VTOI(fncp->nc_vp);
	1947	KKASSERT(ip != NULL);
	1948
	1949	hmp = ip->hmp;
	1950
	1951	if (fdip->obj_localization != tdip->obj_localization)
	1952	return(EXDEV);
	1953	if (fdip->obj_localization != ip->obj_localization)
	1954	return(EXDEV);
	1955
	1956	if (fdip->flags & HAMMER_INODE_RO)
	1957	return (EROFS);
	1958	if (tdip->flags & HAMMER_INODE_RO)
	1959	return (EROFS);
	1960	if (ip->flags & HAMMER_INODE_RO)
	1961	return (EROFS);
	1962	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1963	return (error);
	1964
	1965	lwkt_gettoken(&hmp->fs_token);
	1966	hammer_start_transaction(&trans, hmp);
	1967	++hammer_stats_file_iopsw;
	1968
	1969	/*
	1970	* Remove tncp from the target directory and then link ip as
	1971	* tncp. XXX pass trans to dounlink
	1972	*
	1973	* Force the inode sync-time to match the transaction so it is
	1974	* in-sync with the creation of the target directory entry.
	1975	*/
	1976	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
	1977	ap->a_cred, 0, -1);
	1978	if (error == 0 \|\| error == ENOENT) {
	1979	error = hammer_ip_add_directory(&trans, tdip,
	1980	tncp->nc_name, tncp->nc_nlen,
	1981	ip);
	1982	if (error == 0) {
	1983	ip->ino_data.parent_obj_id = tdip->obj_id;
	1984	ip->ino_data.ctime = trans.time;
	1985	hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
	1986	}
	1987	}
	1988	if (error)
	1989	goto failed; /* XXX */
	1990
	1991	/*
	1992	* Locate the record in the originating directory and remove it.
	1993	*
	1994	* Calculate the namekey and setup the key range for the scan. This
	1995	* works kinda like a chained hash table where the lower 32 bits
	1996	* of the namekey synthesize the chain.
	1997	*
	1998	* The key range is inclusive of both key_beg and key_end.
	1999	*/
	2000	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
	2001	&max_iterations);
	2002	retry:
	2003	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	2004	cursor.key_beg.localization = fdip->obj_localization +
	2005	hammer_dir_localization(fdip);
	2006	cursor.key_beg.obj_id = fdip->obj_id;
	2007	cursor.key_beg.key = namekey;
	2008	cursor.key_beg.create_tid = 0;
	2009	cursor.key_beg.delete_tid = 0;
	2010	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	2011	cursor.key_beg.obj_type = 0;
	2012
	2013	cursor.key_end = cursor.key_beg;
	2014	cursor.key_end.key += max_iterations;
	2015	cursor.asof = fdip->obj_asof;
	2016	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	2017
	2018	/*
	2019	* Scan all matching records (the chain), locate the one matching
	2020	* the requested path component.
	2021	*
	2022	* The hammer_ip_*() functions merge in-memory records with on-disk
	2023	* records for the purposes of the search.
	2024	*/
	2025	error = hammer_ip_first(&cursor);
	2026	while (error == 0) {
	2027	if (hammer_ip_resolve_data(&cursor) != 0)
	2028	break;
	2029	nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
	2030	KKASSERT(nlen > 0);
	2031	if (fncp->nc_nlen == nlen &&
	2032	bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	2033	break;
	2034	}
	2035	error = hammer_ip_next(&cursor);
	2036	}
	2037
	2038	/*
	2039	* If all is ok we have to get the inode so we can adjust nlinks.
	2040	*
	2041	* WARNING: hammer_ip_del_directory() may have to terminate the
	2042	* cursor to avoid a recursion. It's ok to call hammer_done_cursor()
	2043	* twice.
	2044	*/
	2045	if (error == 0)
	2046	error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
	2047
	2048	/*
	2049	* XXX A deadlock here will break rename's atomicy for the purposes
	2050	* of crash recovery.
	2051	*/
	2052	if (error == EDEADLK) {
	2053	hammer_done_cursor(&cursor);
	2054	goto retry;
	2055	}
	2056
	2057	/*
	2058	* Cleanup and tell the kernel that the rename succeeded.
	2059	*
	2060	* NOTE: ip->vp, if non-NULL, cannot be directly referenced
	2061	* without formally acquiring the vp since the vp might
	2062	* have zero refs on it, or in the middle of a reclaim,
	2063	* etc.
	2064	*/
	2065	hammer_done_cursor(&cursor);
	2066	if (error == 0) {
	2067	cache_rename(ap->a_fnch, ap->a_tnch);
	2068	hammer_knote(ap->a_fdvp, NOTE_WRITE);
	2069	hammer_knote(ap->a_tdvp, NOTE_WRITE);
	2070	while (ip->vp) {
	2071	struct vnode *vp;
	2072
	2073	error = hammer_get_vnode(ip, &vp);
	2074	if (error == 0 && vp) {
	2075	vn_unlock(vp);
	2076	hammer_knote(ip->vp, NOTE_RENAME);
	2077	vrele(vp);
	2078	break;
	2079	}
	2080	kprintf("Debug: HAMMER ip/vp race2 avoided\n");
	2081	}
	2082	}
	2083
	2084	failed:
	2085	hammer_done_transaction(&trans);
	2086	lwkt_reltoken(&hmp->fs_token);
	2087	return (error);
	2088	}
	2089
	2090	/*
	2091	* hammer_vop_nrmdir { nch, dvp, cred }
	2092	*/
	2093	static
	2094	int
	2095	hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
	2096	{
	2097	struct hammer_transaction trans;
	2098	struct hammer_inode *dip;
	2099	hammer_mount_t hmp;
	2100	int error;
	2101
	2102	dip = VTOI(ap->a_dvp);
	2103	hmp = dip->hmp;
	2104
	2105	if (hammer_nohistory(dip) == 0 &&
	2106	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	2107	return (error);
	2108	}
	2109
	2110	lwkt_gettoken(&hmp->fs_token);
	2111	hammer_start_transaction(&trans, hmp);
	2112	++hammer_stats_file_iopsw;
	2113	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	2114	hammer_done_transaction(&trans);
	2115	if (error == 0)
	2116	hammer_knote(ap->a_dvp, NOTE_WRITE \| NOTE_LINK);
	2117	lwkt_reltoken(&hmp->fs_token);
	2118	return (error);
	2119	}
	2120
	2121	/*
	2122	* hammer_vop_markatime { vp, cred }
	2123	*/
	2124	static
	2125	int
	2126	hammer_vop_markatime(struct vop_markatime_args *ap)
	2127	{
	2128	struct hammer_transaction trans;
	2129	struct hammer_inode *ip;
	2130	hammer_mount_t hmp;
	2131
	2132	ip = VTOI(ap->a_vp);
	2133	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
	2134	return (EROFS);
	2135	if (ip->flags & HAMMER_INODE_RO)
	2136	return (EROFS);
	2137	hmp = ip->hmp;
	2138	if (hmp->mp->mnt_flag & MNT_NOATIME)
	2139	return (0);
	2140	lwkt_gettoken(&hmp->fs_token);
	2141	hammer_start_transaction(&trans, hmp);
	2142	++hammer_stats_file_iopsw;
	2143
	2144	ip->ino_data.atime = trans.time;
	2145	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	2146	hammer_done_transaction(&trans);
	2147	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	2148	lwkt_reltoken(&hmp->fs_token);
	2149	return (0);
	2150	}
	2151
	2152	/*
	2153	* hammer_vop_setattr { vp, vap, cred }
	2154	*/
	2155	static
	2156	int
	2157	hammer_vop_setattr(struct vop_setattr_args *ap)
	2158	{
	2159	struct hammer_transaction trans;
	2160	struct hammer_inode *ip;
	2161	struct vattr *vap;
	2162	hammer_mount_t hmp;
	2163	int modflags;
	2164	int error;
	2165	int truncating;
	2166	int blksize;
	2167	int kflags;
	2168	#if 0
	2169	int64_t aligned_size;
	2170	#endif
	2171	u_int32_t flags;
	2172
	2173	vap = ap->a_vap;
	2174	ip = ap->a_vp->v_data;
	2175	modflags = 0;
	2176	kflags = 0;
	2177	hmp = ip->hmp;
	2178
	2179	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
	2180	return(EROFS);
	2181	if (ip->flags & HAMMER_INODE_RO)
	2182	return (EROFS);
	2183	if (hammer_nohistory(ip) == 0 &&
	2184	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	2185	return (error);
	2186	}
	2187
	2188	lwkt_gettoken(&hmp->fs_token);
	2189	hammer_start_transaction(&trans, hmp);
	2190	++hammer_stats_file_iopsw;
	2191	error = 0;
	2192
	2193	if (vap->va_flags != VNOVAL) {
	2194	flags = ip->ino_data.uflags;
	2195	error = vop_helper_setattr_flags(&flags, vap->va_flags,
	2196	hammer_to_unix_xid(&ip->ino_data.uid),
	2197	ap->a_cred);
	2198	if (error == 0) {
	2199	if (ip->ino_data.uflags != flags) {
	2200	ip->ino_data.uflags = flags;
	2201	ip->ino_data.ctime = trans.time;
	2202	modflags \|= HAMMER_INODE_DDIRTY;
	2203	kflags \|= NOTE_ATTRIB;
	2204	}
	2205	if (ip->ino_data.uflags & (IMMUTABLE \| APPEND)) {
	2206	error = 0;
	2207	goto done;
	2208	}
	2209	}
	2210	goto done;
	2211	}
	2212	if (ip->ino_data.uflags & (IMMUTABLE \| APPEND)) {
	2213	error = EPERM;
	2214	goto done;
	2215	}
	2216	if (vap->va_uid != (uid_t)VNOVAL \|\| vap->va_gid != (gid_t)VNOVAL) {
	2217	mode_t cur_mode = ip->ino_data.mode;
	2218	uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	2219	gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	2220	uuid_t uuid_uid;
	2221	uuid_t uuid_gid;
	2222
	2223	error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
	2224	ap->a_cred,
	2225	&cur_uid, &cur_gid, &cur_mode);
	2226	if (error == 0) {
	2227	hammer_guid_to_uuid(&uuid_uid, cur_uid);
	2228	hammer_guid_to_uuid(&uuid_gid, cur_gid);
	2229	if (bcmp(&uuid_uid, &ip->ino_data.uid,
	2230	sizeof(uuid_uid)) \|\|
	2231	bcmp(&uuid_gid, &ip->ino_data.gid,
	2232	sizeof(uuid_gid)) \|\|
	2233	ip->ino_data.mode != cur_mode
	2234	) {
	2235	ip->ino_data.uid = uuid_uid;
	2236	ip->ino_data.gid = uuid_gid;
	2237	ip->ino_data.mode = cur_mode;
	2238	ip->ino_data.ctime = trans.time;
	2239	modflags \|= HAMMER_INODE_DDIRTY;
	2240	}
	2241	kflags \|= NOTE_ATTRIB;
	2242	}
	2243	}
	2244	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
	2245	switch(ap->a_vp->v_type) {
	2246	case VREG:
	2247	if (vap->va_size == ip->ino_data.size)
	2248	break;
	2249
	2250	/*
	2251	* Log the operation if in fast-fsync mode or if
	2252	* there are unterminated redo write records present.
	2253	*
	2254	* The second check is needed so the recovery code
	2255	* properly truncates write redos even if nominal
	2256	* REDO operations is turned off due to excessive
	2257	* writes, because the related records might be
	2258	* destroyed and never lay down a TERM_WRITE.
	2259	*/
	2260	if ((ip->flags & HAMMER_INODE_REDO) \|\|
	2261	(ip->flags & HAMMER_INODE_RDIRTY)) {
	2262	error = hammer_generate_redo(&trans, ip,
	2263	vap->va_size,
	2264	HAMMER_REDO_TRUNC,
	2265	NULL, 0);
	2266	}
	2267	blksize = hammer_blocksize(vap->va_size);
	2268
	2269	/*
	2270	* XXX break atomicy, we can deadlock the backend
	2271	* if we do not release the lock. Probably not a
	2272	* big deal here.
	2273	*/
	2274	if (vap->va_size < ip->ino_data.size) {
	2275	nvtruncbuf(ap->a_vp, vap->va_size,
	2276	blksize,
	2277	hammer_blockoff(vap->va_size));
	2278	truncating = 1;
	2279	kflags \|= NOTE_WRITE;
	2280	} else {
	2281	nvextendbuf(ap->a_vp,
	2282	ip->ino_data.size,
	2283	vap->va_size,
	2284	hammer_blocksize(ip->ino_data.size),
	2285	hammer_blocksize(vap->va_size),
	2286	hammer_blockoff(ip->ino_data.size),
	2287	hammer_blockoff(vap->va_size),
	2288	0);
	2289	truncating = 0;
	2290	kflags \|= NOTE_WRITE \| NOTE_EXTEND;
	2291	}
	2292	ip->ino_data.size = vap->va_size;
	2293	ip->ino_data.mtime = trans.time;
	2294	/* XXX safe to use SDIRTY instead of DDIRTY here? */
	2295	modflags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_DDIRTY;
	2296
	2297	/*
	2298	* On-media truncation is cached in the inode until
	2299	* the inode is synchronized. We must immediately
	2300	* handle any frontend records.
	2301	*/
	2302	if (truncating) {
	2303	hammer_ip_frontend_trunc(ip, vap->va_size);
	2304	#ifdef DEBUG_TRUNCATE
	2305	if (HammerTruncIp == NULL)
	2306	HammerTruncIp = ip;
	2307	#endif
	2308	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
	2309	ip->flags \|= HAMMER_INODE_TRUNCATED;
	2310	ip->trunc_off = vap->va_size;
	2311	#ifdef DEBUG_TRUNCATE
	2312	if (ip == HammerTruncIp)
	2313	kprintf("truncate1 %016llx\n",
	2314	(long long)ip->trunc_off);
	2315	#endif
	2316	} else if (ip->trunc_off > vap->va_size) {
	2317	ip->trunc_off = vap->va_size;
	2318	#ifdef DEBUG_TRUNCATE
	2319	if (ip == HammerTruncIp)
	2320	kprintf("truncate2 %016llx\n",
	2321	(long long)ip->trunc_off);
	2322	#endif
	2323	} else {
	2324	#ifdef DEBUG_TRUNCATE
	2325	if (ip == HammerTruncIp)
	2326	kprintf("truncate3 %016llx (ignored)\n",
	2327	(long long)vap->va_size);
	2328	#endif
	2329	}
	2330	}
	2331
	2332	#if 0
	2333	/*
	2334	* When truncating, nvtruncbuf() may have cleaned out
	2335	* a portion of the last block on-disk in the buffer
	2336	* cache. We must clean out any frontend records
	2337	* for blocks beyond the new last block.
	2338	*/
	2339	aligned_size = (vap->va_size + (blksize - 1)) &
	2340	~(int64_t)(blksize - 1);
	2341	if (truncating && vap->va_size < aligned_size) {
	2342	aligned_size -= blksize;
	2343	hammer_ip_frontend_trunc(ip, aligned_size);
	2344	}
	2345	#endif
	2346	break;
	2347	case VDATABASE:
	2348	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
	2349	ip->flags \|= HAMMER_INODE_TRUNCATED;
	2350	ip->trunc_off = vap->va_size;
	2351	} else if (ip->trunc_off > vap->va_size) {
	2352	ip->trunc_off = vap->va_size;
	2353	}
	2354	hammer_ip_frontend_trunc(ip, vap->va_size);
	2355	ip->ino_data.size = vap->va_size;
	2356	ip->ino_data.mtime = trans.time;
	2357	modflags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_DDIRTY;
	2358	kflags \|= NOTE_ATTRIB;
	2359	break;
	2360	default:
	2361	error = EINVAL;
	2362	goto done;
	2363	}
	2364	break;
	2365	}
	2366	if (vap->va_atime.tv_sec != VNOVAL) {
	2367	ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
	2368	modflags \|= HAMMER_INODE_ATIME;
	2369	kflags \|= NOTE_ATTRIB;
	2370	}
	2371	if (vap->va_mtime.tv_sec != VNOVAL) {
	2372	ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
	2373	modflags \|= HAMMER_INODE_MTIME;
	2374	kflags \|= NOTE_ATTRIB;
	2375	}
	2376	if (vap->va_mode != (mode_t)VNOVAL) {
	2377	mode_t cur_mode = ip->ino_data.mode;
	2378	uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	2379	gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	2380
	2381	error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
	2382	cur_uid, cur_gid, &cur_mode);
	2383	if (error == 0 && ip->ino_data.mode != cur_mode) {
	2384	ip->ino_data.mode = cur_mode;
	2385	ip->ino_data.ctime = trans.time;
	2386	modflags \|= HAMMER_INODE_DDIRTY;
	2387	kflags \|= NOTE_ATTRIB;
	2388	}
	2389	}
	2390	done:
	2391	if (error == 0)
	2392	hammer_modify_inode(&trans, ip, modflags);
	2393	hammer_done_transaction(&trans);
	2394	hammer_knote(ap->a_vp, kflags);
	2395	lwkt_reltoken(&hmp->fs_token);
	2396	return (error);
	2397	}
	2398
	2399	/*
	2400	* hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
	2401	*/
	2402	static
	2403	int
	2404	hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
	2405	{
	2406	struct hammer_transaction trans;
	2407	struct hammer_inode *dip;
	2408	struct hammer_inode *nip;
	2409	hammer_record_t record;
	2410	struct nchandle *nch;
	2411	hammer_mount_t hmp;
	2412	int error;
	2413	int bytes;
	2414
	2415	ap->a_vap->va_type = VLNK;
	2416
	2417	nch = ap->a_nch;
	2418	dip = VTOI(ap->a_dvp);
	2419	hmp = dip->hmp;
	2420
	2421	if (dip->flags & HAMMER_INODE_RO)
	2422	return (EROFS);
	2423	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	2424	return (error);
	2425
	2426	/*
	2427	* Create a transaction to cover the operations we perform.
	2428	*/
	2429	lwkt_gettoken(&hmp->fs_token);
	2430	hammer_start_transaction(&trans, hmp);
	2431	++hammer_stats_file_iopsw;
	2432
	2433	/*
	2434	* Create a new filesystem object of the requested type. The
	2435	* returned inode will be referenced but not locked.
	2436	*/
	2437
	2438	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	2439	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	2440	NULL, &nip);
	2441	if (error) {
	2442	hammer_done_transaction(&trans);
	2443	*ap->a_vpp = NULL;
	2444	lwkt_reltoken(&hmp->fs_token);
	2445	return (error);
	2446	}
	2447
	2448	/*
	2449	* Add a record representing the symlink. symlink stores the link
	2450	* as pure data, not a string, and is no \0 terminated.
	2451	*/
	2452	if (error == 0) {
	2453	bytes = strlen(ap->a_target);
	2454
	2455	if (bytes <= HAMMER_INODE_BASESYMLEN) {
	2456	bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
	2457	} else {
	2458	record = hammer_alloc_mem_record(nip, bytes);
	2459	record->type = HAMMER_MEM_RECORD_GENERAL;
	2460
	2461	record->leaf.base.localization = nip->obj_localization +
	2462	HAMMER_LOCALIZE_MISC;
	2463	record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
	2464	record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
	2465	record->leaf.data_len = bytes;
	2466	KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
	2467	bcopy(ap->a_target, record->data->symlink.name, bytes);
	2468	error = hammer_ip_add_record(&trans, record);
	2469	}
	2470
	2471	/*
	2472	* Set the file size to the length of the link.
	2473	*/
	2474	if (error == 0) {
	2475	nip->ino_data.size = bytes;
	2476	hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
	2477	}
	2478	}
	2479	if (error == 0)
	2480	error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
	2481	nch->ncp->nc_nlen, nip);
	2482
	2483	/*
	2484	* Finish up.
	2485	*/
	2486	if (error) {
	2487	hammer_rel_inode(nip, 0);
	2488	*ap->a_vpp = NULL;
	2489	} else {
	2490	error = hammer_get_vnode(nip, ap->a_vpp);
	2491	hammer_rel_inode(nip, 0);
	2492	if (error == 0) {
	2493	cache_setunresolved(ap->a_nch);
	2494	cache_setvp(ap->a_nch, *ap->a_vpp);
	2495	hammer_knote(ap->a_dvp, NOTE_WRITE);
	2496	}
	2497	}
	2498	hammer_done_transaction(&trans);
	2499	lwkt_reltoken(&hmp->fs_token);
	2500	return (error);
	2501	}
	2502
	2503	/*
	2504	* hammer_vop_nwhiteout { nch, dvp, cred, flags }
	2505	*/
	2506	static
	2507	int
	2508	hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
	2509	{
	2510	struct hammer_transaction trans;
	2511	struct hammer_inode *dip;
	2512	hammer_mount_t hmp;
	2513	int error;
	2514
	2515	dip = VTOI(ap->a_dvp);
	2516	hmp = dip->hmp;
	2517
	2518	if (hammer_nohistory(dip) == 0 &&
	2519	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
	2520	return (error);
	2521	}
	2522
	2523	lwkt_gettoken(&hmp->fs_token);
	2524	hammer_start_transaction(&trans, hmp);
	2525	++hammer_stats_file_iopsw;
	2526	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
	2527	ap->a_cred, ap->a_flags, -1);
	2528	hammer_done_transaction(&trans);
	2529	lwkt_reltoken(&hmp->fs_token);
	2530
	2531	return (error);
	2532	}
	2533
	2534	/*
	2535	* hammer_vop_ioctl { vp, command, data, fflag, cred }
	2536	*/
	2537	static
	2538	int
	2539	hammer_vop_ioctl(struct vop_ioctl_args *ap)
	2540	{
	2541	struct hammer_inode *ip = ap->a_vp->v_data;
	2542	hammer_mount_t hmp = ip->hmp;
	2543	int error;
	2544
	2545	++hammer_stats_file_iopsr;
	2546	lwkt_gettoken(&hmp->fs_token);
	2547	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
	2548	ap->a_fflag, ap->a_cred);
	2549	lwkt_reltoken(&hmp->fs_token);
	2550	return (error);
	2551	}
	2552
	2553	static
	2554	int
	2555	hammer_vop_mountctl(struct vop_mountctl_args *ap)
	2556	{
	2557	static const struct mountctl_opt extraopt[] = {
	2558	{ HMNT_NOHISTORY, "nohistory" },
	2559	{ HMNT_MASTERID, "master" },
	2560	{ 0, NULL}
	2561
	2562	};
	2563	struct hammer_mount *hmp;
	2564	struct mount *mp;
	2565	int usedbytes;
	2566	int error;
	2567
	2568	error = 0;
	2569	usedbytes = 0;
	2570	mp = ap->a_head.a_ops->head.vv_mount;
	2571	KKASSERT(mp->mnt_data != NULL);
	2572	hmp = (struct hammer_mount *)mp->mnt_data;
	2573
	2574	lwkt_gettoken(&hmp->fs_token);
	2575
	2576	switch(ap->a_op) {
	2577	case MOUNTCTL_SET_EXPORT:
	2578	if (ap->a_ctllen != sizeof(struct export_args))
	2579	error = EINVAL;
	2580	else
	2581	error = hammer_vfs_export(mp, ap->a_op,
	2582	(const struct export_args *)ap->a_ctl);
	2583	break;
	2584	case MOUNTCTL_MOUNTFLAGS:
	2585	{
	2586	/*
	2587	* Call standard mountctl VOP function
	2588	* so we get user mount flags.
	2589	*/
	2590	error = vop_stdmountctl(ap);
	2591	if (error)
	2592	break;
	2593
	2594	usedbytes = *ap->a_res;
	2595
	2596	if (usedbytes > 0 && usedbytes < ap->a_buflen) {
	2597	usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
	2598	ap->a_buf,
	2599	ap->a_buflen - usedbytes,
	2600	&error);
	2601	}
	2602
	2603	*ap->a_res += usedbytes;
	2604	break;
	2605	}
	2606	default:
	2607	error = vop_stdmountctl(ap);
	2608	break;
	2609	}
	2610	lwkt_reltoken(&hmp->fs_token);
	2611	return(error);
	2612	}
	2613
	2614	/*
	2615	* hammer_vop_strategy { vp, bio }
	2616	*
	2617	* Strategy call, used for regular file read & write only. Note that the
	2618	* bp may represent a cluster.
	2619	*
	2620	* To simplify operation and allow better optimizations in the future,
	2621	* this code does not make any assumptions with regards to buffer alignment
	2622	* or size.
	2623	*/
	2624	static
	2625	int
	2626	hammer_vop_strategy(struct vop_strategy_args *ap)
	2627	{
	2628	struct buf *bp;
	2629	int error;
	2630
	2631	bp = ap->a_bio->bio_buf;
	2632
	2633	switch(bp->b_cmd) {
	2634	case BUF_CMD_READ:
	2635	error = hammer_vop_strategy_read(ap);
	2636	break;
	2637	case BUF_CMD_WRITE:
	2638	error = hammer_vop_strategy_write(ap);
	2639	break;
	2640	default:
	2641	bp->b_error = error = EINVAL;
	2642	bp->b_flags \|= B_ERROR;
	2643	biodone(ap->a_bio);
	2644	break;
	2645	}
	2646
	2647	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
	2648
	2649	return (error);
	2650	}
	2651
	2652	/*
	2653	* Read from a regular file. Iterate the related records and fill in the
	2654	* BIO/BUF. Gaps are zero-filled.
	2655	*
	2656	* The support code in hammer_object.c should be used to deal with mixed
	2657	* in-memory and on-disk records.
	2658	*
	2659	* NOTE: Can be called from the cluster code with an oversized buf.
	2660	*
	2661	* XXX atime update
	2662	*/
	2663	static
	2664	int
	2665	hammer_vop_strategy_read(struct vop_strategy_args *ap)
	2666	{
	2667	struct hammer_transaction trans;
	2668	struct hammer_inode *ip;
	2669	struct hammer_inode *dip;
	2670	hammer_mount_t hmp;
	2671	struct hammer_cursor cursor;
	2672	hammer_base_elm_t base;
	2673	hammer_off_t disk_offset;
	2674	struct bio *bio;
	2675	struct bio *nbio;
	2676	struct buf *bp;
	2677	int64_t rec_offset;
	2678	int64_t ran_end;
	2679	int64_t tmp64;
	2680	int error;
	2681	int boff;
	2682	int roff;
	2683	int n;
	2684	int isdedupable;
	2685
	2686	bio = ap->a_bio;
	2687	bp = bio->bio_buf;
	2688	ip = ap->a_vp->v_data;
	2689	hmp = ip->hmp;
	2690
	2691	/*
	2692	* The zone-2 disk offset may have been set by the cluster code via
	2693	* a BMAP operation, or else should be NOOFFSET.
	2694	*
	2695	* Checking the high bits for a match against zone-2 should suffice.
	2696	*
	2697	* In cases where a lot of data duplication is present it may be
	2698	* more beneficial to drop through and doubule-buffer through the
	2699	* device.
	2700	*/
	2701	nbio = push_bio(bio);
	2702	if (hammer_double_buffer == 0 &&
	2703	(nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	2704	HAMMER_ZONE_LARGE_DATA) {
	2705	lwkt_gettoken(&hmp->fs_token);
	2706	error = hammer_io_direct_read(hmp, nbio, NULL);
	2707	lwkt_reltoken(&hmp->fs_token);
	2708	return (error);
	2709	}
	2710
	2711	/*
	2712	* Well, that sucked. Do it the hard way. If all the stars are
	2713	* aligned we may still be able to issue a direct-read.
	2714	*/
	2715	lwkt_gettoken(&hmp->fs_token);
	2716	hammer_simple_transaction(&trans, hmp);
	2717	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	2718
	2719	/*
	2720	* Key range (begin and end inclusive) to scan. Note that the key's
	2721	* stored in the actual records represent BASE+LEN, not BASE. The
	2722	* first record containing bio_offset will have a key > bio_offset.
	2723	*/
	2724	cursor.key_beg.localization = ip->obj_localization +
	2725	HAMMER_LOCALIZE_MISC;
	2726	cursor.key_beg.obj_id = ip->obj_id;
	2727	cursor.key_beg.create_tid = 0;
	2728	cursor.key_beg.delete_tid = 0;
	2729	cursor.key_beg.obj_type = 0;
	2730	cursor.key_beg.key = bio->bio_offset + 1;
	2731	cursor.asof = ip->obj_asof;
	2732	cursor.flags \|= HAMMER_CURSOR_ASOF;
	2733
	2734	cursor.key_end = cursor.key_beg;
	2735	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
	2736	#if 0
	2737	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
	2738	cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
	2739	cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
	2740	cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	2741	} else
	2742	#endif
	2743	{
	2744	ran_end = bio->bio_offset + bp->b_bufsize;
	2745	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	2746	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	2747	tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
	2748	if (tmp64 < ran_end)
	2749	cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	2750	else
	2751	cursor.key_end.key = ran_end + MAXPHYS + 1;
	2752	}
	2753	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE;
	2754
	2755	error = hammer_ip_first(&cursor);
	2756	boff = 0;
	2757
	2758	while (error == 0) {
	2759	/*
	2760	* Get the base file offset of the record. The key for
	2761	* data records is (base + bytes) rather then (base).
	2762	*/
	2763	base = &cursor.leaf->base;
	2764	rec_offset = base->key - cursor.leaf->data_len;
	2765
	2766	/*
	2767	* Calculate the gap, if any, and zero-fill it.
	2768	*
	2769	* n is the offset of the start of the record verses our
	2770	* current seek offset in the bio.
	2771	*/
	2772	n = (int)(rec_offset - (bio->bio_offset + boff));
	2773	if (n > 0) {
	2774	if (n > bp->b_bufsize - boff)
	2775	n = bp->b_bufsize - boff;
	2776	bzero((char *)bp->b_data + boff, n);
	2777	boff += n;
	2778	n = 0;
	2779	}
	2780
	2781	/*
	2782	* Calculate the data offset in the record and the number
	2783	* of bytes we can copy.
	2784	*
	2785	* There are two degenerate cases. First, boff may already
	2786	* be at bp->b_bufsize. Secondly, the data offset within
	2787	* the record may exceed the record's size.
	2788	*/
	2789	roff = -n;
	2790	rec_offset += roff;
	2791	n = cursor.leaf->data_len - roff;
	2792	if (n <= 0) {
	2793	kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
	2794	n = 0;
	2795	} else if (n > bp->b_bufsize - boff) {
	2796	n = bp->b_bufsize - boff;
	2797	}
	2798
	2799	/*
	2800	* Deal with cached truncations. This cool bit of code
	2801	* allows truncate()/ftruncate() to avoid having to sync
	2802	* the file.
	2803	*
	2804	* If the frontend is truncated then all backend records are
	2805	* subject to the frontend's truncation.
	2806	*
	2807	* If the backend is truncated then backend records on-disk
	2808	* (but not in-memory) are subject to the backend's
	2809	* truncation. In-memory records owned by the backend
	2810	* represent data written after the truncation point on the
	2811	* backend and must not be truncated.
	2812	*
	2813	* Truncate operations deal with frontend buffer cache
	2814	* buffers and frontend-owned in-memory records synchronously.
	2815	*/
	2816	if (ip->flags & HAMMER_INODE_TRUNCATED) {
	2817	if (hammer_cursor_ondisk(&cursor)/* \|\|
	2818	cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
	2819	if (ip->trunc_off <= rec_offset)
	2820	n = 0;
	2821	else if (ip->trunc_off < rec_offset + n)
	2822	n = (int)(ip->trunc_off - rec_offset);
	2823	}
	2824	}
	2825	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	2826	if (hammer_cursor_ondisk(&cursor)) {
	2827	if (ip->sync_trunc_off <= rec_offset)
	2828	n = 0;
	2829	else if (ip->sync_trunc_off < rec_offset + n)
	2830	n = (int)(ip->sync_trunc_off - rec_offset);
	2831	}
	2832	}
	2833
	2834	/*
	2835	* Try to issue a direct read into our bio if possible,
	2836	* otherwise resolve the element data into a hammer_buffer
	2837	* and copy.
	2838	*
	2839	* The buffer on-disk should be zerod past any real
	2840	* truncation point, but may not be for any synthesized
	2841	* truncation point from above.
	2842	*/
	2843	disk_offset = cursor.leaf->data_offset + roff;
	2844	isdedupable = (boff == 0 && n == bp->b_bufsize &&
	2845	hammer_cursor_ondisk(&cursor) &&
	2846	((int)disk_offset & HAMMER_BUFMASK) == 0);
	2847
	2848	if (isdedupable && hammer_double_buffer == 0) {
	2849	KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
	2850	HAMMER_ZONE_LARGE_DATA);
	2851	nbio->bio_offset = disk_offset;
	2852	error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
	2853	if (hammer_live_dedup && error == 0)
	2854	hammer_dedup_cache_add(ip, cursor.leaf);
	2855	goto done;
	2856	} else if (n) {
	2857	error = hammer_ip_resolve_data(&cursor);
	2858	if (error == 0) {
	2859	if (hammer_live_dedup && isdedupable)
	2860	hammer_dedup_cache_add(ip, cursor.leaf);
	2861	bcopy((char *)cursor.data + roff,
	2862	(char *)bp->b_data + boff, n);
	2863	}
	2864	}
	2865	if (error)
	2866	break;
	2867
	2868	/*
	2869	* We have to be sure that the only elements added to the
	2870	* dedup cache are those which are already on-media.
	2871	*/
	2872	if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
	2873	hammer_dedup_cache_add(ip, cursor.leaf);
	2874
	2875	/*
	2876	* Iterate until we have filled the request.
	2877	*/
	2878	boff += n;
	2879	if (boff == bp->b_bufsize)
	2880	break;
	2881	error = hammer_ip_next(&cursor);
	2882	}
	2883
	2884	/*
	2885	* There may have been a gap after the last record
	2886	*/
	2887	if (error == ENOENT)
	2888	error = 0;
	2889	if (error == 0 && boff != bp->b_bufsize) {
	2890	KKASSERT(boff < bp->b_bufsize);
	2891	bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
	2892	/* boff = bp->b_bufsize; */
	2893	}
	2894	bp->b_resid = 0;
	2895	bp->b_error = error;
	2896	if (error)
	2897	bp->b_flags \|= B_ERROR;
	2898	biodone(ap->a_bio);
	2899
	2900	done:
	2901	/*
	2902	* Cache the b-tree node for the last data read in cache[1].
	2903	*
	2904	* If we hit the file EOF then also cache the node in the
	2905	* governing director's cache[3], it will be used to initialize
	2906	* the inode's cache[1] for any inodes looked up via the directory.
	2907	*
	2908	* This doesn't reduce disk accesses since the B-Tree chain is
	2909	* likely cached, but it does reduce cpu overhead when looking
	2910	* up file offsets for cpdup/tar/cpio style iterations.
	2911	*/
	2912	if (cursor.node)
	2913	hammer_cache_node(&ip->cache[1], cursor.node);
	2914	if (ran_end >= ip->ino_data.size) {
	2915	dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
	2916	ip->obj_asof, ip->obj_localization);
	2917	if (dip) {
	2918	hammer_cache_node(&dip->cache[3], cursor.node);
	2919	hammer_rel_inode(dip, 0);
	2920	}
	2921	}
	2922	hammer_done_cursor(&cursor);
	2923	hammer_done_transaction(&trans);
	2924	lwkt_reltoken(&hmp->fs_token);
	2925	return(error);
	2926	}
	2927
	2928	/*
	2929	* BMAP operation - used to support cluster_read() only.
	2930	*
	2931	* (struct vnode vp, off_t loffset, off_t doffsetp, int runp, int runb)
	2932	*
	2933	* This routine may return EOPNOTSUPP if the opration is not supported for
	2934	* the specified offset. The contents of the pointer arguments do not
	2935	* need to be initialized in that case.
	2936	*
	2937	* If a disk address is available and properly aligned return 0 with
	2938	* doffsetp set to the zone-2 address, and runp / *runb set appropriately
	2939	* to the run-length relative to that offset. Callers may assume that
	2940	* doffsetp is valid if 0 is returned, even if runp is not sufficiently
	2941	* large, so return EOPNOTSUPP if it is not sufficiently large.
	2942	*/
	2943	static
	2944	int
	2945	hammer_vop_bmap(struct vop_bmap_args *ap)
	2946	{
	2947	struct hammer_transaction trans;
	2948	struct hammer_inode *ip;
	2949	hammer_mount_t hmp;
	2950	struct hammer_cursor cursor;
	2951	hammer_base_elm_t base;
	2952	int64_t rec_offset;
	2953	int64_t ran_end;
	2954	int64_t tmp64;
	2955	int64_t base_offset;
	2956	int64_t base_disk_offset;
	2957	int64_t last_offset;
	2958	hammer_off_t last_disk_offset;
	2959	hammer_off_t disk_offset;
	2960	int rec_len;
	2961	int error;
	2962	int blksize;
	2963
	2964	++hammer_stats_file_iopsr;
	2965	ip = ap->a_vp->v_data;
	2966	hmp = ip->hmp;
	2967
	2968	/*
	2969	* We can only BMAP regular files. We can't BMAP database files,
	2970	* directories, etc.
	2971	*/
	2972	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
	2973	return(EOPNOTSUPP);
	2974
	2975	/*
	2976	* bmap is typically called with runp/runb both NULL when used
	2977	* for writing. We do not support BMAP for writing atm.
	2978	*/
	2979	if (ap->a_cmd != BUF_CMD_READ)
	2980	return(EOPNOTSUPP);
	2981
	2982	/*
	2983	* Scan the B-Tree to acquire blockmap addresses, then translate
	2984	* to raw addresses.
	2985	*/
	2986	lwkt_gettoken(&hmp->fs_token);
	2987	hammer_simple_transaction(&trans, hmp);
	2988	#if 0
	2989	kprintf("bmap_beg %016llx ip->cache %p\n",
	2990	(long long)ap->a_loffset, ip->cache[1]);
	2991	#endif
	2992	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	2993
	2994	/*
	2995	* Key range (begin and end inclusive) to scan. Note that the key's
	2996	* stored in the actual records represent BASE+LEN, not BASE. The
	2997	* first record containing bio_offset will have a key > bio_offset.
	2998	*/
	2999	cursor.key_beg.localization = ip->obj_localization +
	3000	HAMMER_LOCALIZE_MISC;
	3001	cursor.key_beg.obj_id = ip->obj_id;
	3002	cursor.key_beg.create_tid = 0;
	3003	cursor.key_beg.delete_tid = 0;
	3004	cursor.key_beg.obj_type = 0;
	3005	if (ap->a_runb)
	3006	cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	3007	else
	3008	cursor.key_beg.key = ap->a_loffset + 1;
	3009	if (cursor.key_beg.key < 0)
	3010	cursor.key_beg.key = 0;
	3011	cursor.asof = ip->obj_asof;
	3012	cursor.flags \|= HAMMER_CURSOR_ASOF;
	3013
	3014	cursor.key_end = cursor.key_beg;
	3015	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
	3016
	3017	ran_end = ap->a_loffset + MAXPHYS;
	3018	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	3019	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	3020	tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
	3021	if (tmp64 < ran_end)
	3022	cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	3023	else
	3024	cursor.key_end.key = ran_end + MAXPHYS + 1;
	3025
	3026	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE;
	3027
	3028	error = hammer_ip_first(&cursor);
	3029	base_offset = last_offset = 0;
	3030	base_disk_offset = last_disk_offset = 0;
	3031
	3032	while (error == 0) {
	3033	/*
	3034	* Get the base file offset of the record. The key for
	3035	* data records is (base + bytes) rather then (base).
	3036	*
	3037	* NOTE: rec_offset + rec_len may exceed the end-of-file.
	3038	* The extra bytes should be zero on-disk and the BMAP op
	3039	* should still be ok.
	3040	*/
	3041	base = &cursor.leaf->base;
	3042	rec_offset = base->key - cursor.leaf->data_len;
	3043	rec_len = cursor.leaf->data_len;
	3044
	3045	/*
	3046	* Incorporate any cached truncation.
	3047	*
	3048	* NOTE: Modifications to rec_len based on synthesized
	3049	* truncation points remove the guarantee that any extended
	3050	* data on disk is zero (since the truncations may not have
	3051	* taken place on-media yet).
	3052	*/
	3053	if (ip->flags & HAMMER_INODE_TRUNCATED) {
	3054	if (hammer_cursor_ondisk(&cursor) \|\|
	3055	cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
	3056	if (ip->trunc_off <= rec_offset)
	3057	rec_len = 0;
	3058	else if (ip->trunc_off < rec_offset + rec_len)
	3059	rec_len = (int)(ip->trunc_off - rec_offset);
	3060	}
	3061	}
	3062	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	3063	if (hammer_cursor_ondisk(&cursor)) {
	3064	if (ip->sync_trunc_off <= rec_offset)
	3065	rec_len = 0;
	3066	else if (ip->sync_trunc_off < rec_offset + rec_len)
	3067	rec_len = (int)(ip->sync_trunc_off - rec_offset);
	3068	}
	3069	}
	3070
	3071	/*
	3072	* Accumulate information. If we have hit a discontiguous
	3073	* block reset base_offset unless we are already beyond the
	3074	* requested offset. If we are, that's it, we stop.
	3075	*/
	3076	if (error)
	3077	break;
	3078	if (hammer_cursor_ondisk(&cursor)) {
	3079	disk_offset = cursor.leaf->data_offset;
	3080	if (rec_offset != last_offset \|\|
	3081	disk_offset != last_disk_offset) {
	3082	if (rec_offset > ap->a_loffset)
	3083	break;
	3084	base_offset = rec_offset;
	3085	base_disk_offset = disk_offset;
	3086	}
	3087	last_offset = rec_offset + rec_len;
	3088	last_disk_offset = disk_offset + rec_len;
	3089
	3090	if (hammer_live_dedup)
	3091	hammer_dedup_cache_add(ip, cursor.leaf);
	3092	}
	3093
	3094	error = hammer_ip_next(&cursor);
	3095	}
	3096
	3097	#if 0
	3098	kprintf("BMAP %016llx: %016llx - %016llx\n",
	3099	(long long)ap->a_loffset,
	3100	(long long)base_offset,
	3101	(long long)last_offset);
	3102	kprintf("BMAP %16s: %016llx - %016llx\n", "",
	3103	(long long)base_disk_offset,
	3104	(long long)last_disk_offset);
	3105	#endif
	3106
	3107	if (cursor.node) {
	3108	hammer_cache_node(&ip->cache[1], cursor.node);
	3109	#if 0
	3110	kprintf("bmap_end2 %016llx ip->cache %p\n",
	3111	(long long)ap->a_loffset, ip->cache[1]);
	3112	#endif
	3113	}
	3114	hammer_done_cursor(&cursor);
	3115	hammer_done_transaction(&trans);
	3116	lwkt_reltoken(&hmp->fs_token);
	3117
	3118	/*
	3119	* If we couldn't find any records or the records we did find were
	3120	* all behind the requested offset, return failure. A forward
	3121	* truncation can leave a hole w/ no on-disk records.
	3122	*/
	3123	if (last_offset == 0 \|\| last_offset < ap->a_loffset)
	3124	return (EOPNOTSUPP);
	3125
	3126	/*
	3127	* Figure out the block size at the requested offset and adjust
	3128	* our limits so the cluster_read() does not create inappropriately
	3129	* sized buffer cache buffers.
	3130	*/
	3131	blksize = hammer_blocksize(ap->a_loffset);
	3132	if (hammer_blocksize(base_offset) != blksize) {
	3133	base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	3134	}
	3135	if (last_offset != ap->a_loffset &&
	3136	hammer_blocksize(last_offset - 1) != blksize) {
	3137	last_offset = hammer_blockdemarc(ap->a_loffset,
	3138	last_offset - 1);
	3139	}
	3140
	3141	/*
	3142	* Returning EOPNOTSUPP simply prevents the direct-IO optimization
	3143	* from occuring.
	3144	*/
	3145	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
	3146
	3147	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
	3148	/*
	3149	* Only large-data zones can be direct-IOd
	3150	*/
	3151	error = EOPNOTSUPP;
	3152	} else if ((disk_offset & HAMMER_BUFMASK) \|\|
	3153	(last_offset - ap->a_loffset) < blksize) {
	3154	/*
	3155	* doffsetp is not aligned or the forward run size does
	3156	* not cover a whole buffer, disallow the direct I/O.
	3157	*/
	3158	error = EOPNOTSUPP;
	3159	} else {
	3160	/*
	3161	* We're good.
	3162	*/
	3163	*ap->a_doffsetp = disk_offset;
	3164	if (ap->a_runb) {
	3165	*ap->a_runb = ap->a_loffset - base_offset;
	3166	KKASSERT(*ap->a_runb >= 0);
	3167	}
	3168	if (ap->a_runp) {
	3169	*ap->a_runp = last_offset - ap->a_loffset;
	3170	KKASSERT(*ap->a_runp >= 0);
	3171	}
	3172	error = 0;
	3173	}
	3174	return(error);
	3175	}
	3176
	3177	/*
	3178	* Write to a regular file. Because this is a strategy call the OS is
	3179	* trying to actually get data onto the media.
	3180	*/
	3181	static
	3182	int
	3183	hammer_vop_strategy_write(struct vop_strategy_args *ap)
	3184	{
	3185	hammer_record_t record;
	3186	hammer_mount_t hmp;
	3187	hammer_inode_t ip;
	3188	struct bio *bio;
	3189	struct buf *bp;
	3190	int blksize;
	3191	int bytes;
	3192	int error;
	3193
	3194	bio = ap->a_bio;
	3195	bp = bio->bio_buf;
	3196	ip = ap->a_vp->v_data;
	3197	hmp = ip->hmp;
	3198
	3199	blksize = hammer_blocksize(bio->bio_offset);
	3200	KKASSERT(bp->b_bufsize == blksize);
	3201
	3202	if (ip->flags & HAMMER_INODE_RO) {
	3203	bp->b_error = EROFS;
	3204	bp->b_flags \|= B_ERROR;
	3205	biodone(ap->a_bio);
	3206	return(EROFS);
	3207	}
	3208
	3209	lwkt_gettoken(&hmp->fs_token);
	3210
	3211	/*
	3212	* Interlock with inode destruction (no in-kernel or directory
	3213	* topology visibility). If we queue new IO while trying to
	3214	* destroy the inode we can deadlock the vtrunc call in
	3215	* hammer_inode_unloadable_check().
	3216	*
	3217	* Besides, there's no point flushing a bp associated with an
	3218	* inode that is being destroyed on-media and has no kernel
	3219	* references.
	3220	*/
	3221	if ((ip->flags \| ip->sync_flags) &
	3222	(HAMMER_INODE_DELETING\|HAMMER_INODE_DELETED)) {
	3223	bp->b_resid = 0;
	3224	biodone(ap->a_bio);
	3225	lwkt_reltoken(&hmp->fs_token);
	3226	return(0);
	3227	}
	3228
	3229	/*
	3230	* Reserve space and issue a direct-write from the front-end.
	3231	* NOTE: The direct_io code will hammer_bread/bcopy smaller
	3232	* allocations.
	3233	*
	3234	* An in-memory record will be installed to reference the storage
	3235	* until the flusher can get to it.
	3236	*
	3237	* Since we own the high level bio the front-end will not try to
	3238	* do a direct-read until the write completes.
	3239	*
	3240	* NOTE: The only time we do not reserve a full-sized buffers
	3241	* worth of data is if the file is small. We do not try to
	3242	* allocate a fragment (from the small-data zone) at the end of
	3243	* an otherwise large file as this can lead to wildly separated
	3244	* data.
	3245	*/
	3246	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	3247	KKASSERT(bio->bio_offset < ip->ino_data.size);
	3248	if (bio->bio_offset \|\| ip->ino_data.size > HAMMER_BUFSIZE / 2)
	3249	bytes = bp->b_bufsize;
	3250	else
	3251	bytes = ((int)ip->ino_data.size + 15) & ~15;
	3252
	3253	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
	3254	bytes, &error);
	3255
	3256	/*
	3257	* B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	3258	* in hammer_vop_write(). We must flag the record so the proper
	3259	* REDO_TERM_WRITE entry is generated during the flush.
	3260	*/
	3261	if (record) {
	3262	if (bp->b_flags & B_VFSFLAG1) {
	3263	record->flags \|= HAMMER_RECF_REDO;
	3264	bp->b_flags &= ~B_VFSFLAG1;
	3265	}
	3266	if (record->flags & HAMMER_RECF_DEDUPED) {
	3267	bp->b_resid = 0;
	3268	hammer_ip_replace_bulk(hmp, record);
	3269	biodone(ap->a_bio);
	3270	} else {
	3271	hammer_io_direct_write(hmp, bio, record);
	3272	}
	3273	if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
	3274	hammer_flush_inode(ip, 0);
	3275	} else {
	3276	bp->b_bio2.bio_offset = NOOFFSET;
	3277	bp->b_error = error;
	3278	bp->b_flags \|= B_ERROR;
	3279	biodone(ap->a_bio);
	3280	}
	3281	lwkt_reltoken(&hmp->fs_token);
	3282	return(error);
	3283	}
	3284
	3285	/*
	3286	* dounlink - disconnect a directory entry
	3287	*
	3288	* XXX whiteout support not really in yet
	3289	*/
	3290	static int
	3291	hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
	3292	struct vnode dvp, struct ucred cred,
	3293	int flags, int isdir)
	3294	{
	3295	struct namecache *ncp;
	3296	hammer_inode_t dip;
	3297	hammer_inode_t ip;
	3298	hammer_mount_t hmp;
	3299	struct hammer_cursor cursor;
	3300	int64_t namekey;
	3301	u_int32_t max_iterations;
	3302	int nlen, error;
	3303
	3304	/*
	3305	* Calculate the namekey and setup the key range for the scan. This
	3306	* works kinda like a chained hash table where the lower 32 bits
	3307	* of the namekey synthesize the chain.
	3308	*
	3309	* The key range is inclusive of both key_beg and key_end.
	3310	*/
	3311	dip = VTOI(dvp);
	3312	ncp = nch->ncp;
	3313	hmp = dip->hmp;
	3314
	3315	if (dip->flags & HAMMER_INODE_RO)
	3316	return (EROFS);
	3317
	3318	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
	3319	&max_iterations);
	3320	retry:
	3321	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	3322	cursor.key_beg.localization = dip->obj_localization +
	3323	hammer_dir_localization(dip);
	3324	cursor.key_beg.obj_id = dip->obj_id;
	3325	cursor.key_beg.key = namekey;
	3326	cursor.key_beg.create_tid = 0;
	3327	cursor.key_beg.delete_tid = 0;
	3328	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	3329	cursor.key_beg.obj_type = 0;
	3330
	3331	cursor.key_end = cursor.key_beg;
	3332	cursor.key_end.key += max_iterations;
	3333	cursor.asof = dip->obj_asof;
	3334	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	3335
	3336	/*
	3337	* Scan all matching records (the chain), locate the one matching
	3338	* the requested path component. info->last_error contains the
	3339	* error code on search termination and could be 0, ENOENT, or
	3340	* something else.
	3341	*
	3342	* The hammer_ip_*() functions merge in-memory records with on-disk
	3343	* records for the purposes of the search.
	3344	*/
	3345	error = hammer_ip_first(&cursor);
	3346
	3347	while (error == 0) {
	3348	error = hammer_ip_resolve_data(&cursor);
	3349	if (error)
	3350	break;
	3351	nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
	3352	KKASSERT(nlen > 0);
	3353	if (ncp->nc_nlen == nlen &&
	3354	bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	3355	break;
	3356	}
	3357	error = hammer_ip_next(&cursor);
	3358	}
	3359
	3360	/*
	3361	* If all is ok we have to get the inode so we can adjust nlinks.
	3362	* To avoid a deadlock with the flusher we must release the inode
	3363	* lock on the directory when acquiring the inode for the entry.
	3364	*
	3365	* If the target is a directory, it must be empty.
	3366	*/
	3367	if (error == 0) {
	3368	hammer_unlock(&cursor.ip->lock);
	3369	ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
	3370	hmp->asof,
	3371	cursor.data->entry.localization,
	3372	0, &error);
	3373	hammer_lock_sh(&cursor.ip->lock);
	3374	if (error == ENOENT) {
	3375	kprintf("HAMMER: WARNING: Removing "
	3376	"dirent w/missing inode \"%s\"\n"
	3377	"\tobj_id = %016llx\n",
	3378	ncp->nc_name,
	3379	(long long)cursor.data->entry.obj_id);
	3380	error = 0;
	3381	}
	3382
	3383	/*
	3384	* If isdir >= 0 we validate that the entry is or is not a
	3385	* directory. If isdir < 0 we don't care.
	3386	*/
	3387	if (error == 0 && isdir >= 0 && ip) {
	3388	if (isdir &&
	3389	ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
	3390	error = ENOTDIR;
	3391	} else if (isdir == 0 &&
	3392	ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
	3393	error = EISDIR;
	3394	}
	3395	}
	3396
	3397	/*
	3398	* If we are trying to remove a directory the directory must
	3399	* be empty.
	3400	*
	3401	* The check directory code can loop and deadlock/retry. Our
	3402	* own cursor's node locks must be released to avoid a 3-way
	3403	* deadlock with the flusher if the check directory code
	3404	* blocks.
	3405	*
	3406	* If any changes whatsoever have been made to the cursor
	3407	* set EDEADLK and retry.
	3408	*
	3409	* WARNING: See warnings in hammer_unlock_cursor()
	3410	* function.
	3411	*/
	3412	if (error == 0 && ip && ip->ino_data.obj_type ==
	3413	HAMMER_OBJTYPE_DIRECTORY) {
	3414	hammer_unlock_cursor(&cursor);
	3415	error = hammer_ip_check_directory_empty(trans, ip);
	3416	hammer_lock_cursor(&cursor);
	3417	if (cursor.flags & HAMMER_CURSOR_RETEST) {
	3418	kprintf("HAMMER: Warning: avoided deadlock "
	3419	"on rmdir '%s'\n",
	3420	ncp->nc_name);
	3421	error = EDEADLK;
	3422	}
	3423	}
	3424
	3425	/*
	3426	* Delete the directory entry.
	3427	*
	3428	* WARNING: hammer_ip_del_directory() may have to terminate
	3429	* the cursor to avoid a deadlock. It is ok to call
	3430	* hammer_done_cursor() twice.
	3431	*/
	3432	if (error == 0) {
	3433	error = hammer_ip_del_directory(trans, &cursor,
	3434	dip, ip);
	3435	}
	3436	hammer_done_cursor(&cursor);
	3437	if (error == 0) {
	3438	cache_setunresolved(nch);
	3439	cache_setvp(nch, NULL);
	3440
	3441	/*
	3442	* NOTE: ip->vp, if non-NULL, cannot be directly
	3443	* referenced without formally acquiring the
	3444	* vp since the vp might have zero refs on it,
	3445	* or in the middle of a reclaim, etc.
	3446	*
	3447	* NOTE: The cache_setunresolved() can rip the vp
	3448	* out from under us since the vp may not have
	3449	* any refs, in which case ip->vp will be NULL
	3450	* from the outset.
	3451	*/
	3452	while (ip && ip->vp) {
	3453	struct vnode *vp;
	3454
	3455	error = hammer_get_vnode(ip, &vp);
	3456	if (error == 0 && vp) {
	3457	vn_unlock(vp);
	3458	hammer_knote(ip->vp, NOTE_DELETE);
	3459	cache_inval_vp(ip->vp, CINV_DESTROY);
	3460	vrele(vp);
	3461	break;
	3462	}
	3463	kprintf("Debug: HAMMER ip/vp race1 avoided\n");
	3464	}
	3465	}
	3466	if (ip)
	3467	hammer_rel_inode(ip, 0);
	3468	} else {
	3469	hammer_done_cursor(&cursor);
	3470	}
	3471	if (error == EDEADLK)
	3472	goto retry;
	3473
	3474	return (error);
	3475	}
	3476
	3477	/************************************************************************
	3478	* FIFO AND SPECFS OPS *
	3479	************************************************************************
	3480	*
	3481	*/
	3482	static int
	3483	hammer_vop_fifoclose (struct vop_close_args *ap)
	3484	{
	3485	/* XXX update itimes */
	3486	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
	3487	}
	3488
	3489	static int
	3490	hammer_vop_fiforead (struct vop_read_args *ap)
	3491	{
	3492	int error;
	3493
	3494	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3495	/* XXX update access time */
	3496	return (error);
	3497	}
	3498
	3499	static int
	3500	hammer_vop_fifowrite (struct vop_write_args *ap)
	3501	{
	3502	int error;
	3503
	3504	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3505	/* XXX update access time */
	3506	return (error);
	3507	}
	3508
	3509	static
	3510	int
	3511	hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
	3512	{
	3513	int error;
	3514
	3515	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3516	if (error)
	3517	error = hammer_vop_kqfilter(ap);
	3518	return(error);
	3519	}
	3520
	3521	/************************************************************************
	3522	* KQFILTER OPS *
	3523	************************************************************************
	3524	*
	3525	*/
	3526	static void filt_hammerdetach(struct knote *kn);
	3527	static int filt_hammerread(struct knote *kn, long hint);
	3528	static int filt_hammerwrite(struct knote *kn, long hint);
	3529	static int filt_hammervnode(struct knote *kn, long hint);
	3530
	3531	static struct filterops hammerread_filtops =
	3532	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
	3533	static struct filterops hammerwrite_filtops =
	3534	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
	3535	static struct filterops hammervnode_filtops =
	3536	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
	3537
	3538	static
	3539	int
	3540	hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
	3541	{
	3542	struct vnode *vp = ap->a_vp;
	3543	struct knote *kn = ap->a_kn;
	3544
	3545	switch (kn->kn_filter) {
	3546	case EVFILT_READ:
	3547	kn->kn_fop = &hammerread_filtops;
	3548	break;
	3549	case EVFILT_WRITE:
	3550	kn->kn_fop = &hammerwrite_filtops;
	3551	break;
	3552	case EVFILT_VNODE:
	3553	kn->kn_fop = &hammervnode_filtops;
	3554	break;
	3555	default:
	3556	return (EOPNOTSUPP);
	3557	}
	3558
	3559	kn->kn_hook = (caddr_t)vp;
	3560
	3561	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
	3562
	3563	return(0);
	3564	}
	3565
	3566	static void
	3567	filt_hammerdetach(struct knote *kn)
	3568	{
	3569	struct vnode vp = (void )kn->kn_hook;
	3570
	3571	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
	3572	}
	3573
	3574	static int
	3575	filt_hammerread(struct knote *kn, long hint)
	3576	{
	3577	struct vnode vp = (void )kn->kn_hook;
	3578	hammer_inode_t ip = VTOI(vp);
	3579	hammer_mount_t hmp = ip->hmp;
	3580	off_t off;
	3581
	3582	if (hint == NOTE_REVOKE) {
	3583	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
	3584	return(1);
	3585	}
	3586	lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */
	3587	off = ip->ino_data.size - kn->kn_fp->f_offset;
	3588	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	3589	lwkt_reltoken(&hmp->fs_token);
	3590	if (kn->kn_sfflags & NOTE_OLDAPI)
	3591	return(1);
	3592	return (kn->kn_data != 0);
	3593	}
	3594
	3595	static int
	3596	filt_hammerwrite(struct knote *kn, long hint)
	3597	{
	3598	if (hint == NOTE_REVOKE)
	3599	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
	3600	kn->kn_data = 0;
	3601	return (1);
	3602	}
	3603
	3604	static int
	3605	filt_hammervnode(struct knote *kn, long hint)
	3606	{
	3607	if (kn->kn_sfflags & hint)
	3608	kn->kn_fflags \|= hint;
	3609	if (hint == NOTE_REVOKE) {
	3610	kn->kn_flags \|= EV_EOF;
	3611	return (1);
	3612	}
	3613	return (kn->kn_fflags != 0);
	3614	}
	3615