gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	#include <sys/mountctl.h>
	36	#include <sys/namecache.h>
	37	#include <sys/buf2.h>
	38	#include <vfs/fifofs/fifo.h>
	39
	40	#include "hammer.h"
	41
	/*
	 * USERFS VNOPS
	 *
	 * Forward declarations for the vnode operation handlers that are
	 * installed in the vop_ops tables below.
	 */

	/* File data, buffer I/O and synchronization */
	static int hammer_vop_fsync(struct vop_fsync_args *);
	static int hammer_vop_read(struct vop_read_args *);
	static int hammer_vop_write(struct vop_write_args *);
	static int hammer_vop_strategy(struct vop_strategy_args *);
	static int hammer_vop_bmap(struct vop_bmap_args *);

	/* Open/close, access control, locking and control operations */
	static int hammer_vop_open(struct vop_open_args *);
	static int hammer_vop_close(struct vop_close_args *);
	static int hammer_vop_access(struct vop_access_args *);
	static int hammer_vop_advlock(struct vop_advlock_args *);
	static int hammer_vop_ioctl(struct vop_ioctl_args *);
	static int hammer_vop_mountctl(struct vop_mountctl_args *);
	static int hammer_vop_kqfilter(struct vop_kqfilter_args *);
	static int hammer_vop_print(struct vop_print_args *);

	/* Attributes */
	static int hammer_vop_getattr(struct vop_getattr_args *);
	static int hammer_vop_setattr(struct vop_setattr_args *);
	static int hammer_vop_markatime(struct vop_markatime_args *);

	/* Namespace operations */
	static int hammer_vop_nresolve(struct vop_nresolve_args *);
	static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
	static int hammer_vop_ncreate(struct vop_ncreate_args *);
	static int hammer_vop_nlink(struct vop_nlink_args *);
	static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
	static int hammer_vop_nmknod(struct vop_nmknod_args *);
	static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
	static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
	static int hammer_vop_nremove(struct vop_nremove_args *);
	static int hammer_vop_nrename(struct vop_nrename_args *);
	static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
	static int hammer_vop_readdir(struct vop_readdir_args *);
	static int hammer_vop_readlink(struct vop_readlink_args *);

	/* FIFO variants, installed in hammer_fifo_vops */
	static int hammer_vop_fifoclose(struct vop_close_args *);
	static int hammer_vop_fiforead(struct vop_read_args *);
	static int hammer_vop_fifowrite(struct vop_write_args *);
	static int hammer_vop_fifokqfilter(struct vop_kqfilter_args *);
	79
	80	struct vop_ops hammer_vnode_vops = {
	81	.vop_default = vop_defaultop,
	82	.vop_fsync = hammer_vop_fsync,
	83	.vop_getpages = vop_stdgetpages,
	84	.vop_putpages = vop_stdputpages,
	85	.vop_read = hammer_vop_read,
	86	.vop_write = hammer_vop_write,
	87	.vop_access = hammer_vop_access,
	88	.vop_advlock = hammer_vop_advlock,
	89	.vop_close = hammer_vop_close,
	90	.vop_ncreate = hammer_vop_ncreate,
	91	.vop_getattr = hammer_vop_getattr,
	92	.vop_inactive = hammer_vop_inactive,
	93	.vop_reclaim = hammer_vop_reclaim,
	94	.vop_nresolve = hammer_vop_nresolve,
	95	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	96	.vop_nlink = hammer_vop_nlink,
	97	.vop_nmkdir = hammer_vop_nmkdir,
	98	.vop_nmknod = hammer_vop_nmknod,
	99	.vop_open = hammer_vop_open,
	100	.vop_pathconf = vop_stdpathconf,
	101	.vop_print = hammer_vop_print,
	102	.vop_readdir = hammer_vop_readdir,
	103	.vop_readlink = hammer_vop_readlink,
	104	.vop_nremove = hammer_vop_nremove,
	105	.vop_nrename = hammer_vop_nrename,
	106	.vop_nrmdir = hammer_vop_nrmdir,
	107	.vop_markatime = hammer_vop_markatime,
	108	.vop_setattr = hammer_vop_setattr,
	109	.vop_bmap = hammer_vop_bmap,
	110	.vop_strategy = hammer_vop_strategy,
	111	.vop_nsymlink = hammer_vop_nsymlink,
	112	.vop_nwhiteout = hammer_vop_nwhiteout,
	113	.vop_ioctl = hammer_vop_ioctl,
	114	.vop_mountctl = hammer_vop_mountctl,
	115	.vop_kqfilter = hammer_vop_kqfilter
	116	};
	117
	118	struct vop_ops hammer_spec_vops = {
	119	.vop_default = vop_defaultop,
	120	.vop_fsync = hammer_vop_fsync,
	121	.vop_read = vop_stdnoread,
	122	.vop_write = vop_stdnowrite,
	123	.vop_access = hammer_vop_access,
	124	.vop_close = hammer_vop_close,
	125	.vop_markatime = hammer_vop_markatime,
	126	.vop_getattr = hammer_vop_getattr,
	127	.vop_inactive = hammer_vop_inactive,
	128	.vop_reclaim = hammer_vop_reclaim,
	129	.vop_setattr = hammer_vop_setattr
	130	};
	131
	132	struct vop_ops hammer_fifo_vops = {
	133	.vop_default = fifo_vnoperate,
	134	.vop_fsync = hammer_vop_fsync,
	135	.vop_read = hammer_vop_fiforead,
	136	.vop_write = hammer_vop_fifowrite,
	137	.vop_access = hammer_vop_access,
	138	.vop_close = hammer_vop_fifoclose,
	139	.vop_markatime = hammer_vop_markatime,
	140	.vop_getattr = hammer_vop_getattr,
	141	.vop_inactive = hammer_vop_inactive,
	142	.vop_reclaim = hammer_vop_reclaim,
	143	.vop_setattr = hammer_vop_setattr,
	144	.vop_kqfilter = hammer_vop_fifokqfilter
	145	};
	146
	147	static __inline
	148	void
	149	hammer_knote(struct vnode *vp, int flags)
	150	{
	151	if (flags)
	152	KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
	153	}
	154
	155	static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
	156	struct vnode dvp, struct ucred cred,
	157	int flags, int isdir);
	158	static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
	159	static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
	160
	161	/*
	162	* hammer_vop_fsync { vp, waitfor }
	163	*
	164	* fsync() an inode to disk and wait for it to be completely committed
	165	* such that the information would not be undone if a crash occured after
	166	* return.
	167	*
	168	* NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
	169	* a REDO log. A sysctl is provided to relax HAMMER's fsync()
	170	* operation.
	171	*
	172	* Ultimately the combination of a REDO log and use of fast storage
	173	* to front-end cluster caches will make fsync fast, but it aint
	174	* here yet. And, in anycase, we need real transactional
	175	* all-or-nothing features which are not restricted to a single file.
	176	*/
	177	static
	178	int
	179	hammer_vop_fsync(struct vop_fsync_args *ap)
	180	{
	181	hammer_inode_t ip = VTOI(ap->a_vp);
	182	hammer_mount_t hmp = ip->hmp;
	183	int waitfor = ap->a_waitfor;
	184	int mode;
	185
	186	lwkt_gettoken(&hmp->fs_token);
	187
	188	/*
	189	* Fsync rule relaxation (default is either full synchronous flush
	190	* or REDO semantics with synchronous flush).
	191	*/
	192	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
	193	switch(hammer_fsync_mode) {
	194	case 0:
	195	mode0:
	196	/* no REDO, full synchronous flush */
	197	goto skip;
	198	case 1:
	199	mode1:
	200	/* no REDO, full asynchronous flush */
	201	if (waitfor == MNT_WAIT)
	202	waitfor = MNT_NOWAIT;
	203	goto skip;
	204	case 2:
	205	/* REDO semantics, synchronous flush */
	206	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
	207	goto mode0;
	208	mode = HAMMER_FLUSH_UNDOS_AUTO;
	209	break;
	210	case 3:
	211	/* REDO semantics, relaxed asynchronous flush */
	212	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
	213	goto mode1;
	214	mode = HAMMER_FLUSH_UNDOS_RELAXED;
	215	if (waitfor == MNT_WAIT)
	216	waitfor = MNT_NOWAIT;
	217	break;
	218	case 4:
	219	/* ignore the fsync() system call */
	220	lwkt_reltoken(&hmp->fs_token);
	221	return(0);
	222	default:
	223	/* we have to do something */
	224	mode = HAMMER_FLUSH_UNDOS_RELAXED;
	225	if (waitfor == MNT_WAIT)
	226	waitfor = MNT_NOWAIT;
	227	break;
	228	}
	229
	230	/*
	231	* Fast fsync only needs to flush the UNDO/REDO fifo if
	232	* HAMMER_INODE_REDO is non-zero and the only modifications
	233	* made to the file are write or write-extends.
	234	*/
	235	if ((ip->flags & HAMMER_INODE_REDO) &&
	236	(ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) {
	237	++hammer_count_fsyncs;
	238	hammer_flusher_flush_undos(hmp, mode);
	239	ip->redo_count = 0;
	240	if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
	241	vclrisdirty(ip->vp);
	242	lwkt_reltoken(&hmp->fs_token);
	243	return(0);
	244	}
	245
	246	/*
	247	* REDO is enabled by fsync(), the idea being we really only
	248	* want to lay down REDO records when programs are using
	249	* fsync() heavily. The first fsync() on the file starts
	250	* the gravy train going and later fsync()s keep it hot by
	251	* resetting the redo_count.
	252	*
	253	* We weren't running REDOs before now so we have to fall
	254	* through and do a full fsync of what we have.
	255	*/
	256	if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
	257	(hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
	258	ip->flags \|= HAMMER_INODE_REDO;
	259	ip->redo_count = 0;
	260	}
	261	}
	262	skip:
	263
	264	/*
	265	* Do a full flush sequence.
	266	*
	267	* Attempt to release the vnode while waiting for the inode to
	268	* finish flushing. This can really mess up inactive->reclaim
	269	* sequences so only do it if the vnode is active.
	270	*
	271	* WARNING! The VX lock functions must be used. vn_lock() will
	272	* fail when this is part of a VOP_RECLAIM sequence.
	273	*/
	274	++hammer_count_fsyncs;
	275	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	276	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	277	if (waitfor == MNT_WAIT) {
	278	int dorelock;
	279
	280	if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
	281	vn_unlock(ap->a_vp);
	282	dorelock = 1;
	283	} else {
	284	dorelock = 0;
	285	}
	286	hammer_wait_inode(ip);
	287	if (dorelock)
	288	vn_relock(ap->a_vp, LK_EXCLUSIVE);
	289	}
	290	if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
	291	vclrisdirty(ip->vp);
	292	lwkt_reltoken(&hmp->fs_token);
	293	return (ip->error);
	294	}
	295
	296	/*
	297	* hammer_vop_read { vp, uio, ioflag, cred }
	298	*
	299	* MPSAFE (for the cache safe does not require fs_token)
	300	*/
	301	static
	302	int
	303	hammer_vop_read(struct vop_read_args *ap)
	304	{
	305	struct hammer_transaction trans;
	306	hammer_inode_t ip;
	307	hammer_mount_t hmp;
	308	off_t offset;
	309	struct buf *bp;
	310	struct uio *uio;
	311	int error;
	312	int n;
	313	int seqcount;
	314	int ioseqcount;
	315	int blksize;
	316	int bigread;
	317	int got_trans;
	318	size_t resid;
	319
	320	if (ap->a_vp->v_type == VDIR)
	321	return (EISDIR);
	322	if (ap->a_vp->v_type != VREG)
	323	return (EINVAL);
	324	ip = VTOI(ap->a_vp);
	325	hmp = ip->hmp;
	326	error = 0;
	327	got_trans = 0;
	328	uio = ap->a_uio;
	329
	330	/*
	331	* Attempt to shortcut directly to the VM object using lwbufs.
	332	* This is much faster than instantiating buffer cache buffers.
	333	*/
	334	resid = uio->uio_resid;
	335	error = vop_helper_read_shortcut(ap);
	336	hammer_stats_file_read += resid - uio->uio_resid;
	337	if (error)
	338	return (error);
	339	if (uio->uio_resid == 0)
	340	goto finished;
	341
	342	/*
	343	* Allow the UIO's size to override the sequential heuristic.
	344	*/
	345	blksize = hammer_blocksize(uio->uio_offset);
	346	seqcount = howmany(uio->uio_resid, MAXBSIZE);
	347	ioseqcount = ap->a_ioflag >> IO_SEQSHIFT;
	348	if (seqcount < ioseqcount)
	349	seqcount = ioseqcount;
	350
	351	/*
	352	* If reading or writing a huge amount of data we have to break
	353	* atomicy and allow the operation to be interrupted by a signal
	354	* or it can DOS the machine.
	355	*/
	356	bigread = (uio->uio_resid > 100 * 1024 * 1024);
	357
	358	/*
	359	* Access the data typically in HAMMER_BUFSIZE blocks via the
	360	* buffer cache, but HAMMER may use a variable block size based
	361	* on the offset.
	362	*
	363	* XXX Temporary hack, delay the start transaction while we remain
	364	* MPSAFE. NOTE: ino_data.size cannot change while vnode is
	365	* locked-shared.
	366	*/
	367	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
	368	int64_t base_offset;
	369	int64_t file_limit;
	370
	371	blksize = hammer_blocksize(uio->uio_offset);
	372	offset = (int)uio->uio_offset & (blksize - 1);
	373	base_offset = uio->uio_offset - offset;
	374
	375	if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
	376	break;
	377
	378	/*
	379	* MPSAFE
	380	*/
	381	bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
	382	if ((bp->b_flags & (B_INVAL \| B_CACHE \| B_RAM)) == B_CACHE) {
	383	bp->b_flags &= ~B_AGE;
	384	error = 0;
	385	goto skip;
	386	}
	387	if (ap->a_ioflag & IO_NRDELAY) {
	388	bqrelse(bp);
	389	return (EWOULDBLOCK);
	390	}
	391
	392	/*
	393	* MPUNSAFE
	394	*/
	395	if (got_trans == 0) {
	396	hammer_start_transaction(&trans, ip->hmp);
	397	got_trans = 1;
	398	}
	399
	400	/*
	401	* NOTE: A valid bp has already been acquired, but was not
	402	* B_CACHE.
	403	*/
	404	if (hammer_cluster_enable) {
	405	/*
	406	* Use file_limit to prevent cluster_read() from
	407	* creating buffers of the wrong block size past
	408	* the demarc.
	409	*/
	410	file_limit = ip->ino_data.size;
	411	if (base_offset < HAMMER_XDEMARC &&
	412	file_limit > HAMMER_XDEMARC) {
	413	file_limit = HAMMER_XDEMARC;
	414	}
	415	error = cluster_readx(ap->a_vp,
	416	file_limit, base_offset,
	417	blksize, B_NOTMETA,
	418	uio->uio_resid,
	419	seqcount * MAXBSIZE,
	420	&bp);
	421	} else {
	422	error = breadnx(ap->a_vp, base_offset,
	423	blksize, B_NOTMETA,
	424	NULL, NULL, 0, &bp);
	425	}
	426	if (error) {
	427	brelse(bp);
	428	break;
	429	}
	430	skip:
	431	if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IOISSUED)) {
	432	hdkprintf("zone2_offset %016jx read file %016jx@%016jx\n",
	433	(intmax_t)bp->b_bio2.bio_offset,
	434	(intmax_t)ip->obj_id,
	435	(intmax_t)bp->b_loffset);
	436	}
	437	bp->b_flags &= ~B_IOISSUED;
	438	if (blksize == HAMMER_XBUFSIZE)
	439	bp->b_flags \|= B_CLUSTEROK;
	440
	441	n = blksize - offset;
	442	if (n > uio->uio_resid)
	443	n = uio->uio_resid;
	444	if (n > ip->ino_data.size - uio->uio_offset)
	445	n = (int)(ip->ino_data.size - uio->uio_offset);
	446
	447	/*
	448	* Set B_AGE, data has a lower priority than meta-data.
	449	*
	450	* Use a hold/unlock/drop sequence to run the uiomove
	451	* with the buffer unlocked, avoiding deadlocks against
	452	* read()s on mmap()'d spaces.
	453	*/
	454	bp->b_flags \|= B_AGE;
	455	error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
	456	bqrelse(bp);
	457
	458	if (error)
	459	break;
	460	hammer_stats_file_read += n;
	461	}
	462
	463	finished:
	464
	465	/*
	466	* Try to update the atime with just the inode lock for maximum
	467	* concurrency. If we can't shortcut it we have to get the full
	468	* blown transaction.
	469	*/
	470	if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
	471	hammer_start_transaction(&trans, ip->hmp);
	472	got_trans = 1;
	473	}
	474
	475	if (got_trans) {
	476	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	477	(ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
	478	lwkt_gettoken(&hmp->fs_token);
	479	ip->ino_data.atime = trans.time;
	480	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	481	hammer_done_transaction(&trans);
	482	lwkt_reltoken(&hmp->fs_token);
	483	} else {
	484	hammer_done_transaction(&trans);
	485	}
	486	}
	487	return (error);
	488	}
	489
	490	/*
	491	* hammer_vop_write { vp, uio, ioflag, cred }
	492	*/
	493	static
	494	int
	495	hammer_vop_write(struct vop_write_args *ap)
	496	{
	497	struct hammer_transaction trans;
	498	hammer_inode_t ip;
	499	hammer_mount_t hmp;
	500	thread_t td;
	501	struct vnode *vp;
	502	struct uio *uio;
	503	int offset;
	504	off_t base_offset;
	505	int64_t cluster_eof;
	506	struct buf *bp;
	507	int kflags;
	508	int error;
	509	int n;
	510	int flags;
	511	int seqcount;
	512	int bigwrite;
	513
	514	vp = ap->a_vp;
	515	if (vp->v_type != VREG)
	516	return (EINVAL);
	517	ip = VTOI(ap->a_vp);
	518	hmp = ip->hmp;
	519	error = 0;
	520	kflags = 0;
	521	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	522
	523	if (ip->flags & HAMMER_INODE_RO)
	524	return (EROFS);
	525
	526	/*
	527	* Create a transaction to cover the operations we perform.
	528	*/
	529	hammer_start_transaction(&trans, hmp);
	530	uio = ap->a_uio;
	531
	532	/*
	533	* Use v_lastwrite_ts if file not open for writing
	534	* (i.e. a late msync)
	535	*/
	536	if (uio->uio_segflg == UIO_NOCOPY) {
	537	if (vp->v_flag & VLASTWRITETS) {
	538	trans.time = vp->v_lastwrite_ts.tv_sec * 1000000 +
	539	vp->v_lastwrite_ts.tv_nsec / 1000;
	540	} else {
	541	trans.time = ip->ino_data.mtime;
	542	}
	543	} else {
	544	vclrflags(vp, VLASTWRITETS);
	545	}
	546
	547	/*
	548	* Check append mode
	549	*/
	550	if (ap->a_ioflag & IO_APPEND)
	551	uio->uio_offset = ip->ino_data.size;
	552
	553	/*
	554	* Check for illegal write offsets. Valid range is 0...2^63-1.
	555	*
	556	* NOTE: the base_off assignment is required to work around what
	557	* I consider to be a GCC-4 optimization bug.
	558	*/
	559	if (uio->uio_offset < 0) {
	560	hammer_done_transaction(&trans);
	561	return (EFBIG);
	562	}
	563	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	564	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
	565	hammer_done_transaction(&trans);
	566	return (EFBIG);
	567	}
	568
	569	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	570	base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
	571	hammer_done_transaction(&trans);
	572	lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
	573	return (EFBIG);
	574	}
	575
	576	/*
	577	* If reading or writing a huge amount of data we have to break
	578	* atomicy and allow the operation to be interrupted by a signal
	579	* or it can DOS the machine.
	580	*
	581	* Preset redo_count so we stop generating REDOs earlier if the
	582	* limit is exceeded.
	583	*
	584	* redo_count is heuristical, SMP races are ok
	585	*/
	586	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	587	if ((ip->flags & HAMMER_INODE_REDO) &&
	588	ip->redo_count < hammer_limit_redo) {
	589	ip->redo_count += uio->uio_resid;
	590	}
	591
	592	/*
	593	* Access the data typically in HAMMER_BUFSIZE blocks via the
	594	* buffer cache, but HAMMER may use a variable block size based
	595	* on the offset.
	596	*/
	597	while (uio->uio_resid > 0) {
	598	int fixsize = 0;
	599	int blksize;
	600	int blkmask;
	601	int trivial;
	602	int endofblk;
	603	off_t nsize;
	604
	605	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
	606	break;
	607	if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
	608	break;
	609
	610	blksize = hammer_blocksize(uio->uio_offset);
	611
	612	/*
	613	* Control the number of pending records associated with
	614	* this inode. If too many have accumulated start a
	615	* flush. Try to maintain a pipeline with the flusher.
	616	*
	617	* NOTE: It is possible for other sources to grow the
	618	* records but not necessarily issue another flush,
	619	* so use a timeout and ensure that a re-flush occurs.
	620	*/
	621	if (ip->rsv_recs >= hammer_limit_inode_recs) {
	622	lwkt_gettoken(&hmp->fs_token);
	623	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	624	while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
	625	ip->flags \|= HAMMER_INODE_RECSW;
	626	tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
	627	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	628	}
	629	lwkt_reltoken(&hmp->fs_token);
	630	}
	631
	632	/*
	633	* Do not allow HAMMER to blow out the buffer cache. Very
	634	* large UIOs can lockout other processes due to bwillwrite()
	635	* mechanics.
	636	*
	637	* The hammer inode is not locked during these operations.
	638	* The vnode is locked which can interfere with the pageout
	639	* daemon for non-UIO_NOCOPY writes but should not interfere
	640	* with the buffer cache. Even so, we cannot afford to
	641	* allow the pageout daemon to build up too many dirty buffer
	642	* cache buffers.
	643	*
	644	* Only call this if we aren't being recursively called from
	645	* a virtual disk device (vn), else we may deadlock.
	646	*/
	647	if ((ap->a_ioflag & IO_RECURSE) == 0)
	648	bwillwrite(blksize);
	649
	650	/*
	651	* Calculate the blocksize at the current offset and figure
	652	* out how much we can actually write.
	653	*/
	654	blkmask = blksize - 1;
	655	offset = (int)uio->uio_offset & blkmask;
	656	base_offset = uio->uio_offset & ~(int64_t)blkmask;
	657	n = blksize - offset;
	658	if (n > uio->uio_resid) {
	659	n = uio->uio_resid;
	660	endofblk = 0;
	661	} else {
	662	endofblk = 1;
	663	}
	664	nsize = uio->uio_offset + n;
	665	if (nsize > ip->ino_data.size) {
	666	if (uio->uio_offset > ip->ino_data.size)
	667	trivial = 0;
	668	else
	669	trivial = NVEXTF_TRIVIAL;
	670	nvextendbuf(ap->a_vp,
	671	ip->ino_data.size,
	672	nsize,
	673	hammer_blocksize(ip->ino_data.size),
	674	hammer_blocksize(nsize),
	675	hammer_blockoff(ip->ino_data.size),
	676	hammer_blockoff(nsize),
	677	trivial);
	678	fixsize = 1;
	679	kflags \|= NOTE_EXTEND;
	680	}
	681
	682	if (uio->uio_segflg == UIO_NOCOPY) {
	683	/*
	684	* Issuing a write with the same data backing the
	685	* buffer. Instantiate the buffer to collect the
	686	* backing vm pages, then read-in any missing bits.
	687	*
	688	* This case is used by vop_stdputpages().
	689	*/
	690	bp = getblk(ap->a_vp, base_offset,
	691	blksize, GETBLK_BHEAVY, 0);
	692	if ((bp->b_flags & B_CACHE) == 0) {
	693	bqrelse(bp);
	694	error = bread(ap->a_vp, base_offset,
	695	blksize, &bp);
	696	}
	697	} else if (offset == 0 && uio->uio_resid >= blksize) {
	698	/*
	699	* Even though we are entirely overwriting the buffer
	700	* we may still have to zero it out to avoid a
	701	* mmap/write visibility issue.
	702	*/
	703	bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
	704	if ((bp->b_flags & B_CACHE) == 0)
	705	vfs_bio_clrbuf(bp);
	706	} else if (base_offset >= ip->ino_data.size) {
	707	/*
	708	* If the base offset of the buffer is beyond the
	709	* file EOF, we don't have to issue a read.
	710	*/
	711	bp = getblk(ap->a_vp, base_offset,
	712	blksize, GETBLK_BHEAVY, 0);
	713	vfs_bio_clrbuf(bp);
	714	} else {
	715	/*
	716	* Partial overwrite, read in any missing bits then
	717	* replace the portion being written.
	718	*/
	719	error = bread(ap->a_vp, base_offset, blksize, &bp);
	720	if (error == 0)
	721	bheavy(bp);
	722	}
	723	if (error == 0)
	724	error = uiomovebp(bp, bp->b_data + offset, n, uio);
	725
	726	lwkt_gettoken(&hmp->fs_token);
	727
	728	/*
	729	* Generate REDO records if enabled and redo_count will not
	730	* exceeded the limit.
	731	*
	732	* If redo_count exceeds the limit we stop generating records
	733	* and clear HAMMER_INODE_REDO. This will cause the next
	734	* fsync() to do a full meta-data sync instead of just an
	735	* UNDO/REDO fifo update.
	736	*
	737	* When clearing HAMMER_INODE_REDO any pre-existing REDOs
	738	* will still be tracked. The tracks will be terminated
	739	* when the related meta-data (including possible data
	740	* modifications which are not tracked via REDO) is
	741	* flushed.
	742	*/
	743	if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
	744	if (ip->redo_count < hammer_limit_redo) {
	745	bp->b_flags \|= B_VFSFLAG1;
	746	error = hammer_generate_redo(&trans, ip,
	747	base_offset + offset,
	748	HAMMER_REDO_WRITE,
	749	bp->b_data + offset,
	750	(size_t)n);
	751	} else {
	752	ip->flags &= ~HAMMER_INODE_REDO;
	753	}
	754	}
	755
	756	/*
	757	* If we screwed up we have to undo any VM size changes we
	758	* made.
	759	*/
	760	if (error) {
	761	brelse(bp);
	762	if (fixsize) {
	763	nvtruncbuf(ap->a_vp, ip->ino_data.size,
	764	hammer_blocksize(ip->ino_data.size),
	765	hammer_blockoff(ip->ino_data.size),
	766	0);
	767	}
	768	lwkt_reltoken(&hmp->fs_token);
	769	break;
	770	}
	771	kflags \|= NOTE_WRITE;
	772	hammer_stats_file_write += n;
	773	if (blksize == HAMMER_XBUFSIZE)
	774	bp->b_flags \|= B_CLUSTEROK;
	775	if (ip->ino_data.size < uio->uio_offset) {
	776	ip->ino_data.size = uio->uio_offset;
	777	flags = HAMMER_INODE_SDIRTY;
	778	} else {
	779	flags = 0;
	780	}
	781	ip->ino_data.mtime = trans.time;
	782	flags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_BUFS;
	783	hammer_modify_inode(&trans, ip, flags);
	784
	785	/*
	786	* Once we dirty the buffer any cached zone-X offset
	787	* becomes invalid. HAMMER NOTE: no-history mode cannot
	788	* allow overwriting over the same data sector unless
	789	* we provide UNDOs for the old data, which we don't.
	790	*/
	791	bp->b_bio2.bio_offset = NOOFFSET;
	792
	793	lwkt_reltoken(&hmp->fs_token);
	794
	795	/*
	796	* Final buffer disposition.
	797	*
	798	* Because meta-data updates are deferred, HAMMER is
	799	* especially sensitive to excessive bdwrite()s because
	800	* the I/O stream is not broken up by disk reads. So the
	801	* buffer cache simply cannot keep up.
	802	*
	803	* WARNING! blksize is variable. cluster_write() is
	804	* expected to not blow up if it encounters
	805	* buffers that do not match the passed blksize.
	806	*
	807	* NOTE! Hammer shouldn't need to bawrite()/cluster_write().
	808	* The ip->rsv_recs check should burst-flush the data.
	809	* If we queue it immediately the buf could be left
	810	* locked on the device queue for a very long time.
	811	*
	812	* However, failing to flush a dirty buffer out when
	813	* issued from the pageout daemon can result in a low
	814	* memory deadlock against bio_page_alloc(), so we
	815	* have to bawrite() on IO_ASYNC as well.
	816	*
	817	* NOTE! To avoid degenerate stalls due to mismatched block
	818	* sizes we only honor IO_DIRECT on the write which
	819	* abuts the end of the buffer. However, we must
	820	* honor IO_SYNC in case someone is silly enough to
	821	* configure a HAMMER file as swap, or when HAMMER
	822	* is serving NFS (for commits). Ick ick.
	823	*/
	824	bp->b_flags \|= B_AGE;
	825	if (blksize == HAMMER_XBUFSIZE)
	826	bp->b_flags \|= B_CLUSTEROK;
	827
	828	if (ap->a_ioflag & IO_SYNC) {
	829	bwrite(bp);
	830	} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
	831	bawrite(bp);
	832	} else if (ap->a_ioflag & IO_ASYNC) {
	833	bawrite(bp);
	834	} else if (hammer_cluster_enable &&
	835	!(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
	836	if (base_offset < HAMMER_XDEMARC)
	837	cluster_eof = hammer_blockdemarc(base_offset,
	838	ip->ino_data.size);
	839	else
	840	cluster_eof = ip->ino_data.size;
	841	cluster_write(bp, cluster_eof, blksize, seqcount);
	842	} else {
	843	bdwrite(bp);
	844	}
	845	}
	846	hammer_done_transaction(&trans);
	847	hammer_knote(ap->a_vp, kflags);
	848
	849	return (error);
	850	}
	851
	852	/*
	853	* hammer_vop_access { vp, mode, cred }
	854	*
	855	* MPSAFE - does not require fs_token
	856	*/
	857	static
	858	int
	859	hammer_vop_access(struct vop_access_args *ap)
	860	{
	861	hammer_inode_t ip = VTOI(ap->a_vp);
	862	uid_t uid;
	863	gid_t gid;
	864	int error;
	865
	866	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	867	gid = hammer_to_unix_xid(&ip->ino_data.gid);
	868
	869	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
	870	ip->ino_data.uflags);
	871	return (error);
	872	}
	873
	874	/*
	875	* hammer_vop_advlock { vp, id, op, fl, flags }
	876	*
	877	* MPSAFE - does not require fs_token
	878	*/
	879	static
	880	int
	881	hammer_vop_advlock(struct vop_advlock_args *ap)
	882	{
	883	hammer_inode_t ip = VTOI(ap->a_vp);
	884
	885	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
	886	}
	887
	/*
	 * hammer_vop_close { vp, fflag }
	 *
	 * Close handler.  With the sync-on-close code compiled out this simply
	 * returns the result of vop_stdclose().
	 *
	 * We can only sync-on-close for normal closes.  XXX disabled for now.
	 */
	static
	int
	hammer_vop_close(struct vop_close_args *ap)
	{
	#if 0
		/*
		 * Disabled sync-on-close logic, kept for reference.
		 *
		 * NOTE(review): the VOP_FSYNC() call below passes MNT_NOWAIT as
		 * its second argument and waitfor as its third; confirm the
		 * intended argument order before re-enabling this block.
		 */
		struct vnode *vp = ap->a_vp;
		hammer_inode_t ip = VTOI(vp);
		int waitfor;
		if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
			if (vn_islocked(vp) == LK_EXCLUSIVE &&
			    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
				if (ip->flags & HAMMER_INODE_CLOSESYNC)
					waitfor = MNT_WAIT;
				else
					waitfor = MNT_NOWAIT;
				ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
					       HAMMER_INODE_CLOSEASYNC);
				VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
			}
		}
	#endif
		return (vop_stdclose(ap));
	}
	916
	917	/*
	918	* hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
	919	*
	920	* The operating system has already ensured that the directory entry
	921	* does not exist and done all appropriate namespace locking.
	922	*/
	923	static
	924	int
	925	hammer_vop_ncreate(struct vop_ncreate_args *ap)
	926	{
	927	struct hammer_transaction trans;
	928	hammer_inode_t dip;
	929	hammer_inode_t nip;
	930	struct nchandle *nch;
	931	hammer_mount_t hmp;
	932	int error;
	933
	934	nch = ap->a_nch;
	935	dip = VTOI(ap->a_dvp);
	936	hmp = dip->hmp;
	937
	938	if (dip->flags & HAMMER_INODE_RO)
	939	return (EROFS);
	940	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	941	return (error);
	942
	943	/*
	944	* Create a transaction to cover the operations we perform.
	945	*/
	946	lwkt_gettoken(&hmp->fs_token);
	947	hammer_start_transaction(&trans, hmp);
	948
	949	/*
	950	* Create a new filesystem object of the requested type. The
	951	* returned inode will be referenced and shared-locked to prevent
	952	* it from being moved to the flusher.
	953	*/
	954	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	955	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	956	NULL, &nip);
	957	if (error) {
	958	hkprintf("hammer_create_inode error %d\n", error);
	959	hammer_done_transaction(&trans);
	960	*ap->a_vpp = NULL;
	961	lwkt_reltoken(&hmp->fs_token);
	962	return (error);
	963	}
	964
	965	/*
	966	* Add the new filesystem object to the directory. This will also
	967	* bump the inode's link count.
	968	*/
	969	error = hammer_ip_add_direntry(&trans, dip,
	970	nch->ncp->nc_name, nch->ncp->nc_nlen,
	971	nip);
	972	if (error)
	973	hkprintf("hammer_ip_add_direntry error %d\n", error);
	974
	975	/*
	976	* Finish up.
	977	*/
	978	if (error) {
	979	hammer_rel_inode(nip, 0);
	980	hammer_done_transaction(&trans);
	981	*ap->a_vpp = NULL;
	982	} else {
	983	error = hammer_get_vnode(nip, ap->a_vpp);
	984	hammer_done_transaction(&trans);
	985	hammer_rel_inode(nip, 0);
	986	if (error == 0) {
	987	cache_setunresolved(ap->a_nch);
	988	cache_setvp(ap->a_nch, *ap->a_vpp);
	989	}
	990	hammer_knote(ap->a_dvp, NOTE_WRITE);
	991	}
	992	lwkt_reltoken(&hmp->fs_token);
	993	return (error);
	994	}
	995
	996	/*
	997	* hammer_vop_getattr { vp, vap }
	998	*
	999	* Retrieve an inode's attribute information. When accessing inodes
	1000	* historically we fake the atime field to ensure consistent results.
	1001	* The atime field is stored in the B-Tree element and allowed to be
	1002	* updated without cycling the element.
	1003	*
	1004	* MPSAFE - does not require fs_token
	1005	*/
	1006	static
	1007	int
	1008	hammer_vop_getattr(struct vop_getattr_args *ap)
	1009	{
	1010	hammer_inode_t ip = VTOI(ap->a_vp);
	1011	struct vattr *vap = ap->a_vap;
	1012
	1013	/*
	1014	* We want the fsid to be different when accessing a filesystem
	1015	* with different as-of's so programs like diff don't think
	1016	* the files are the same.
	1017	*
	1018	* We also want the fsid to be the same when comparing snapshots,
	1019	* or when comparing mirrors (which might be backed by different
	1020	* physical devices). HAMMER fsids are based on the PFS's
	1021	* shared_uuid field.
	1022	*
	1023	* XXX there is a chance of collision here. The va_fsid reported
	1024	* by stat is different from the more involved fsid used in the
	1025	* mount structure.
	1026	*/
	1027	hammer_lock_sh(&ip->lock);
	1028	vap->va_fsid = ip->pfsm->fsid_udev ^ (uint32_t)ip->obj_asof ^
	1029	(uint32_t)(ip->obj_asof >> 32);
	1030
	1031	vap->va_fileid = ip->ino_leaf.base.obj_id;
	1032	vap->va_mode = ip->ino_data.mode;
	1033	vap->va_nlink = ip->ino_data.nlinks;
	1034	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	1035	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	1036	vap->va_rmajor = 0;
	1037	vap->va_rminor = 0;
	1038	vap->va_size = ip->ino_data.size;
	1039
	1040	/*
	1041	* Special case for @@PFS softlinks. The actual size of the
	1042	* expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	1043	* or for MAX_TID is "@@-1:%05d" == 10 bytes.
	1044	*
	1045	* Note that userspace hammer command does not allow users to
	1046	* create a @@PFS softlink under an existing other PFS (id!=0)
	1047	* so the ip localization here for @@PFS softlink is always 0.
	1048	*/
	1049	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	1050	ip->ino_data.size == 10 &&
	1051	ip->obj_asof == HAMMER_MAX_TID &&
	1052	ip->obj_localization == HAMMER_DEF_LOCALIZATION &&
	1053	strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
	1054	if (hammer_is_pfs_slave(&ip->pfsm->pfsd))
	1055	vap->va_size = 26;
	1056	else
	1057	vap->va_size = 10;
	1058	}
	1059
	1060	/*
	1061	* We must provide a consistent atime and mtime for snapshots
	1062	* so people can do a 'tar cf - ... \| md5' on them and get
	1063	* consistent results.
	1064	*/
	1065	if (ip->flags & HAMMER_INODE_RO) {
	1066	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
	1067	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	1068	} else {
	1069	hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
	1070	hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	1071	}
	1072	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	1073	vap->va_flags = ip->ino_data.uflags;
	1074	vap->va_gen = 1; /* hammer inums are unique for all time */
	1075	vap->va_blocksize = HAMMER_BUFSIZE;
	1076	if (ip->ino_data.size >= HAMMER_XDEMARC) {
	1077	vap->va_bytes = HAMMER_XBUFSIZE64_DOALIGN(ip->ino_data.size);
	1078	} else if (ip->ino_data.size > HAMMER_HBUFSIZE) {
	1079	vap->va_bytes = HAMMER_BUFSIZE64_DOALIGN(ip->ino_data.size);
	1080	} else {
	1081	vap->va_bytes = HAMMER_DATA_DOALIGN(ip->ino_data.size);
	1082	}
	1083
	1084	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	1085	vap->va_filerev = 0; /* XXX */
	1086	vap->va_uid_uuid = ip->ino_data.uid;
	1087	vap->va_gid_uuid = ip->ino_data.gid;
	1088	vap->va_fsid_uuid = ip->hmp->fsid;
	1089	vap->va_vaflags = VA_UID_UUID_VALID \| VA_GID_UUID_VALID \|
	1090	VA_FSID_UUID_VALID;
	1091
	1092	switch (ip->ino_data.obj_type) {
	1093	case HAMMER_OBJTYPE_CDEV:
	1094	case HAMMER_OBJTYPE_BDEV:
	1095	vap->va_rmajor = ip->ino_data.rmajor;
	1096	vap->va_rminor = ip->ino_data.rminor;
	1097	break;
	1098	default:
	1099	break;
	1100	}
	1101	hammer_unlock(&ip->lock);
	1102	return(0);
	1103	}
	1104
	1105	/*
	1106	* hammer_vop_nresolve { nch, dvp, cred }
	1107	*
	1108	* Locate the requested directory entry.
	1109	*/
	1110	static
	1111	int
	1112	hammer_vop_nresolve(struct vop_nresolve_args *ap)
	1113	{
	1114	struct hammer_transaction trans;
	1115	struct namecache *ncp;
	1116	hammer_mount_t hmp;
	1117	hammer_inode_t dip;
	1118	hammer_inode_t ip;
	1119	hammer_tid_t asof;
	1120	struct hammer_cursor cursor;
	1121	struct vnode *vp;
	1122	int64_t namekey;
	1123	int error;
	1124	int i;
	1125	int nlen;
	1126	int flags;
	1127	int ispfs;
	1128	int64_t obj_id;
	1129	uint32_t localization;
	1130	uint32_t max_iterations;
	1131
	1132	/*
	1133	* Misc initialization, plus handle as-of name extensions. Look for
	1134	* the '@@' extension. Note that as-of files and directories cannot
	1135	* be modified.
	1136	*/
	1137	dip = VTOI(ap->a_dvp);
	1138	ncp = ap->a_nch->ncp;
	1139	asof = dip->obj_asof;
	1140	localization = dip->obj_localization; /* for code consistency */
	1141	nlen = ncp->nc_nlen;
	1142	flags = dip->flags & HAMMER_INODE_RO;
	1143	ispfs = 0;
	1144	hmp = dip->hmp;
	1145
	1146	lwkt_gettoken(&hmp->fs_token);
	1147	hammer_simple_transaction(&trans, hmp);
	1148
	1149	for (i = 0; i < nlen; ++i) {
	1150	if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
	1151	error = hammer_str_to_tid(ncp->nc_name + i + 2,
	1152	&ispfs, &asof, &localization);
	1153	if (error != 0) {
	1154	i = nlen;
	1155	break;
	1156	}
	1157	if (asof != HAMMER_MAX_TID)
	1158	flags \|= HAMMER_INODE_RO;
	1159	break;
	1160	}
	1161	}
	1162	nlen = i;
	1163
	1164	/*
	1165	* If this is a PFS we dive into the PFS root inode
	1166	*/
	1167	if (ispfs && nlen == 0) {
	1168	ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
	1169	asof, localization,
	1170	flags, &error);
	1171	if (error == 0) {
	1172	error = hammer_get_vnode(ip, &vp);
	1173	hammer_rel_inode(ip, 0);
	1174	} else {
	1175	vp = NULL;
	1176	}
	1177	if (error == 0) {
	1178	vn_unlock(vp);
	1179	cache_setvp(ap->a_nch, vp);
	1180	vrele(vp);
	1181	}
	1182	goto done;
	1183	}
	1184
	1185	/*
	1186	* If there is no path component the time extension is relative to dip.
	1187	* e.g. "fubar/@@<snapshot>"
	1188	*
	1189	* "." is handled by the kernel, but ".@@<snapshot>" is not.
	1190	* e.g. "fubar/.@@<snapshot>"
	1191	*
	1192	* ".." is handled by the kernel. We do not currently handle
	1193	* "..@<snapshot>".
	1194	*/
	1195	if (nlen == 0 \|\| (nlen == 1 && ncp->nc_name[0] == '.')) {
	1196	ip = hammer_get_inode(&trans, dip, dip->obj_id,
	1197	asof, dip->obj_localization,
	1198	flags, &error);
	1199	if (error == 0) {
	1200	error = hammer_get_vnode(ip, &vp);
	1201	hammer_rel_inode(ip, 0);
	1202	} else {
	1203	vp = NULL;
	1204	}
	1205	if (error == 0) {
	1206	vn_unlock(vp);
	1207	cache_setvp(ap->a_nch, vp);
	1208	vrele(vp);
	1209	}
	1210	goto done;
	1211	}
	1212
	1213	/*
	1214	* Calculate the namekey and setup the key range for the scan. This
	1215	* works kinda like a chained hash table where the lower 32 bits
	1216	* of the namekey synthesize the chain.
	1217	*
	1218	* The key range is inclusive of both key_beg and key_end.
	1219	*/
	1220	namekey = hammer_direntry_namekey(dip, ncp->nc_name, nlen,
	1221	&max_iterations);
	1222
	1223	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	1224	cursor.key_beg.localization = dip->obj_localization \|
	1225	hammer_dir_localization(dip);
	1226	cursor.key_beg.obj_id = dip->obj_id;
	1227	cursor.key_beg.key = namekey;
	1228	cursor.key_beg.create_tid = 0;
	1229	cursor.key_beg.delete_tid = 0;
	1230	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	1231	cursor.key_beg.obj_type = 0;
	1232
	1233	cursor.key_end = cursor.key_beg;
	1234	cursor.key_end.key += max_iterations;
	1235	cursor.asof = asof;
	1236	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	1237
	1238	/*
	1239	* Scan all matching records (the chain), locate the one matching
	1240	* the requested path component.
	1241	*
	1242	* The hammer_ip_*() functions merge in-memory records with on-disk
	1243	* records for the purposes of the search.
	1244	*/
	1245	obj_id = 0;
	1246	localization = HAMMER_DEF_LOCALIZATION;
	1247
	1248	if (error == 0) {
	1249	error = hammer_ip_first(&cursor);
	1250	while (error == 0) {
	1251	error = hammer_ip_resolve_data(&cursor);
	1252	if (error)
	1253	break;
	1254	if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
	1255	bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	1256	obj_id = cursor.data->entry.obj_id;
	1257	localization = cursor.data->entry.localization;
	1258	break;
	1259	}
	1260	error = hammer_ip_next(&cursor);
	1261	}
	1262	}
	1263	hammer_done_cursor(&cursor);
	1264
	1265	/*
	1266	* Lookup the obj_id. This should always succeed. If it does not
	1267	* the filesystem may be damaged and we return a dummy inode.
	1268	*/
	1269	if (error == 0) {
	1270	ip = hammer_get_inode(&trans, dip, obj_id,
	1271	asof, localization,
	1272	flags, &error);
	1273	if (error == ENOENT) {
	1274	hkprintf("WARNING: Missing inode for dirent \"%s\"\n"
	1275	"\tobj_id = %016jx, asof=%016jx, lo=%08x\n",
	1276	ncp->nc_name,
	1277	(intmax_t)obj_id, (intmax_t)asof,
	1278	localization);
	1279	error = 0;
	1280	ip = hammer_get_dummy_inode(&trans, dip, obj_id,
	1281	asof, localization,
	1282	flags, &error);
	1283	}
	1284	if (error == 0) {
	1285	error = hammer_get_vnode(ip, &vp);
	1286	hammer_rel_inode(ip, 0);
	1287	} else {
	1288	vp = NULL;
	1289	}
	1290	if (error == 0) {
	1291	vn_unlock(vp);
	1292	cache_setvp(ap->a_nch, vp);
	1293	vrele(vp);
	1294	}
	1295	} else if (error == ENOENT) {
	1296	cache_setvp(ap->a_nch, NULL);
	1297	}
	1298	done:
	1299	hammer_done_transaction(&trans);
	1300	lwkt_reltoken(&hmp->fs_token);
	1301	return (error);
	1302	}
	1303
	1304	/*
	1305	* hammer_vop_nlookupdotdot { dvp, vpp, cred }
	1306	*
	1307	* Locate the parent directory of a directory vnode.
	1308	*
	1309	* dvp is referenced but not locked. *vpp must be returned referenced and
	1310	* locked. A parent_obj_id of 0 indicates that we are at the root.
	1311	*
	1312	* NOTE: as-of sequences are not linked into the directory structure. If
	1313	* we are at the root with a different asof then the mount point, reload
	1314	* the same directory with the mount point's asof. I'm not sure what this
	1315	* will do to NFS. We encode ASOF stamps in NFS file handles so it might not
	1316	* get confused, but it hasn't been tested.
	1317	*/
	1318	static
	1319	int
	1320	hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
	1321	{
	1322	struct hammer_transaction trans;
	1323	hammer_inode_t dip;
	1324	hammer_inode_t ip;
	1325	hammer_mount_t hmp;
	1326	int64_t parent_obj_id;
	1327	uint32_t parent_obj_localization;
	1328	hammer_tid_t asof;
	1329	int error;
	1330
	1331	dip = VTOI(ap->a_dvp);
	1332	asof = dip->obj_asof;
	1333	hmp = dip->hmp;
	1334
	1335	/*
	1336	* Whos are parent? This could be the root of a pseudo-filesystem
	1337	* whos parent is in another localization domain.
	1338	*/
	1339	lwkt_gettoken(&hmp->fs_token);
	1340	parent_obj_id = dip->ino_data.parent_obj_id;
	1341	if (dip->obj_id == HAMMER_OBJID_ROOT)
	1342	parent_obj_localization = HAMMER_DEF_LOCALIZATION;
	1343	else
	1344	parent_obj_localization = dip->obj_localization;
	1345
	1346	/*
	1347	* It's probably a PFS root when dip->ino_data.parent_obj_id is 0.
	1348	*/
	1349	if (parent_obj_id == 0) {
	1350	if (dip->obj_id == HAMMER_OBJID_ROOT &&
	1351	asof != hmp->asof) {
	1352	parent_obj_id = dip->obj_id;
	1353	asof = hmp->asof;
	1354	*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
	1355	ksnprintf(*ap->a_fakename, 19, "0x%016jx",
	1356	(intmax_t)dip->obj_asof);
	1357	} else {
	1358	*ap->a_vpp = NULL;
	1359	lwkt_reltoken(&hmp->fs_token);
	1360	return ENOENT;
	1361	}
	1362	}
	1363
	1364	hammer_simple_transaction(&trans, hmp);
	1365
	1366	ip = hammer_get_inode(&trans, dip, parent_obj_id,
	1367	asof, parent_obj_localization,
	1368	dip->flags, &error);
	1369	if (ip) {
	1370	error = hammer_get_vnode(ip, ap->a_vpp);
	1371	hammer_rel_inode(ip, 0);
	1372	} else {
	1373	*ap->a_vpp = NULL;
	1374	}
	1375	hammer_done_transaction(&trans);
	1376	lwkt_reltoken(&hmp->fs_token);
	1377	return (error);
	1378	}
	1379
	1380	/*
	1381	* hammer_vop_nlink { nch, dvp, vp, cred }
	1382	*/
	1383	static
	1384	int
	1385	hammer_vop_nlink(struct vop_nlink_args *ap)
	1386	{
	1387	struct hammer_transaction trans;
	1388	hammer_inode_t dip;
	1389	hammer_inode_t ip;
	1390	struct nchandle *nch;
	1391	hammer_mount_t hmp;
	1392	int error;
	1393
	1394	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
	1395	return(EXDEV);
	1396
	1397	nch = ap->a_nch;
	1398	dip = VTOI(ap->a_dvp);
	1399	ip = VTOI(ap->a_vp);
	1400	hmp = dip->hmp;
	1401
	1402	if (dip->obj_localization != ip->obj_localization)
	1403	return(EXDEV);
	1404
	1405	if (dip->flags & HAMMER_INODE_RO)
	1406	return (EROFS);
	1407	if (ip->flags & HAMMER_INODE_RO)
	1408	return (EROFS);
	1409	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1410	return (error);
	1411
	1412	/*
	1413	* Create a transaction to cover the operations we perform.
	1414	*/
	1415	lwkt_gettoken(&hmp->fs_token);
	1416	hammer_start_transaction(&trans, hmp);
	1417
	1418	/*
	1419	* Add the filesystem object to the directory. Note that neither
	1420	* dip nor ip are referenced or locked, but their vnodes are
	1421	* referenced. This function will bump the inode's link count.
	1422	*/
	1423	error = hammer_ip_add_direntry(&trans, dip,
	1424	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1425	ip);
	1426
	1427	/*
	1428	* Finish up.
	1429	*/
	1430	if (error == 0) {
	1431	cache_setunresolved(nch);
	1432	cache_setvp(nch, ap->a_vp);
	1433	}
	1434	hammer_done_transaction(&trans);
	1435	hammer_knote(ap->a_vp, NOTE_LINK);
	1436	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1437	lwkt_reltoken(&hmp->fs_token);
	1438	return (error);
	1439	}
	1440
	1441	/*
	1442	* hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
	1443	*
	1444	* The operating system has already ensured that the directory entry
	1445	* does not exist and done all appropriate namespace locking.
	1446	*/
	1447	static
	1448	int
	1449	hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
	1450	{
	1451	struct hammer_transaction trans;
	1452	hammer_inode_t dip;
	1453	hammer_inode_t nip;
	1454	struct nchandle *nch;
	1455	hammer_mount_t hmp;
	1456	int error;
	1457
	1458	nch = ap->a_nch;
	1459	dip = VTOI(ap->a_dvp);
	1460	hmp = dip->hmp;
	1461
	1462	if (dip->flags & HAMMER_INODE_RO)
	1463	return (EROFS);
	1464	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1465	return (error);
	1466
	1467	/*
	1468	* Create a transaction to cover the operations we perform.
	1469	*/
	1470	lwkt_gettoken(&hmp->fs_token);
	1471	hammer_start_transaction(&trans, hmp);
	1472
	1473	/*
	1474	* Create a new filesystem object of the requested type. The
	1475	* returned inode will be referenced but not locked.
	1476	*/
	1477	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	1478	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	1479	NULL, &nip);
	1480	if (error) {
	1481	hammer_done_transaction(&trans);
	1482	*ap->a_vpp = NULL;
	1483	lwkt_reltoken(&hmp->fs_token);
	1484	return (error);
	1485	}
	1486	/*
	1487	* Add the new filesystem object to the directory. This will also
	1488	* bump the inode's link count.
	1489	*/
	1490	error = hammer_ip_add_direntry(&trans, dip,
	1491	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1492	nip);
	1493	if (error)
	1494	hkprintf("hammer_mkdir (add) error %d\n", error);
	1495
	1496	/*
	1497	* Finish up.
	1498	*/
	1499	if (error) {
	1500	hammer_rel_inode(nip, 0);
	1501	*ap->a_vpp = NULL;
	1502	} else {
	1503	error = hammer_get_vnode(nip, ap->a_vpp);
	1504	hammer_rel_inode(nip, 0);
	1505	if (error == 0) {
	1506	cache_setunresolved(ap->a_nch);
	1507	cache_setvp(ap->a_nch, *ap->a_vpp);
	1508	}
	1509	}
	1510	hammer_done_transaction(&trans);
	1511	if (error == 0)
	1512	hammer_knote(ap->a_dvp, NOTE_WRITE \| NOTE_LINK);
	1513	lwkt_reltoken(&hmp->fs_token);
	1514	return (error);
	1515	}
	1516
	1517	/*
	1518	* hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
	1519	*
	1520	* The operating system has already ensured that the directory entry
	1521	* does not exist and done all appropriate namespace locking.
	1522	*/
	1523	static
	1524	int
	1525	hammer_vop_nmknod(struct vop_nmknod_args *ap)
	1526	{
	1527	struct hammer_transaction trans;
	1528	hammer_inode_t dip;
	1529	hammer_inode_t nip;
	1530	struct nchandle *nch;
	1531	hammer_mount_t hmp;
	1532	int error;
	1533
	1534	nch = ap->a_nch;
	1535	dip = VTOI(ap->a_dvp);
	1536	hmp = dip->hmp;
	1537
	1538	if (dip->flags & HAMMER_INODE_RO)
	1539	return (EROFS);
	1540	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1541	return (error);
	1542
	1543	/*
	1544	* Create a transaction to cover the operations we perform.
	1545	*/
	1546	lwkt_gettoken(&hmp->fs_token);
	1547	hammer_start_transaction(&trans, hmp);
	1548
	1549	/*
	1550	* Create a new filesystem object of the requested type. The
	1551	* returned inode will be referenced but not locked.
	1552	*
	1553	* If mknod specifies a directory a pseudo-fs is created.
	1554	*/
	1555	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	1556	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	1557	NULL, &nip);
	1558	if (error) {
	1559	hammer_done_transaction(&trans);
	1560	*ap->a_vpp = NULL;
	1561	lwkt_reltoken(&hmp->fs_token);
	1562	return (error);
	1563	}
	1564
	1565	/*
	1566	* Add the new filesystem object to the directory. This will also
	1567	* bump the inode's link count.
	1568	*/
	1569	error = hammer_ip_add_direntry(&trans, dip,
	1570	nch->ncp->nc_name, nch->ncp->nc_nlen,
	1571	nip);
	1572
	1573	/*
	1574	* Finish up.
	1575	*/
	1576	if (error) {
	1577	hammer_rel_inode(nip, 0);
	1578	*ap->a_vpp = NULL;
	1579	} else {
	1580	error = hammer_get_vnode(nip, ap->a_vpp);
	1581	hammer_rel_inode(nip, 0);
	1582	if (error == 0) {
	1583	cache_setunresolved(ap->a_nch);
	1584	cache_setvp(ap->a_nch, *ap->a_vpp);
	1585	}
	1586	}
	1587	hammer_done_transaction(&trans);
	1588	if (error == 0)
	1589	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1590	lwkt_reltoken(&hmp->fs_token);
	1591	return (error);
	1592	}
	1593
	1594	/*
	1595	* hammer_vop_open { vp, mode, cred, fp }
	1596	*
	1597	* MPSAFE (does not require fs_token)
	1598	*/
	1599	static
	1600	int
	1601	hammer_vop_open(struct vop_open_args *ap)
	1602	{
	1603	hammer_inode_t ip;
	1604
	1605	ip = VTOI(ap->a_vp);
	1606
	1607	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
	1608	return (EROFS);
	1609	return(vop_stdopen(ap));
	1610	}
	1611
	1612	/*
	1613	* hammer_vop_print { vp }
	1614	*/
	1615	static
	1616	int
	1617	hammer_vop_print(struct vop_print_args *ap)
	1618	{
	1619	return EOPNOTSUPP;
	1620	}
	1621
	1622	/*
	1623	* hammer_vop_readdir { vp, uio, cred, eofflag, ncookies, off_t **cookies }
	1624	*/
	1625	static
	1626	int
	1627	hammer_vop_readdir(struct vop_readdir_args *ap)
	1628	{
	1629	struct hammer_transaction trans;
	1630	struct hammer_cursor cursor;
	1631	hammer_inode_t ip;
	1632	hammer_mount_t hmp;
	1633	struct uio *uio;
	1634	hammer_base_elm_t base;
	1635	int error;
	1636	int cookie_index;
	1637	int ncookies;
	1638	off_t *cookies;
	1639	off_t saveoff;
	1640	int r;
	1641	int dtype;
	1642
	1643	ip = VTOI(ap->a_vp);
	1644	uio = ap->a_uio;
	1645	saveoff = uio->uio_offset;
	1646	hmp = ip->hmp;
	1647
	1648	if (ap->a_ncookies) {
	1649	ncookies = uio->uio_resid / 16 + 1;
	1650	if (ncookies > 1024)
	1651	ncookies = 1024;
	1652	cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
	1653	cookie_index = 0;
	1654	} else {
	1655	ncookies = -1;
	1656	cookies = NULL;
	1657	cookie_index = 0;
	1658	}
	1659
	1660	lwkt_gettoken(&hmp->fs_token);
	1661	hammer_simple_transaction(&trans, hmp);
	1662
	1663	/*
	1664	* Handle artificial entries
	1665	*
	1666	* It should be noted that the minimum value for a directory
	1667	* hash key on-media is 0x0000000100000000, so we can use anything
	1668	* less then that to represent our 'special' key space.
	1669	*/
	1670	error = 0;
	1671	if (saveoff == 0) {
	1672	r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
	1673	if (r)
	1674	goto done;
	1675	if (cookies)
	1676	cookies[cookie_index] = saveoff;
	1677	++saveoff;
	1678	++cookie_index;
	1679	if (cookie_index == ncookies)
	1680	goto done;
	1681	}
	1682	if (saveoff == 1) {
	1683	if (ip->ino_data.parent_obj_id) {
	1684	r = vop_write_dirent(&error, uio,
	1685	ip->ino_data.parent_obj_id,
	1686	DT_DIR, 2, "..");
	1687	} else {
	1688	r = vop_write_dirent(&error, uio,
	1689	ip->obj_id, DT_DIR, 2, "..");
	1690	}
	1691	if (r)
	1692	goto done;
	1693	if (cookies)
	1694	cookies[cookie_index] = saveoff;
	1695	++saveoff;
	1696	++cookie_index;
	1697	if (cookie_index == ncookies)
	1698	goto done;
	1699	}
	1700
	1701	/*
	1702	* Key range (begin and end inclusive) to scan. Directory keys
	1703	* directly translate to a 64 bit 'seek' position.
	1704	*/
	1705	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	1706	cursor.key_beg.localization = ip->obj_localization \|
	1707	hammer_dir_localization(ip);
	1708	cursor.key_beg.obj_id = ip->obj_id;
	1709	cursor.key_beg.create_tid = 0;
	1710	cursor.key_beg.delete_tid = 0;
	1711	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	1712	cursor.key_beg.obj_type = 0;
	1713	cursor.key_beg.key = saveoff;
	1714
	1715	cursor.key_end = cursor.key_beg;
	1716	cursor.key_end.key = HAMMER_MAX_KEY;
	1717	cursor.asof = ip->obj_asof;
	1718	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	1719
	1720	error = hammer_ip_first(&cursor);
	1721
	1722	while (error == 0) {
	1723	error = hammer_ip_resolve_data(&cursor);
	1724	if (error)
	1725	break;
	1726	base = &cursor.leaf->base;
	1727	saveoff = base->key;
	1728	KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
	1729
	1730	if (base->obj_id != ip->obj_id)
	1731	hpanic("bad record at %p", cursor.node);
	1732
	1733	dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
	1734	r = vop_write_dirent(
	1735	&error, uio, cursor.data->entry.obj_id,
	1736	dtype,
	1737	cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
	1738	(void *)cursor.data->entry.name);
	1739	if (r)
	1740	break;
	1741	++saveoff;
	1742	if (cookies)
	1743	cookies[cookie_index] = base->key;
	1744	++cookie_index;
	1745	if (cookie_index == ncookies)
	1746	break;
	1747	error = hammer_ip_next(&cursor);
	1748	}
	1749	hammer_done_cursor(&cursor);
	1750
	1751	done:
	1752	hammer_done_transaction(&trans);
	1753
	1754	if (ap->a_eofflag)
	1755	*ap->a_eofflag = (error == ENOENT);
	1756	uio->uio_offset = saveoff;
	1757	if (error && cookie_index == 0) {
	1758	if (error == ENOENT)
	1759	error = 0;
	1760	if (cookies) {
	1761	kfree(cookies, M_TEMP);
	1762	*ap->a_ncookies = 0;
	1763	*ap->a_cookies = NULL;
	1764	}
	1765	} else {
	1766	if (error == ENOENT)
	1767	error = 0;
	1768	if (cookies) {
	1769	*ap->a_ncookies = cookie_index;
	1770	*ap->a_cookies = cookies;
	1771	}
	1772	}
	1773	lwkt_reltoken(&hmp->fs_token);
	1774	return(error);
	1775	}
	1776
	1777	/*
	1778	* hammer_vop_readlink { vp, uio, cred }
	1779	*/
	1780	static
	1781	int
	1782	hammer_vop_readlink(struct vop_readlink_args *ap)
	1783	{
	1784	struct hammer_transaction trans;
	1785	struct hammer_cursor cursor;
	1786	hammer_inode_t ip;
	1787	hammer_mount_t hmp;
	1788	char buf[32];
	1789	uint32_t localization;
	1790	hammer_pseudofs_inmem_t pfsm;
	1791	int error;
	1792
	1793	ip = VTOI(ap->a_vp);
	1794	hmp = ip->hmp;
	1795
	1796	lwkt_gettoken(&hmp->fs_token);
	1797
	1798	/*
	1799	* Shortcut if the symlink data was stuffed into ino_data.
	1800	*
	1801	* Also expand special "@@PFS%05d" softlinks (expansion only
	1802	* occurs for non-historical (current) accesses made from the
	1803	* primary filesystem).
	1804	*
	1805	* Note that userspace hammer command does not allow users to
	1806	* create a @@PFS softlink under an existing other PFS (id!=0)
	1807	* so the ip localization here for @@PFS softlink is always 0.
	1808	*/
	1809	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
	1810	char *ptr;
	1811	int bytes;
	1812
	1813	ptr = ip->ino_data.ext.symlink;
	1814	bytes = (int)ip->ino_data.size;
	1815	if (bytes == 10 &&
	1816	ip->obj_asof == HAMMER_MAX_TID &&
	1817	ip->obj_localization == HAMMER_DEF_LOCALIZATION &&
	1818	strncmp(ptr, "@@PFS", 5) == 0) {
	1819	hammer_simple_transaction(&trans, hmp);
	1820	bcopy(ptr + 5, buf, 5);
	1821	buf[5] = 0;
	1822	localization = pfs_to_lo(strtoul(buf, NULL, 10));
	1823	pfsm = hammer_load_pseudofs(&trans, localization,
	1824	&error);
	1825	if (error == 0) {
	1826	if (hammer_is_pfs_slave(&pfsm->pfsd)) {
	1827	/* vap->va_size == 26 */
	1828	ksnprintf(buf, sizeof(buf),
	1829	"@@0x%016jx:%05d",
	1830	(intmax_t)pfsm->pfsd.sync_end_tid,
	1831	lo_to_pfs(localization));
	1832	} else {
	1833	/* vap->va_size == 10 */
	1834	ksnprintf(buf, sizeof(buf),
	1835	"@@-1:%05d",
	1836	lo_to_pfs(localization));
	1837	}
	1838	ptr = buf;
	1839	bytes = strlen(buf);
	1840	}
	1841	if (pfsm)
	1842	hammer_rel_pseudofs(hmp, pfsm);
	1843	hammer_done_transaction(&trans);
	1844	}
	1845	error = uiomove(ptr, bytes, ap->a_uio);
	1846	lwkt_reltoken(&hmp->fs_token);
	1847	return(error);
	1848	}
	1849
	1850	/*
	1851	* Long version
	1852	*/
	1853	hammer_simple_transaction(&trans, hmp);
	1854	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	1855
	1856	/*
	1857	* Key range (begin and end inclusive) to scan. Directory keys
	1858	* directly translate to a 64 bit 'seek' position.
	1859	*/
	1860	cursor.key_beg.localization = ip->obj_localization \|
	1861	HAMMER_LOCALIZE_MISC;
	1862	cursor.key_beg.obj_id = ip->obj_id;
	1863	cursor.key_beg.create_tid = 0;
	1864	cursor.key_beg.delete_tid = 0;
	1865	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	1866	cursor.key_beg.obj_type = 0;
	1867	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	1868	cursor.asof = ip->obj_asof;
	1869	cursor.flags \|= HAMMER_CURSOR_ASOF;
	1870
	1871	error = hammer_ip_lookup(&cursor);
	1872	if (error == 0) {
	1873	error = hammer_ip_resolve_data(&cursor);
	1874	if (error == 0) {
	1875	KKASSERT(cursor.leaf->data_len >=
	1876	HAMMER_SYMLINK_NAME_OFF);
	1877	error = uiomove(cursor.data->symlink.name,
	1878	cursor.leaf->data_len -
	1879	HAMMER_SYMLINK_NAME_OFF,
	1880	ap->a_uio);
	1881	}
	1882	}
	1883	hammer_done_cursor(&cursor);
	1884	hammer_done_transaction(&trans);
	1885	lwkt_reltoken(&hmp->fs_token);
	1886	return(error);
	1887	}
	1888
	1889	/*
	1890	* hammer_vop_nremove { nch, dvp, cred }
	1891	*/
	1892	static
	1893	int
	1894	hammer_vop_nremove(struct vop_nremove_args *ap)
	1895	{
	1896	struct hammer_transaction trans;
	1897	hammer_inode_t dip;
	1898	hammer_mount_t hmp;
	1899	int error;
	1900
	1901	dip = VTOI(ap->a_dvp);
	1902	hmp = dip->hmp;
	1903
	1904	if (hammer_nohistory(dip) == 0 &&
	1905	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	1906	return (error);
	1907	}
	1908
	1909	lwkt_gettoken(&hmp->fs_token);
	1910	hammer_start_transaction(&trans, hmp);
	1911	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	1912	hammer_done_transaction(&trans);
	1913	if (error == 0)
	1914	hammer_knote(ap->a_dvp, NOTE_WRITE);
	1915	lwkt_reltoken(&hmp->fs_token);
	1916	return (error);
	1917	}
	1918
	1919	/*
	1920	* hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
	1921	*/
	1922	static
	1923	int
	1924	hammer_vop_nrename(struct vop_nrename_args *ap)
	1925	{
	1926	struct hammer_transaction trans;
	1927	struct namecache *fncp;
	1928	struct namecache *tncp;
	1929	hammer_inode_t fdip;
	1930	hammer_inode_t tdip;
	1931	hammer_inode_t ip;
	1932	hammer_mount_t hmp;
	1933	struct hammer_cursor cursor;
	1934	int64_t namekey;
	1935	uint32_t max_iterations;
	1936	int nlen, error;
	1937
	1938	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
	1939	return(EXDEV);
	1940	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
	1941	return(EXDEV);
	1942
	1943	fdip = VTOI(ap->a_fdvp);
	1944	tdip = VTOI(ap->a_tdvp);
	1945	fncp = ap->a_fnch->ncp;
	1946	tncp = ap->a_tnch->ncp;
	1947	ip = VTOI(fncp->nc_vp);
	1948	KKASSERT(ip != NULL);
	1949
	1950	hmp = ip->hmp;
	1951
	1952	if (fdip->obj_localization != tdip->obj_localization)
	1953	return(EXDEV);
	1954	if (fdip->obj_localization != ip->obj_localization)
	1955	return(EXDEV);
	1956
	1957	if (fdip->flags & HAMMER_INODE_RO)
	1958	return (EROFS);
	1959	if (tdip->flags & HAMMER_INODE_RO)
	1960	return (EROFS);
	1961	if (ip->flags & HAMMER_INODE_RO)
	1962	return (EROFS);
	1963	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	1964	return (error);
	1965
	1966	lwkt_gettoken(&hmp->fs_token);
	1967	hammer_start_transaction(&trans, hmp);
	1968
	1969	/*
	1970	* Remove tncp from the target directory and then link ip as
	1971	* tncp. XXX pass trans to dounlink
	1972	*
	1973	* Force the inode sync-time to match the transaction so it is
	1974	* in-sync with the creation of the target directory entry.
	1975	*/
	1976	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
	1977	ap->a_cred, 0, -1);
	1978	if (error == 0 \|\| error == ENOENT) {
	1979	error = hammer_ip_add_direntry(&trans, tdip,
	1980	tncp->nc_name, tncp->nc_nlen,
	1981	ip);
	1982	if (error == 0) {
	1983	ip->ino_data.parent_obj_id = tdip->obj_id;
	1984	ip->ino_data.ctime = trans.time;
	1985	hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
	1986	}
	1987	}
	1988	if (error)
	1989	goto failed; /* XXX */
	1990
	1991	/*
	1992	* Locate the record in the originating directory and remove it.
	1993	*
	1994	* Calculate the namekey and setup the key range for the scan. This
	1995	* works kinda like a chained hash table where the lower 32 bits
	1996	* of the namekey synthesize the chain.
	1997	*
	1998	* The key range is inclusive of both key_beg and key_end.
	1999	*/
	2000	namekey = hammer_direntry_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
	2001	&max_iterations);
	2002	retry:
	2003	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	2004	cursor.key_beg.localization = fdip->obj_localization \|
	2005	hammer_dir_localization(fdip);
	2006	cursor.key_beg.obj_id = fdip->obj_id;
	2007	cursor.key_beg.key = namekey;
	2008	cursor.key_beg.create_tid = 0;
	2009	cursor.key_beg.delete_tid = 0;
	2010	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	2011	cursor.key_beg.obj_type = 0;
	2012
	2013	cursor.key_end = cursor.key_beg;
	2014	cursor.key_end.key += max_iterations;
	2015	cursor.asof = fdip->obj_asof;
	2016	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	2017
	2018	/*
	2019	* Scan all matching records (the chain), locate the one matching
	2020	* the requested path component.
	2021	*
	2022	* The hammer_ip_*() functions merge in-memory records with on-disk
	2023	* records for the purposes of the search.
	2024	*/
	2025	error = hammer_ip_first(&cursor);
	2026	while (error == 0) {
	2027	if (hammer_ip_resolve_data(&cursor) != 0)
	2028	break;
	2029	nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
	2030	KKASSERT(nlen > 0);
	2031	if (fncp->nc_nlen == nlen &&
	2032	bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	2033	break;
	2034	}
	2035	error = hammer_ip_next(&cursor);
	2036	}
	2037
	2038	/*
	2039	* If all is ok we have to get the inode so we can adjust nlinks.
	2040	*
	2041	* WARNING: hammer_ip_del_direntry() may have to terminate the
	2042	* cursor to avoid a recursion. It's ok to call hammer_done_cursor()
	2043	* twice.
	2044	*/
	2045	if (error == 0)
	2046	error = hammer_ip_del_direntry(&trans, &cursor, fdip, ip);
	2047
	2048	/*
	2049	* XXX A deadlock here will break rename's atomicy for the purposes
	2050	* of crash recovery.
	2051	*/
	2052	if (error == EDEADLK) {
	2053	hammer_done_cursor(&cursor);
	2054	goto retry;
	2055	}
	2056
	2057	/*
	2058	* Cleanup and tell the kernel that the rename succeeded.
	2059	*
	2060	* NOTE: ip->vp, if non-NULL, cannot be directly referenced
	2061	* without formally acquiring the vp since the vp might
	2062	* have zero refs on it, or in the middle of a reclaim,
	2063	* etc.
	2064	*/
	2065	hammer_done_cursor(&cursor);
	2066	if (error == 0) {
	2067	cache_rename(ap->a_fnch, ap->a_tnch);
	2068	hammer_knote(ap->a_fdvp, NOTE_WRITE);
	2069	hammer_knote(ap->a_tdvp, NOTE_WRITE);
	2070	while (ip->vp) {
	2071	struct vnode *vp;
	2072
	2073	error = hammer_get_vnode(ip, &vp);
	2074	if (error == 0 && vp) {
	2075	vn_unlock(vp);
	2076	hammer_knote(ip->vp, NOTE_RENAME);
	2077	vrele(vp);
	2078	break;
	2079	}
	2080	hdkprintf("ip/vp race2 avoided\n");
	2081	}
	2082	}
	2083
	2084	failed:
	2085	hammer_done_transaction(&trans);
	2086	lwkt_reltoken(&hmp->fs_token);
	2087	return (error);
	2088	}
	2089
	2090	/*
	2091	* hammer_vop_nrmdir { nch, dvp, cred }
	2092	*/
	2093	static
	2094	int
	2095	hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
	2096	{
	2097	struct hammer_transaction trans;
	2098	hammer_inode_t dip;
	2099	hammer_mount_t hmp;
	2100	int error;
	2101
	2102	dip = VTOI(ap->a_dvp);
	2103	hmp = dip->hmp;
	2104
	2105	if (hammer_nohistory(dip) == 0 &&
	2106	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	2107	return (error);
	2108	}
	2109
	2110	lwkt_gettoken(&hmp->fs_token);
	2111	hammer_start_transaction(&trans, hmp);
	2112	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	2113	hammer_done_transaction(&trans);
	2114	if (error == 0)
	2115	hammer_knote(ap->a_dvp, NOTE_WRITE \| NOTE_LINK);
	2116	lwkt_reltoken(&hmp->fs_token);
	2117	return (error);
	2118	}
	2119
	2120	/*
	2121	* hammer_vop_markatime { vp, cred }
	2122	*/
	2123	static
	2124	int
	2125	hammer_vop_markatime(struct vop_markatime_args *ap)
	2126	{
	2127	struct hammer_transaction trans;
	2128	hammer_inode_t ip;
	2129	hammer_mount_t hmp;
	2130
	2131	ip = VTOI(ap->a_vp);
	2132	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
	2133	return (EROFS);
	2134	if (ip->flags & HAMMER_INODE_RO)
	2135	return (EROFS);
	2136	hmp = ip->hmp;
	2137	if (hmp->mp->mnt_flag & MNT_NOATIME)
	2138	return (0);
	2139	lwkt_gettoken(&hmp->fs_token);
	2140	hammer_start_transaction(&trans, hmp);
	2141
	2142	ip->ino_data.atime = trans.time;
	2143	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	2144	hammer_done_transaction(&trans);
	2145	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	2146	lwkt_reltoken(&hmp->fs_token);
	2147	return (0);
	2148	}
	2149
	2150	/*
	2151	* hammer_vop_setattr { vp, vap, cred }
	2152	*/
	2153	static
	2154	int
	2155	hammer_vop_setattr(struct vop_setattr_args *ap)
	2156	{
	2157	struct hammer_transaction trans;
	2158	hammer_inode_t ip;
	2159	struct vattr *vap;
	2160	hammer_mount_t hmp;
	2161	int modflags;
	2162	int error;
	2163	int truncating;
	2164	int blksize;
	2165	int kflags;
	2166	#if 0
	2167	int64_t aligned_size;
	2168	#endif
	2169	uint32_t flags;
	2170
	2171	vap = ap->a_vap;
	2172	ip = ap->a_vp->v_data;
	2173	modflags = 0;
	2174	kflags = 0;
	2175	hmp = ip->hmp;
	2176
	2177	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
	2178	return(EROFS);
	2179	if (ip->flags & HAMMER_INODE_RO)
	2180	return (EROFS);
	2181	if (hammer_nohistory(ip) == 0 &&
	2182	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
	2183	return (error);
	2184	}
	2185
	2186	lwkt_gettoken(&hmp->fs_token);
	2187	hammer_start_transaction(&trans, hmp);
	2188	error = 0;
	2189
	2190	if (vap->va_flags != VNOVAL) {
	2191	flags = ip->ino_data.uflags;
	2192	error = vop_helper_setattr_flags(&flags, vap->va_flags,
	2193	hammer_to_unix_xid(&ip->ino_data.uid),
	2194	ap->a_cred);
	2195	if (error == 0) {
	2196	if (ip->ino_data.uflags != flags) {
	2197	ip->ino_data.uflags = flags;
	2198	ip->ino_data.ctime = trans.time;
	2199	modflags \|= HAMMER_INODE_DDIRTY;
	2200	kflags \|= NOTE_ATTRIB;
	2201	}
	2202	if (ip->ino_data.uflags & (IMMUTABLE \| APPEND)) {
	2203	error = 0;
	2204	goto done;
	2205	}
	2206	}
	2207	goto done;
	2208	}
	2209	if (ip->ino_data.uflags & (IMMUTABLE \| APPEND)) {
	2210	error = EPERM;
	2211	goto done;
	2212	}
	2213	if (vap->va_uid != (uid_t)VNOVAL \|\| vap->va_gid != (gid_t)VNOVAL) {
	2214	mode_t cur_mode = ip->ino_data.mode;
	2215	uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	2216	gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	2217	hammer_uuid_t uuid_uid;
	2218	hammer_uuid_t uuid_gid;
	2219
	2220	error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
	2221	ap->a_cred,
	2222	&cur_uid, &cur_gid, &cur_mode);
	2223	if (error == 0) {
	2224	hammer_guid_to_uuid(&uuid_uid, cur_uid);
	2225	hammer_guid_to_uuid(&uuid_gid, cur_gid);
	2226	if (kuuid_compare(&uuid_uid, &ip->ino_data.uid) \|\|
	2227	kuuid_compare(&uuid_gid, &ip->ino_data.gid) \|\|
	2228	ip->ino_data.mode != cur_mode) {
	2229	ip->ino_data.uid = uuid_uid;
	2230	ip->ino_data.gid = uuid_gid;
	2231	ip->ino_data.mode = cur_mode;
	2232	ip->ino_data.ctime = trans.time;
	2233	modflags \|= HAMMER_INODE_DDIRTY;
	2234	}
	2235	kflags \|= NOTE_ATTRIB;
	2236	}
	2237	}
	2238	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
	2239	switch(ap->a_vp->v_type) {
	2240	case VREG:
	2241	if (vap->va_size == ip->ino_data.size)
	2242	break;
	2243
	2244	/*
	2245	* Log the operation if in fast-fsync mode or if
	2246	* there are unterminated redo write records present.
	2247	*
	2248	* The second check is needed so the recovery code
	2249	* properly truncates write redos even if nominal
	2250	* REDO operations is turned off due to excessive
	2251	* writes, because the related records might be
	2252	* destroyed and never lay down a TERM_WRITE.
	2253	*/
	2254	if ((ip->flags & HAMMER_INODE_REDO) \|\|
	2255	(ip->flags & HAMMER_INODE_RDIRTY)) {
	2256	error = hammer_generate_redo(&trans, ip,
	2257	vap->va_size,
	2258	HAMMER_REDO_TRUNC,
	2259	NULL, 0);
	2260	}
	2261	blksize = hammer_blocksize(vap->va_size);
	2262
	2263	/*
	2264	* XXX break atomicy, we can deadlock the backend
	2265	* if we do not release the lock. Probably not a
	2266	* big deal here.
	2267	*/
	2268	if (vap->va_size < ip->ino_data.size) {
	2269	nvtruncbuf(ap->a_vp, vap->va_size,
	2270	blksize,
	2271	hammer_blockoff(vap->va_size),
	2272	0);
	2273	truncating = 1;
	2274	kflags \|= NOTE_WRITE;
	2275	} else {
	2276	nvextendbuf(ap->a_vp,
	2277	ip->ino_data.size,
	2278	vap->va_size,
	2279	hammer_blocksize(ip->ino_data.size),
	2280	hammer_blocksize(vap->va_size),
	2281	hammer_blockoff(ip->ino_data.size),
	2282	hammer_blockoff(vap->va_size),
	2283	0);
	2284	truncating = 0;
	2285	kflags \|= NOTE_WRITE \| NOTE_EXTEND;
	2286	}
	2287	ip->ino_data.size = vap->va_size;
	2288	ip->ino_data.mtime = trans.time;
	2289	/* XXX safe to use SDIRTY instead of DDIRTY here? */
	2290	modflags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_DDIRTY;
	2291	vclrflags(ap->a_vp, VLASTWRITETS);
	2292
	2293	/*
	2294	* On-media truncation is cached in the inode until
	2295	* the inode is synchronized. We must immediately
	2296	* handle any frontend records.
	2297	*/
	2298	if (truncating) {
	2299	hammer_ip_frontend_trunc(ip, vap->va_size);
	2300	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
	2301	ip->flags \|= HAMMER_INODE_TRUNCATED;
	2302	ip->trunc_off = vap->va_size;
	2303	hammer_inode_dirty(ip);
	2304	} else if (ip->trunc_off > vap->va_size) {
	2305	ip->trunc_off = vap->va_size;
	2306	}
	2307	}
	2308
	2309	#if 0
	2310	/*
	2311	* When truncating, nvtruncbuf() may have cleaned out
	2312	* a portion of the last block on-disk in the buffer
	2313	* cache. We must clean out any frontend records
	2314	* for blocks beyond the new last block.
	2315	*/
	2316	aligned_size = (vap->va_size + (blksize - 1)) &
	2317	~(int64_t)(blksize - 1);
	2318	if (truncating && vap->va_size < aligned_size) {
	2319	aligned_size -= blksize;
	2320	hammer_ip_frontend_trunc(ip, aligned_size);
	2321	}
	2322	#endif
	2323	break;
	2324	case VDATABASE:
	2325	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
	2326	ip->flags \|= HAMMER_INODE_TRUNCATED;
	2327	ip->trunc_off = vap->va_size;
	2328	hammer_inode_dirty(ip);
	2329	} else if (ip->trunc_off > vap->va_size) {
	2330	ip->trunc_off = vap->va_size;
	2331	}
	2332	hammer_ip_frontend_trunc(ip, vap->va_size);
	2333	ip->ino_data.size = vap->va_size;
	2334	ip->ino_data.mtime = trans.time;
	2335	modflags \|= HAMMER_INODE_MTIME \| HAMMER_INODE_DDIRTY;
	2336	vclrflags(ap->a_vp, VLASTWRITETS);
	2337	kflags \|= NOTE_ATTRIB;
	2338	break;
	2339	default:
	2340	error = EINVAL;
	2341	goto done;
	2342	}
	2343	break;
	2344	}
	2345	if (vap->va_atime.tv_sec != VNOVAL) {
	2346	ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
	2347	modflags \|= HAMMER_INODE_ATIME;
	2348	kflags \|= NOTE_ATTRIB;
	2349	}
	2350	if (vap->va_mtime.tv_sec != VNOVAL) {
	2351	ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
	2352	modflags \|= HAMMER_INODE_MTIME;
	2353	kflags \|= NOTE_ATTRIB;
	2354	vclrflags(ap->a_vp, VLASTWRITETS);
	2355	}
	2356	if (vap->va_mode != (mode_t)VNOVAL) {
	2357	mode_t cur_mode = ip->ino_data.mode;
	2358	uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	2359	gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	2360
	2361	error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
	2362	cur_uid, cur_gid, &cur_mode);
	2363	if (error == 0) {
	2364	ip->ino_data.mode = cur_mode;
	2365	ip->ino_data.ctime = trans.time;
	2366	modflags \|= HAMMER_INODE_DDIRTY;
	2367	kflags \|= NOTE_ATTRIB;
	2368	}
	2369	}
	2370	done:
	2371	if (error == 0)
	2372	hammer_modify_inode(&trans, ip, modflags);
	2373	hammer_done_transaction(&trans);
	2374	hammer_knote(ap->a_vp, kflags);
	2375	lwkt_reltoken(&hmp->fs_token);
	2376	return (error);
	2377	}
	2378
	2379	/*
	2380	* hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
	2381	*/
	2382	static
	2383	int
	2384	hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
	2385	{
	2386	struct hammer_transaction trans;
	2387	hammer_inode_t dip;
	2388	hammer_inode_t nip;
	2389	hammer_record_t record;
	2390	struct nchandle *nch;
	2391	hammer_mount_t hmp;
	2392	int error;
	2393	int bytes;
	2394
	2395	ap->a_vap->va_type = VLNK;
	2396
	2397	nch = ap->a_nch;
	2398	dip = VTOI(ap->a_dvp);
	2399	hmp = dip->hmp;
	2400
	2401	if (dip->flags & HAMMER_INODE_RO)
	2402	return (EROFS);
	2403	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
	2404	return (error);
	2405
	2406	/*
	2407	* Create a transaction to cover the operations we perform.
	2408	*/
	2409	lwkt_gettoken(&hmp->fs_token);
	2410	hammer_start_transaction(&trans, hmp);
	2411
	2412	/*
	2413	* Create a new filesystem object of the requested type. The
	2414	* returned inode will be referenced but not locked.
	2415	*/
	2416
	2417	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
	2418	dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
	2419	NULL, &nip);
	2420	if (error) {
	2421	hammer_done_transaction(&trans);
	2422	*ap->a_vpp = NULL;
	2423	lwkt_reltoken(&hmp->fs_token);
	2424	return (error);
	2425	}
	2426
	2427	/*
	2428	* Add a record representing the symlink. symlink stores the link
	2429	* as pure data, not a string, and is no \0 terminated.
	2430	*/
	2431	if (error == 0) {
	2432	bytes = strlen(ap->a_target);
	2433
	2434	if (bytes <= HAMMER_INODE_BASESYMLEN) {
	2435	bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
	2436	} else {
	2437	record = hammer_alloc_mem_record(nip, bytes);
	2438	record->type = HAMMER_MEM_RECORD_GENERAL;
	2439
	2440	record->leaf.base.localization = nip->obj_localization \|
	2441	HAMMER_LOCALIZE_MISC;
	2442	record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
	2443	record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
	2444	record->leaf.data_len = bytes;
	2445	KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
	2446	bcopy(ap->a_target, record->data->symlink.name, bytes);
	2447	error = hammer_ip_add_record(&trans, record);
	2448	}
	2449
	2450	/*
	2451	* Set the file size to the length of the link.
	2452	*/
	2453	if (error == 0) {
	2454	nip->ino_data.size = bytes;
	2455	hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
	2456	}
	2457	}
	2458	if (error == 0)
	2459	error = hammer_ip_add_direntry(&trans, dip, nch->ncp->nc_name,
	2460	nch->ncp->nc_nlen, nip);
	2461
	2462	/*
	2463	* Finish up.
	2464	*/
	2465	if (error) {
	2466	hammer_rel_inode(nip, 0);
	2467	*ap->a_vpp = NULL;
	2468	} else {
	2469	error = hammer_get_vnode(nip, ap->a_vpp);
	2470	hammer_rel_inode(nip, 0);
	2471	if (error == 0) {
	2472	cache_setunresolved(ap->a_nch);
	2473	cache_setvp(ap->a_nch, *ap->a_vpp);
	2474	hammer_knote(ap->a_dvp, NOTE_WRITE);
	2475	}
	2476	}
	2477	hammer_done_transaction(&trans);
	2478	lwkt_reltoken(&hmp->fs_token);
	2479	return (error);
	2480	}
	2481
	2482	/*
	2483	* hammer_vop_nwhiteout { nch, dvp, cred, flags }
	2484	*/
	2485	static
	2486	int
	2487	hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
	2488	{
	2489	struct hammer_transaction trans;
	2490	hammer_inode_t dip;
	2491	hammer_mount_t hmp;
	2492	int error;
	2493
	2494	dip = VTOI(ap->a_dvp);
	2495	hmp = dip->hmp;
	2496
	2497	if (hammer_nohistory(dip) == 0 &&
	2498	(error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
	2499	return (error);
	2500	}
	2501
	2502	lwkt_gettoken(&hmp->fs_token);
	2503	hammer_start_transaction(&trans, hmp);
	2504	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
	2505	ap->a_cred, ap->a_flags, -1);
	2506	hammer_done_transaction(&trans);
	2507	lwkt_reltoken(&hmp->fs_token);
	2508
	2509	return (error);
	2510	}
	2511
	2512	/*
	2513	* hammer_vop_ioctl { vp, command, data, fflag, cred }
	2514	*/
	2515	static
	2516	int
	2517	hammer_vop_ioctl(struct vop_ioctl_args *ap)
	2518	{
	2519	hammer_inode_t ip = ap->a_vp->v_data;
	2520	hammer_mount_t hmp = ip->hmp;
	2521	int error;
	2522
	2523	lwkt_gettoken(&hmp->fs_token);
	2524	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
	2525	ap->a_fflag, ap->a_cred);
	2526	lwkt_reltoken(&hmp->fs_token);
	2527	return (error);
	2528	}
	2529
	2530	static
	2531	int
	2532	hammer_vop_mountctl(struct vop_mountctl_args *ap)
	2533	{
	2534	static const struct mountctl_opt extraopt[] = {
	2535	{ HMNT_NOHISTORY, "nohistory" },
	2536	{ HMNT_MASTERID, "master" },
	2537	{ HMNT_NOMIRROR, "nomirror" },
	2538	{ 0, NULL}
	2539
	2540	};
	2541	hammer_mount_t hmp;
	2542	struct mount *mp;
	2543	int usedbytes;
	2544	int error;
	2545
	2546	error = 0;
	2547	usedbytes = 0;
	2548	mp = ap->a_head.a_ops->head.vv_mount;
	2549	KKASSERT(mp->mnt_data != NULL);
	2550	hmp = (hammer_mount_t)mp->mnt_data;
	2551
	2552	lwkt_gettoken(&hmp->fs_token);
	2553
	2554	switch(ap->a_op) {
	2555	case MOUNTCTL_SET_EXPORT:
	2556	if (ap->a_ctllen != sizeof(struct export_args))
	2557	error = EINVAL;
	2558	else
	2559	error = hammer_vfs_export(mp, ap->a_op,
	2560	(const struct export_args *)ap->a_ctl);
	2561	break;
	2562	case MOUNTCTL_MOUNTFLAGS:
	2563	/*
	2564	* Call standard mountctl VOP function
	2565	* so we get user mount flags.
	2566	*/
	2567	error = vop_stdmountctl(ap);
	2568	if (error)
	2569	break;
	2570
	2571	usedbytes = *ap->a_res;
	2572
	2573	if (usedbytes > 0 && usedbytes < ap->a_buflen) {
	2574	usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
	2575	ap->a_buf,
	2576	ap->a_buflen - usedbytes,
	2577	&error);
	2578	}
	2579
	2580	*ap->a_res += usedbytes;
	2581	break;
	2582	default:
	2583	error = vop_stdmountctl(ap);
	2584	break;
	2585	}
	2586	lwkt_reltoken(&hmp->fs_token);
	2587	return(error);
	2588	}
	2589
	2590	/*
	2591	* hammer_vop_strategy { vp, bio }
	2592	*
	2593	* Strategy call, used for regular file read & write only. Note that the
	2594	* bp may represent a cluster.
	2595	*
	2596	* To simplify operation and allow better optimizations in the future,
	2597	* this code does not make any assumptions with regards to buffer alignment
	2598	* or size.
	2599	*/
	2600	static
	2601	int
	2602	hammer_vop_strategy(struct vop_strategy_args *ap)
	2603	{
	2604	struct buf *bp;
	2605	int error;
	2606
	2607	bp = ap->a_bio->bio_buf;
	2608
	2609	switch(bp->b_cmd) {
	2610	case BUF_CMD_READ:
	2611	error = hammer_vop_strategy_read(ap);
	2612	break;
	2613	case BUF_CMD_WRITE:
	2614	error = hammer_vop_strategy_write(ap);
	2615	break;
	2616	default:
	2617	bp->b_error = error = EINVAL;
	2618	bp->b_flags \|= B_ERROR;
	2619	biodone(ap->a_bio);
	2620	break;
	2621	}
	2622	return (error);
	2623	}
	2624
	2625	/*
	2626	* Read from a regular file. Iterate the related records and fill in the
	2627	* BIO/BUF. Gaps are zero-filled.
	2628	*
	2629	* The support code in hammer_object.c should be used to deal with mixed
	2630	* in-memory and on-disk records.
	2631	*
	2632	* NOTE: Can be called from the cluster code with an oversized buf.
	2633	*
	2634	* XXX atime update
	2635	*/
	2636	static
	2637	int
	2638	hammer_vop_strategy_read(struct vop_strategy_args *ap)
	2639	{
	2640	struct hammer_transaction trans;
	2641	hammer_inode_t ip;
	2642	hammer_inode_t dip;
	2643	hammer_mount_t hmp;
	2644	struct hammer_cursor cursor;
	2645	hammer_base_elm_t base;
	2646	hammer_off_t disk_offset;
	2647	struct bio *bio;
	2648	struct bio *nbio;
	2649	struct buf *bp;
	2650	int64_t rec_offset;
	2651	int64_t ran_end;
	2652	int64_t tmp64;
	2653	int error;
	2654	int boff;
	2655	int roff;
	2656	int n;
	2657	int isdedupable;
	2658
	2659	bio = ap->a_bio;
	2660	bp = bio->bio_buf;
	2661	ip = ap->a_vp->v_data;
	2662	hmp = ip->hmp;
	2663
	2664	/*
	2665	* The zone-2 disk offset may have been set by the cluster code via
	2666	* a BMAP operation, or else should be NOOFFSET.
	2667	*
	2668	* Checking the high bits for a match against zone-2 should suffice.
	2669	*
	2670	* In cases where a lot of data duplication is present it may be
	2671	* more beneficial to drop through and doubule-buffer through the
	2672	* device.
	2673	*/
	2674	nbio = push_bio(bio);
	2675	if (hammer_is_zone_large_data(nbio->bio_offset)) {
	2676	if (hammer_double_buffer == 0) {
	2677	lwkt_gettoken(&hmp->fs_token);
	2678	error = hammer_io_direct_read(hmp, nbio, NULL);
	2679	lwkt_reltoken(&hmp->fs_token);
	2680	return (error);
	2681	}
	2682
	2683	/*
	2684	* Try to shortcut requests for double_buffer mode too.
	2685	* Since this mode runs through the device buffer cache
	2686	* only compatible buffer sizes (meaning those generated
	2687	* by normal filesystem buffers) are legal.
	2688	*/
	2689	if ((bp->b_flags & B_PAGING) == 0) {
	2690	lwkt_gettoken(&hmp->fs_token);
	2691	error = hammer_io_indirect_read(hmp, nbio, NULL);
	2692	lwkt_reltoken(&hmp->fs_token);
	2693	return (error);
	2694	}
	2695	}
	2696
	2697	/*
	2698	* Well, that sucked. Do it the hard way. If all the stars are
	2699	* aligned we may still be able to issue a direct-read.
	2700	*/
	2701	lwkt_gettoken(&hmp->fs_token);
	2702	hammer_simple_transaction(&trans, hmp);
	2703	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	2704
	2705	/*
	2706	* Key range (begin and end inclusive) to scan. Note that the key's
	2707	* stored in the actual records represent BASE+LEN, not BASE. The
	2708	* first record containing bio_offset will have a key > bio_offset.
	2709	*/
	2710	cursor.key_beg.localization = ip->obj_localization \|
	2711	HAMMER_LOCALIZE_MISC;
	2712	cursor.key_beg.obj_id = ip->obj_id;
	2713	cursor.key_beg.create_tid = 0;
	2714	cursor.key_beg.delete_tid = 0;
	2715	cursor.key_beg.obj_type = 0;
	2716	cursor.key_beg.key = bio->bio_offset + 1;
	2717	cursor.asof = ip->obj_asof;
	2718	cursor.flags \|= HAMMER_CURSOR_ASOF;
	2719
	2720	cursor.key_end = cursor.key_beg;
	2721	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
	2722	#if 0
	2723	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
	2724	cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
	2725	cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
	2726	cursor.key_end.key = HAMMER_MAX_KEY;
	2727	} else
	2728	#endif
	2729	{
	2730	ran_end = bio->bio_offset + bp->b_bufsize;
	2731	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	2732	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	2733	tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
	2734	if (tmp64 < ran_end)
	2735	cursor.key_end.key = HAMMER_MAX_KEY;
	2736	else
	2737	cursor.key_end.key = ran_end + MAXPHYS + 1;
	2738	}
	2739	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE;
	2740
	2741	/*
	2742	* Set NOSWAPCACHE for cursor data extraction if double buffering
	2743	* is disabled or (if the file is not marked cacheable via chflags
	2744	* and vm.swapcache_use_chflags is enabled).
	2745	*/
	2746	if (hammer_double_buffer == 0 \|\|
	2747	((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	2748	vm_swapcache_use_chflags)) {
	2749	cursor.flags \|= HAMMER_CURSOR_NOSWAPCACHE;
	2750	}
	2751
	2752	error = hammer_ip_first(&cursor);
	2753	boff = 0;
	2754
	2755	while (error == 0) {
	2756	/*
	2757	* Get the base file offset of the record. The key for
	2758	* data records is (base + bytes) rather then (base).
	2759	*/
	2760	base = &cursor.leaf->base;
	2761	rec_offset = base->key - cursor.leaf->data_len;
	2762
	2763	/*
	2764	* Calculate the gap, if any, and zero-fill it.
	2765	*
	2766	* n is the offset of the start of the record verses our
	2767	* current seek offset in the bio.
	2768	*/
	2769	n = (int)(rec_offset - (bio->bio_offset + boff));
	2770	if (n > 0) {
	2771	if (n > bp->b_bufsize - boff)
	2772	n = bp->b_bufsize - boff;
	2773	bzero((char *)bp->b_data + boff, n);
	2774	boff += n;
	2775	n = 0;
	2776	}
	2777
	2778	/*
	2779	* Calculate the data offset in the record and the number
	2780	* of bytes we can copy.
	2781	*
	2782	* There are two degenerate cases. First, boff may already
	2783	* be at bp->b_bufsize. Secondly, the data offset within
	2784	* the record may exceed the record's size.
	2785	*/
	2786	roff = -n;
	2787	rec_offset += roff;
	2788	n = cursor.leaf->data_len - roff;
	2789	if (n <= 0) {
	2790	hdkprintf("bad n=%d roff=%d\n", n, roff);
	2791	n = 0;
	2792	} else if (n > bp->b_bufsize - boff) {
	2793	n = bp->b_bufsize - boff;
	2794	}
	2795
	2796	/*
	2797	* Deal with cached truncations. This cool bit of code
	2798	* allows truncate()/ftruncate() to avoid having to sync
	2799	* the file.
	2800	*
	2801	* If the frontend is truncated then all backend records are
	2802	* subject to the frontend's truncation.
	2803	*
	2804	* If the backend is truncated then backend records on-disk
	2805	* (but not in-memory) are subject to the backend's
	2806	* truncation. In-memory records owned by the backend
	2807	* represent data written after the truncation point on the
	2808	* backend and must not be truncated.
	2809	*
	2810	* Truncate operations deal with frontend buffer cache
	2811	* buffers and frontend-owned in-memory records synchronously.
	2812	*/
	2813	if (ip->flags & HAMMER_INODE_TRUNCATED) {
	2814	if (hammer_cursor_ondisk(&cursor)/* \|\|
	2815	cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
	2816	if (ip->trunc_off <= rec_offset)
	2817	n = 0;
	2818	else if (ip->trunc_off < rec_offset + n)
	2819	n = (int)(ip->trunc_off - rec_offset);
	2820	}
	2821	}
	2822	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	2823	if (hammer_cursor_ondisk(&cursor)) {
	2824	if (ip->sync_trunc_off <= rec_offset)
	2825	n = 0;
	2826	else if (ip->sync_trunc_off < rec_offset + n)
	2827	n = (int)(ip->sync_trunc_off - rec_offset);
	2828	}
	2829	}
	2830
	2831	/*
	2832	* Try to issue a direct read into our bio if possible,
	2833	* otherwise resolve the element data into a hammer_buffer
	2834	* and copy.
	2835	*
	2836	* The buffer on-disk should be zerod past any real
	2837	* truncation point, but may not be for any synthesized
	2838	* truncation point from above.
	2839	*
	2840	* NOTE: disk_offset is only valid if the cursor data is
	2841	* on-disk.
	2842	*/
	2843	disk_offset = cursor.leaf->data_offset + roff;
	2844	isdedupable = (boff == 0 && n == bp->b_bufsize &&
	2845	hammer_cursor_ondisk(&cursor) &&
	2846	((int)disk_offset & HAMMER_BUFMASK) == 0);
	2847
	2848	if (isdedupable && hammer_double_buffer == 0) {
	2849	/*
	2850	* Direct read case
	2851	*/
	2852	KKASSERT(hammer_is_zone_large_data(disk_offset));
	2853	nbio->bio_offset = disk_offset;
	2854	error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
	2855	goto done;
	2856	} else if (isdedupable) {
	2857	/*
	2858	* Async I/O case for reading from backing store
	2859	* and copying the data to the filesystem buffer.
	2860	*/
	2861	KKASSERT(hammer_is_zone_large_data(disk_offset));
	2862	nbio->bio_offset = disk_offset;
	2863	error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
	2864	goto done;
	2865	} else if (n) {
	2866	error = hammer_ip_resolve_data(&cursor);
	2867	if (error == 0) {
	2868	bcopy((char *)cursor.data + roff,
	2869	(char *)bp->b_data + boff, n);
	2870	}
	2871	}
	2872	if (error)
	2873	break;
	2874
	2875	/*
	2876	* Iterate until we have filled the request.
	2877	*/
	2878	boff += n;
	2879	if (boff == bp->b_bufsize)
	2880	break;
	2881	error = hammer_ip_next(&cursor);
	2882	}
	2883
	2884	/*
	2885	* There may have been a gap after the last record
	2886	*/
	2887	if (error == ENOENT)
	2888	error = 0;
	2889	if (error == 0 && boff != bp->b_bufsize) {
	2890	KKASSERT(boff < bp->b_bufsize);
	2891	bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
	2892	/* boff = bp->b_bufsize; */
	2893	}
	2894
	2895	/*
	2896	* Disallow swapcache operation on the vnode buffer if double
	2897	* buffering is enabled, the swapcache will get the data via
	2898	* the block device buffer.
	2899	*/
	2900	if (hammer_double_buffer)
	2901	bp->b_flags \|= B_NOTMETA;
	2902
	2903	/*
	2904	* Cleanup
	2905	*/
	2906	bp->b_resid = 0;
	2907	bp->b_error = error;
	2908	if (error)
	2909	bp->b_flags \|= B_ERROR;
	2910	biodone(ap->a_bio);
	2911
	2912	done:
	2913	/*
	2914	* Cache the b-tree node for the last data read in cache[1].
	2915	*
	2916	* If we hit the file EOF then also cache the node in the
	2917	* governing directory's cache[3], it will be used to initialize
	2918	* the new inode's cache[1] for any inodes looked up via the directory.
	2919	*
	2920	* This doesn't reduce disk accesses since the B-Tree chain is
	2921	* likely cached, but it does reduce cpu overhead when looking
	2922	* up file offsets for cpdup/tar/cpio style iterations.
	2923	*/
	2924	if (cursor.node)
	2925	hammer_cache_node(&ip->cache[1], cursor.node);
	2926	if (ran_end >= ip->ino_data.size) {
	2927	dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
	2928	ip->obj_asof, ip->obj_localization);
	2929	if (dip) {
	2930	hammer_cache_node(&dip->cache[3], cursor.node);
	2931	hammer_rel_inode(dip, 0);
	2932	}
	2933	}
	2934	hammer_done_cursor(&cursor);
	2935	hammer_done_transaction(&trans);
	2936	lwkt_reltoken(&hmp->fs_token);
	2937	return(error);
	2938	}
	2939
	2940	/*
	2941	* BMAP operation - used to support cluster_read() only.
	2942	*
	2943	* (struct vnode vp, off_t loffset, off_t doffsetp, int runp, int runb)
	2944	*
	2945	* This routine may return EOPNOTSUPP if the opration is not supported for
	2946	* the specified offset. The contents of the pointer arguments do not
	2947	* need to be initialized in that case.
	2948	*
	2949	* If a disk address is available and properly aligned return 0 with
	2950	* doffsetp set to the zone-2 address, and runp / *runb set appropriately
	2951	* to the run-length relative to that offset. Callers may assume that
	2952	* doffsetp is valid if 0 is returned, even if runp is not sufficiently
	2953	* large, so return EOPNOTSUPP if it is not sufficiently large.
	2954	*/
	2955	static
	2956	int
	2957	hammer_vop_bmap(struct vop_bmap_args *ap)
	2958	{
	2959	struct hammer_transaction trans;
	2960	hammer_inode_t ip;
	2961	hammer_mount_t hmp;
	2962	struct hammer_cursor cursor;
	2963	hammer_base_elm_t base;
	2964	int64_t rec_offset;
	2965	int64_t ran_end;
	2966	int64_t tmp64;
	2967	int64_t base_offset;
	2968	int64_t base_disk_offset;
	2969	int64_t last_offset;
	2970	hammer_off_t last_disk_offset;
	2971	hammer_off_t disk_offset;
	2972	int rec_len;
	2973	int error;
	2974	int blksize;
	2975
	2976	ip = ap->a_vp->v_data;
	2977	hmp = ip->hmp;
	2978
	2979	/*
	2980	* We can only BMAP regular files. We can't BMAP database files,
	2981	* directories, etc.
	2982	*/
	2983	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
	2984	return(EOPNOTSUPP);
	2985
	2986	/*
	2987	* bmap is typically called with runp/runb both NULL when used
	2988	* for writing. We do not support BMAP for writing atm.
	2989	*/
	2990	if (ap->a_cmd != BUF_CMD_READ)
	2991	return(EOPNOTSUPP);
	2992
	2993	/*
	2994	* Scan the B-Tree to acquire blockmap addresses, then translate
	2995	* to raw addresses.
	2996	*/
	2997	lwkt_gettoken(&hmp->fs_token);
	2998	hammer_simple_transaction(&trans, hmp);
	2999
	3000	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	3001
	3002	/*
	3003	* Key range (begin and end inclusive) to scan. Note that the key's
	3004	* stored in the actual records represent BASE+LEN, not BASE. The
	3005	* first record containing bio_offset will have a key > bio_offset.
	3006	*/
	3007	cursor.key_beg.localization = ip->obj_localization \|
	3008	HAMMER_LOCALIZE_MISC;
	3009	cursor.key_beg.obj_id = ip->obj_id;
	3010	cursor.key_beg.create_tid = 0;
	3011	cursor.key_beg.delete_tid = 0;
	3012	cursor.key_beg.obj_type = 0;
	3013	if (ap->a_runb)
	3014	cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	3015	else
	3016	cursor.key_beg.key = ap->a_loffset + 1;
	3017	if (cursor.key_beg.key < 0)
	3018	cursor.key_beg.key = 0;
	3019	cursor.asof = ip->obj_asof;
	3020	cursor.flags \|= HAMMER_CURSOR_ASOF;
	3021
	3022	cursor.key_end = cursor.key_beg;
	3023	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
	3024
	3025	ran_end = ap->a_loffset + MAXPHYS;
	3026	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	3027	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	3028	tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
	3029	if (tmp64 < ran_end)
	3030	cursor.key_end.key = HAMMER_MAX_KEY;
	3031	else
	3032	cursor.key_end.key = ran_end + MAXPHYS + 1;
	3033
	3034	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE;
	3035
	3036	error = hammer_ip_first(&cursor);
	3037	base_offset = last_offset = 0;
	3038	base_disk_offset = last_disk_offset = 0;
	3039
	3040	while (error == 0) {
	3041	/*
	3042	* Get the base file offset of the record. The key for
	3043	* data records is (base + bytes) rather then (base).
	3044	*
	3045	* NOTE: rec_offset + rec_len may exceed the end-of-file.
	3046	* The extra bytes should be zero on-disk and the BMAP op
	3047	* should still be ok.
	3048	*/
	3049	base = &cursor.leaf->base;
	3050	rec_offset = base->key - cursor.leaf->data_len;
	3051	rec_len = cursor.leaf->data_len;
	3052
	3053	/*
	3054	* Incorporate any cached truncation.
	3055	*
	3056	* NOTE: Modifications to rec_len based on synthesized
	3057	* truncation points remove the guarantee that any extended
	3058	* data on disk is zero (since the truncations may not have
	3059	* taken place on-media yet).
	3060	*/
	3061	if (ip->flags & HAMMER_INODE_TRUNCATED) {
	3062	if (hammer_cursor_ondisk(&cursor) \|\|
	3063	cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
	3064	if (ip->trunc_off <= rec_offset)
	3065	rec_len = 0;
	3066	else if (ip->trunc_off < rec_offset + rec_len)
	3067	rec_len = (int)(ip->trunc_off - rec_offset);
	3068	}
	3069	}
	3070	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	3071	if (hammer_cursor_ondisk(&cursor)) {
	3072	if (ip->sync_trunc_off <= rec_offset)
	3073	rec_len = 0;
	3074	else if (ip->sync_trunc_off < rec_offset + rec_len)
	3075	rec_len = (int)(ip->sync_trunc_off - rec_offset);
	3076	}
	3077	}
	3078
	3079	/*
	3080	* Accumulate information. If we have hit a discontiguous
	3081	* block reset base_offset unless we are already beyond the
	3082	* requested offset. If we are, that's it, we stop.
	3083	*/
	3084	if (error)
	3085	break;
	3086	if (hammer_cursor_ondisk(&cursor)) {
	3087	disk_offset = cursor.leaf->data_offset;
	3088	if (rec_offset != last_offset \|\|
	3089	disk_offset != last_disk_offset) {
	3090	if (rec_offset > ap->a_loffset)
	3091	break;
	3092	base_offset = rec_offset;
	3093	base_disk_offset = disk_offset;
	3094	}
	3095	last_offset = rec_offset + rec_len;
	3096	last_disk_offset = disk_offset + rec_len;
	3097	}
	3098	error = hammer_ip_next(&cursor);
	3099	}
	3100
	3101	if (cursor.node)
	3102	hammer_cache_node(&ip->cache[1], cursor.node);
	3103
	3104	hammer_done_cursor(&cursor);
	3105	hammer_done_transaction(&trans);
	3106	lwkt_reltoken(&hmp->fs_token);
	3107
	3108	/*
	3109	* If we couldn't find any records or the records we did find were
	3110	* all behind the requested offset, return failure. A forward
	3111	* truncation can leave a hole w/ no on-disk records.
	3112	*/
	3113	if (last_offset == 0 \|\| last_offset < ap->a_loffset)
	3114	return (EOPNOTSUPP);
	3115
	3116	/*
	3117	* Figure out the block size at the requested offset and adjust
	3118	* our limits so the cluster_read() does not create inappropriately
	3119	* sized buffer cache buffers.
	3120	*/
	3121	blksize = hammer_blocksize(ap->a_loffset);
	3122	if (hammer_blocksize(base_offset) != blksize) {
	3123	base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	3124	}
	3125	if (last_offset != ap->a_loffset &&
	3126	hammer_blocksize(last_offset - 1) != blksize) {
	3127	last_offset = hammer_blockdemarc(ap->a_loffset,
	3128	last_offset - 1);
	3129	}
	3130
	3131	/*
	3132	* Returning EOPNOTSUPP simply prevents the direct-IO optimization
	3133	* from occuring.
	3134	*/
	3135	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
	3136
	3137	if (!hammer_is_zone_large_data(disk_offset)) {
	3138	/*
	3139	* Only large-data zones can be direct-IOd
	3140	*/
	3141	error = EOPNOTSUPP;
	3142	} else if ((disk_offset & HAMMER_BUFMASK) \|\|
	3143	(last_offset - ap->a_loffset) < blksize) {
	3144	/*
	3145	* doffsetp is not aligned or the forward run size does
	3146	* not cover a whole buffer, disallow the direct I/O.
	3147	*/
	3148	error = EOPNOTSUPP;
	3149	} else {
	3150	/*
	3151	* We're good.
	3152	*/
	3153	*ap->a_doffsetp = disk_offset;
	3154	if (ap->a_runb) {
	3155	*ap->a_runb = ap->a_loffset - base_offset;
	3156	KKASSERT(*ap->a_runb >= 0);
	3157	}
	3158	if (ap->a_runp) {
	3159	*ap->a_runp = last_offset - ap->a_loffset;
	3160	KKASSERT(*ap->a_runp >= 0);
	3161	}
	3162	error = 0;
	3163	}
	3164	return(error);
	3165	}
	3166
	3167	/*
	3168	* Write to a regular file. Because this is a strategy call the OS is
	3169	* trying to actually get data onto the media.
	3170	*/
	3171	static
	3172	int
	3173	hammer_vop_strategy_write(struct vop_strategy_args *ap)
	3174	{
	3175	hammer_record_t record;
	3176	hammer_mount_t hmp;
	3177	hammer_inode_t ip;
	3178	struct bio *bio;
	3179	struct buf *bp;
	3180	int blksize __debugvar;
	3181	int bytes;
	3182	int error;
	3183
	3184	bio = ap->a_bio;
	3185	bp = bio->bio_buf;
	3186	ip = ap->a_vp->v_data;
	3187	hmp = ip->hmp;
	3188
	3189	blksize = hammer_blocksize(bio->bio_offset);
	3190	KKASSERT(bp->b_bufsize == blksize);
	3191
	3192	if (ip->flags & HAMMER_INODE_RO) {
	3193	bp->b_error = EROFS;
	3194	bp->b_flags \|= B_ERROR;
	3195	biodone(ap->a_bio);
	3196	return(EROFS);
	3197	}
	3198
	3199	lwkt_gettoken(&hmp->fs_token);
	3200
	3201	/*
	3202	* Disallow swapcache operation on the vnode buffer if double
	3203	* buffering is enabled, the swapcache will get the data via
	3204	* the block device buffer.
	3205	*/
	3206	if (hammer_double_buffer)
	3207	bp->b_flags \|= B_NOTMETA;
	3208
	3209	/*
	3210	* Interlock with inode destruction (no in-kernel or directory
	3211	* topology visibility). If we queue new IO while trying to
	3212	* destroy the inode we can deadlock the vtrunc call in
	3213	* hammer_inode_unloadable_check().
	3214	*
	3215	* Besides, there's no point flushing a bp associated with an
	3216	* inode that is being destroyed on-media and has no kernel
	3217	* references.
	3218	*/
	3219	if ((ip->flags \| ip->sync_flags) &
	3220	(HAMMER_INODE_DELETING\|HAMMER_INODE_DELETED)) {
	3221	bp->b_resid = 0;
	3222	biodone(ap->a_bio);
	3223	lwkt_reltoken(&hmp->fs_token);
	3224	return(0);
	3225	}
	3226
	3227	/*
	3228	* Reserve space and issue a direct-write from the front-end.
	3229	* NOTE: The direct_io code will hammer_bread/bcopy smaller
	3230	* allocations.
	3231	*
	3232	* An in-memory record will be installed to reference the storage
	3233	* until the flusher can get to it.
	3234	*
	3235	* Since we own the high level bio the front-end will not try to
	3236	* do a direct-read until the write completes.
	3237	*
	3238	* NOTE: The only time we do not reserve a full-sized buffers
	3239	* worth of data is if the file is small. We do not try to
	3240	* allocate a fragment (from the small-data zone) at the end of
	3241	* an otherwise large file as this can lead to wildly separated
	3242	* data.
	3243	*/
	3244	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	3245	KKASSERT(bio->bio_offset < ip->ino_data.size);
	3246	if (bio->bio_offset \|\| ip->ino_data.size > HAMMER_HBUFSIZE)
	3247	bytes = bp->b_bufsize;
	3248	else
	3249	bytes = HAMMER_DATA_DOALIGN_WITH(int, ip->ino_data.size);
	3250
	3251	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
	3252	bytes, &error);
	3253
	3254	/*
	3255	* B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	3256	* in hammer_vop_write(). We must flag the record so the proper
	3257	* REDO_TERM_WRITE entry is generated during the flush.
	3258	*/
	3259	if (record) {
	3260	if (bp->b_flags & B_VFSFLAG1) {
	3261	record->flags \|= HAMMER_RECF_REDO;
	3262	bp->b_flags &= ~B_VFSFLAG1;
	3263	}
	3264	hammer_io_direct_write(hmp, bio, record);
	3265	if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
	3266	hammer_flush_inode(ip, 0);
	3267	} else {
	3268	bp->b_bio2.bio_offset = NOOFFSET;
	3269	bp->b_error = error;
	3270	bp->b_flags \|= B_ERROR;
	3271	biodone(ap->a_bio);
	3272	}
	3273	lwkt_reltoken(&hmp->fs_token);
	3274	return(error);
	3275	}
	3276
	3277	/*
	3278	* dounlink - disconnect a directory entry
	3279	*
	3280	* XXX whiteout support not really in yet
	3281	*/
	3282	static int
	3283	hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
	3284	struct vnode dvp, struct ucred cred,
	3285	int flags, int isdir)
	3286	{
	3287	struct namecache *ncp;
	3288	hammer_inode_t dip;
	3289	hammer_inode_t ip;
	3290	hammer_mount_t hmp;
	3291	struct hammer_cursor cursor;
	3292	int64_t namekey;
	3293	uint32_t max_iterations;
	3294	int nlen, error;
	3295
	3296	/*
	3297	* Calculate the namekey and setup the key range for the scan. This
	3298	* works kinda like a chained hash table where the lower 32 bits
	3299	* of the namekey synthesize the chain.
	3300	*
	3301	* The key range is inclusive of both key_beg and key_end.
	3302	*/
	3303	dip = VTOI(dvp);
	3304	ncp = nch->ncp;
	3305	hmp = dip->hmp;
	3306
	3307	if (dip->flags & HAMMER_INODE_RO)
	3308	return (EROFS);
	3309
	3310	namekey = hammer_direntry_namekey(dip, ncp->nc_name, ncp->nc_nlen,
	3311	&max_iterations);
	3312	retry:
	3313	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	3314	cursor.key_beg.localization = dip->obj_localization \|
	3315	hammer_dir_localization(dip);
	3316	cursor.key_beg.obj_id = dip->obj_id;
	3317	cursor.key_beg.key = namekey;
	3318	cursor.key_beg.create_tid = 0;
	3319	cursor.key_beg.delete_tid = 0;
	3320	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	3321	cursor.key_beg.obj_type = 0;
	3322
	3323	cursor.key_end = cursor.key_beg;
	3324	cursor.key_end.key += max_iterations;
	3325	cursor.asof = dip->obj_asof;
	3326	cursor.flags \|= HAMMER_CURSOR_END_INCLUSIVE \| HAMMER_CURSOR_ASOF;
	3327
	3328	/*
	3329	* Scan all matching records (the chain), locate the one matching
	3330	* the requested path component. info->last_error contains the
	3331	* error code on search termination and could be 0, ENOENT, or
	3332	* something else.
	3333	*
	3334	* The hammer_ip_*() functions merge in-memory records with on-disk
	3335	* records for the purposes of the search.
	3336	*/
	3337	error = hammer_ip_first(&cursor);
	3338
	3339	while (error == 0) {
	3340	error = hammer_ip_resolve_data(&cursor);
	3341	if (error)
	3342	break;
	3343	nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
	3344	KKASSERT(nlen > 0);
	3345	if (ncp->nc_nlen == nlen &&
	3346	bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
	3347	break;
	3348	}
	3349	error = hammer_ip_next(&cursor);
	3350	}
	3351
	3352	/*
	3353	* If all is ok we have to get the inode so we can adjust nlinks.
	3354	* To avoid a deadlock with the flusher we must release the inode
	3355	* lock on the directory when acquiring the inode for the entry.
	3356	*
	3357	* If the target is a directory, it must be empty.
	3358	*/
	3359	if (error == 0) {
	3360	hammer_unlock(&cursor.ip->lock);
	3361	ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
	3362	hmp->asof,
	3363	cursor.data->entry.localization,
	3364	0, &error);
	3365	hammer_lock_sh(&cursor.ip->lock);
	3366	if (error == ENOENT) {
	3367	hkprintf("WARNING: Removing dirent w/missing inode "
	3368	"\"%s\"\n"
	3369	"\tobj_id = %016jx\n",
	3370	ncp->nc_name,
	3371	(intmax_t)cursor.data->entry.obj_id);
	3372	error = 0;
	3373	}
	3374
	3375	/*
	3376	* If isdir >= 0 we validate that the entry is or is not a
	3377	* directory. If isdir < 0 we don't care.
	3378	*/
	3379	if (error == 0 && isdir >= 0 && ip) {
	3380	if (isdir &&
	3381	ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
	3382	error = ENOTDIR;
	3383	} else if (isdir == 0 &&
	3384	ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
	3385	error = EISDIR;
	3386	}
	3387	}
	3388
	3389	/*
	3390	* If we are trying to remove a directory the directory must
	3391	* be empty.
	3392	*
	3393	* The check directory code can loop and deadlock/retry. Our
	3394	* own cursor's node locks must be released to avoid a 3-way
	3395	* deadlock with the flusher if the check directory code
	3396	* blocks.
	3397	*
	3398	* If any changes whatsoever have been made to the cursor
	3399	* set EDEADLK and retry.
	3400	*
	3401	* WARNING: See warnings in hammer_unlock_cursor()
	3402	* function.
	3403	*/
	3404	if (error == 0 && ip && ip->ino_data.obj_type ==
	3405	HAMMER_OBJTYPE_DIRECTORY) {
	3406	hammer_unlock_cursor(&cursor);
	3407	error = hammer_ip_check_directory_empty(trans, ip);
	3408	hammer_lock_cursor(&cursor);
	3409	if (cursor.flags & HAMMER_CURSOR_RETEST) {
	3410	hkprintf("Warning: avoided deadlock "
	3411	"on rmdir '%s'\n",
	3412	ncp->nc_name);
	3413	error = EDEADLK;
	3414	}
	3415	}
	3416
	3417	/*
	3418	* Delete the directory entry.
	3419	*
	3420	* WARNING: hammer_ip_del_direntry() may have to terminate
	3421	* the cursor to avoid a deadlock. It is ok to call
	3422	* hammer_done_cursor() twice.
	3423	*/
	3424	if (error == 0) {
	3425	error = hammer_ip_del_direntry(trans, &cursor,
	3426	dip, ip);
	3427	}
	3428	hammer_done_cursor(&cursor);
	3429	if (error == 0) {
	3430	/*
	3431	* Tell the namecache that we are now unlinked.
	3432	*/
	3433	cache_unlink(nch);
	3434
	3435	/*
	3436	* NOTE: ip->vp, if non-NULL, cannot be directly
	3437	* referenced without formally acquiring the
	3438	* vp since the vp might have zero refs on it,
	3439	* or in the middle of a reclaim, etc.
	3440	*
	3441	* NOTE: The cache_setunresolved() can rip the vp
	3442	* out from under us since the vp may not have
	3443	* any refs, in which case ip->vp will be NULL
	3444	* from the outset.
	3445	*/
	3446	while (ip && ip->vp) {
	3447	struct vnode *vp;
	3448
	3449	error = hammer_get_vnode(ip, &vp);
	3450	if (error == 0 && vp) {
	3451	vn_unlock(vp);
	3452	hammer_knote(ip->vp, NOTE_DELETE);
	3453	#if 0
	3454	/*
	3455	* Don't do this, it can deadlock
	3456	* on concurrent rm's of hardlinks.
	3457	* Shouldn't be needed any more.
	3458	*/
	3459	cache_inval_vp(ip->vp, CINV_DESTROY);
	3460	#endif
	3461	vrele(vp);
	3462	break;
	3463	}
	3464	hdkprintf("ip/vp race1 avoided\n");
	3465	}
	3466	}
	3467	if (ip)
	3468	hammer_rel_inode(ip, 0);
	3469	} else {
	3470	hammer_done_cursor(&cursor);
	3471	}
	3472	if (error == EDEADLK)
	3473	goto retry;
	3474
	3475	return (error);
	3476	}
	3477
	3478	/************************************************************************
	3479	* FIFO AND SPECFS OPS *
	3480	************************************************************************
	3481	*
	3482	*/
	3483	static int
	3484	hammer_vop_fifoclose (struct vop_close_args *ap)
	3485	{
	3486	/* XXX update itimes */
	3487	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
	3488	}
	3489
	3490	static int
	3491	hammer_vop_fiforead (struct vop_read_args *ap)
	3492	{
	3493	int error;
	3494
	3495	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3496	/* XXX update access time */
	3497	return (error);
	3498	}
	3499
	3500	static int
	3501	hammer_vop_fifowrite (struct vop_write_args *ap)
	3502	{
	3503	int error;
	3504
	3505	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3506	/* XXX update access time */
	3507	return (error);
	3508	}
	3509
	3510	static
	3511	int
	3512	hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
	3513	{
	3514	int error;
	3515
	3516	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	3517	if (error)
	3518	error = hammer_vop_kqfilter(ap);
	3519	return(error);
	3520	}
	3521
	3522	/************************************************************************
	3523	* KQFILTER OPS *
	3524	************************************************************************
	3525	*
	3526	*/
	3527	static void filt_hammerdetach(struct knote *kn);
	3528	static int filt_hammerread(struct knote *kn, long hint);
	3529	static int filt_hammerwrite(struct knote *kn, long hint);
	3530	static int filt_hammervnode(struct knote *kn, long hint);
	3531
	3532	static struct filterops hammerread_filtops =
	3533	{ FILTEROP_ISFD \| FILTEROP_MPSAFE,
	3534	NULL, filt_hammerdetach, filt_hammerread };
	3535	static struct filterops hammerwrite_filtops =
	3536	{ FILTEROP_ISFD \| FILTEROP_MPSAFE,
	3537	NULL, filt_hammerdetach, filt_hammerwrite };
	3538	static struct filterops hammervnode_filtops =
	3539	{ FILTEROP_ISFD \| FILTEROP_MPSAFE,
	3540	NULL, filt_hammerdetach, filt_hammervnode };
	3541
	3542	static
	3543	int
	3544	hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
	3545	{
	3546	struct vnode *vp = ap->a_vp;
	3547	struct knote *kn = ap->a_kn;
	3548
	3549	switch (kn->kn_filter) {
	3550	case EVFILT_READ:
	3551	kn->kn_fop = &hammerread_filtops;
	3552	break;
	3553	case EVFILT_WRITE:
	3554	kn->kn_fop = &hammerwrite_filtops;
	3555	break;
	3556	case EVFILT_VNODE:
	3557	kn->kn_fop = &hammervnode_filtops;
	3558	break;
	3559	default:
	3560	return (EOPNOTSUPP);
	3561	}
	3562
	3563	kn->kn_hook = (caddr_t)vp;
	3564
	3565	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
	3566
	3567	return(0);
	3568	}
	3569
	3570	static void
	3571	filt_hammerdetach(struct knote *kn)
	3572	{
	3573	struct vnode vp = (void )kn->kn_hook;
	3574
	3575	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
	3576	}
	3577
	3578	static int
	3579	filt_hammerread(struct knote *kn, long hint)
	3580	{
	3581	struct vnode vp = (void )kn->kn_hook;
	3582	hammer_inode_t ip = VTOI(vp);
	3583	hammer_mount_t hmp = ip->hmp;
	3584	off_t off;
	3585
	3586	if (hint == NOTE_REVOKE) {
	3587	kn->kn_flags \|= (EV_EOF \| EV_NODATA \| EV_ONESHOT);
	3588	return(1);
	3589	}
	3590	lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */
	3591	off = ip->ino_data.size - kn->kn_fp->f_offset;
	3592	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	3593	lwkt_reltoken(&hmp->fs_token);
	3594	if (kn->kn_sfflags & NOTE_OLDAPI)
	3595	return(1);
	3596	return (kn->kn_data != 0);
	3597	}
	3598
	3599	static int
	3600	filt_hammerwrite(struct knote *kn, long hint)
	3601	{
	3602	if (hint == NOTE_REVOKE)
	3603	kn->kn_flags \|= (EV_EOF \| EV_NODATA \| EV_ONESHOT);
	3604	kn->kn_data = 0;
	3605	return (1);
	3606	}
	3607
	3608	static int
	3609	filt_hammervnode(struct knote *kn, long hint)
	3610	{
	3611	if (kn->kn_sfflags & hint)
	3612	kn->kn_fflags \|= hint;
	3613	if (hint == NOTE_REVOKE) {
	3614	kn->kn_flags \|= (EV_EOF \| EV_NODATA);
	3615	return (1);
	3616	}
	3617	return (kn->kn_fflags != 0);
	3618	}
	3619