gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.93 2008/07/04 07:25:36 dillon Exp $
	35	*/
	36
	37	#include "hammer.h"
	38	#include <vm/vm_extern.h>
	39	#include <sys/buf.h>
	40	#include <sys/buf2.h>
	41
	42	static int hammer_unload_inode(struct hammer_inode *ip);
	43	static void hammer_free_inode(hammer_inode_t ip);
	44	static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
	45	static int hammer_setup_child_callback(hammer_record_t rec, void *data);
	46	static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
	47	static int hammer_setup_parent_inodes(hammer_inode_t ip);
	48	static int hammer_setup_parent_inodes_helper(hammer_record_t record);
	49	static void hammer_inode_wakereclaims(hammer_inode_t ip);
	50
	51	#ifdef DEBUG_TRUNCATE
	52	extern struct hammer_inode *HammerTruncIp;
	53	#endif
	54
	55	/*
	56	* RB-Tree support for inode structures
	57	*/
	58	int
	59	hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
	60	{
	61	if (ip1->obj_localization < ip2->obj_localization)
	62	return(-1);
	63	if (ip1->obj_localization > ip2->obj_localization)
	64	return(1);
	65	if (ip1->obj_id < ip2->obj_id)
	66	return(-1);
	67	if (ip1->obj_id > ip2->obj_id)
	68	return(1);
	69	if (ip1->obj_asof < ip2->obj_asof)
	70	return(-1);
	71	if (ip1->obj_asof > ip2->obj_asof)
	72	return(1);
	73	return(0);
	74	}
	75
	76	/*
	77	* RB-Tree support for inode structures / special LOOKUP_INFO
	78	*/
	79	static int
	80	hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
	81	{
	82	if (info->obj_localization < ip->obj_localization)
	83	return(-1);
	84	if (info->obj_localization > ip->obj_localization)
	85	return(1);
	86	if (info->obj_id < ip->obj_id)
	87	return(-1);
	88	if (info->obj_id > ip->obj_id)
	89	return(1);
	90	if (info->obj_asof < ip->obj_asof)
	91	return(-1);
	92	if (info->obj_asof > ip->obj_asof)
	93	return(1);
	94	return(0);
	95	}
	96
	97	/*
	98	* Used by hammer_scan_inode_snapshots() to locate all of an object's
	99	* snapshots. Note that the asof field is not tested, which we can get
	100	* away with because it is the lowest-priority field.
	101	*/
	102	static int
	103	hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
	104	{
	105	hammer_inode_info_t info = data;
	106
	107	if (ip->obj_localization > info->obj_localization)
	108	return(1);
	109	if (ip->obj_localization < info->obj_localization)
	110	return(-1);
	111	if (ip->obj_id > info->obj_id)
	112	return(1);
	113	if (ip->obj_id < info->obj_id)
	114	return(-1);
	115	return(0);
	116	}
	117
	118	/*
	119	* RB-Tree support for pseudofs structures
	120	*/
	121	static int
	122	hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
	123	{
	124	if (p1->localization < p2->localization)
	125	return(-1);
	126	if (p1->localization > p2->localization)
	127	return(1);
	128	return(0);
	129	}
	130
	131
	132	RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
	133	RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
	134	hammer_inode_info_cmp, hammer_inode_info_t);
	135	RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
	136	hammer_pfs_rb_compare, u_int32_t, localization);
	137
	138	/*
	139	* The kernel is not actively referencing this vnode but is still holding
	140	* it cached.
	141	*
	142	* This is called from the frontend.
	143	*/
	144	int
	145	hammer_vop_inactive(struct vop_inactive_args *ap)
	146	{
	147	struct hammer_inode *ip = VTOI(ap->a_vp);
	148
	149	/*
	150	* Degenerate case
	151	*/
	152	if (ip == NULL) {
	153	vrecycle(ap->a_vp);
	154	return(0);
	155	}
	156
	157	/*
	158	* If the inode no longer has visibility in the filesystem try to
	159	* recycle it immediately, even if the inode is dirty. Recycling
	160	* it quickly allows the system to reclaim buffer cache and VM
	161	* resources which can matter a lot in a heavily loaded system.
	162	*
	163	* This can deadlock in vfsync() if we aren't careful.
	164	*
	165	* Do not queue the inode to the flusher if we still have visibility,
	166	* otherwise namespace calls such as chmod will unnecessarily generate
	167	* multiple inode updates.
	168	*/
	169	hammer_inode_unloadable_check(ip, 0);
	170	if (ip->ino_data.nlinks == 0) {
	171	if (ip->flags & HAMMER_INODE_MODMASK)
	172	hammer_flush_inode(ip, 0);
	173	vrecycle(ap->a_vp);
	174	}
	175	return(0);
	176	}
	177
	178	/*
	179	* Release the vnode association. This is typically (but not always)
	180	* the last reference on the inode.
	181	*
	182	* Once the association is lost we are on our own with regards to
	183	* flushing the inode.
	184	*/
	185	int
	186	hammer_vop_reclaim(struct vop_reclaim_args *ap)
	187	{
	188	struct hammer_inode *ip;
	189	hammer_mount_t hmp;
	190	struct vnode *vp;
	191
	192	vp = ap->a_vp;
	193
	194	if ((ip = vp->v_data) != NULL) {
	195	hmp = ip->hmp;
	196	vp->v_data = NULL;
	197	ip->vp = NULL;
	198
	199	if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
	200	++hammer_count_reclaiming;
	201	++hmp->inode_reclaims;
	202	ip->flags \|= HAMMER_INODE_RECLAIM;
	203	if (hmp->inode_reclaims > HAMMER_RECLAIM_FLUSH &&
	204	(hmp->inode_reclaims & 255) == 0) {
	205	hammer_flusher_async(hmp);
	206	}
	207	}
	208	hammer_rel_inode(ip, 1);
	209	}
	210	return(0);
	211	}
	212
	213	/*
	214	* Return a locked vnode for the specified inode. The inode must be
	215	* referenced but NOT LOCKED on entry and will remain referenced on
	216	* return.
	217	*
	218	* Called from the frontend.
	219	*/
	220	int
	221	hammer_get_vnode(struct hammer_inode ip, struct vnode *vpp)
	222	{
	223	hammer_mount_t hmp;
	224	struct vnode *vp;
	225	int error = 0;
	226
	227	hmp = ip->hmp;
	228
	229	for (;;) {
	230	if ((vp = ip->vp) == NULL) {
	231	error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
	232	if (error)
	233	break;
	234	hammer_lock_ex(&ip->lock);
	235	if (ip->vp != NULL) {
	236	hammer_unlock(&ip->lock);
	237	vp->v_type = VBAD;
	238	vx_put(vp);
	239	continue;
	240	}
	241	hammer_ref(&ip->lock);
	242	vp = *vpp;
	243	ip->vp = vp;
	244	vp->v_type =
	245	hammer_get_vnode_type(ip->ino_data.obj_type);
	246
	247	hammer_inode_wakereclaims(ip);
	248
	249	switch(ip->ino_data.obj_type) {
	250	case HAMMER_OBJTYPE_CDEV:
	251	case HAMMER_OBJTYPE_BDEV:
	252	vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
	253	addaliasu(vp, ip->ino_data.rmajor,
	254	ip->ino_data.rminor);
	255	break;
	256	case HAMMER_OBJTYPE_FIFO:
	257	vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
	258	break;
	259	default:
	260	break;
	261	}
	262
	263	/*
	264	* Only mark as the root vnode if the ip is not
	265	* historical, otherwise the VFS cache will get
	266	* confused. The other half of the special handling
	267	* is in hammer_vop_nlookupdotdot().
	268	*
	269	* Pseudo-filesystem roots also do not count.
	270	*/
	271	if (ip->obj_id == HAMMER_OBJID_ROOT &&
	272	ip->obj_asof == hmp->asof &&
	273	ip->obj_localization == 0) {
	274	vp->v_flag \|= VROOT;
	275	}
	276
	277	vp->v_data = (void *)ip;
	278	/* vnode locked by getnewvnode() */
	279	/* make related vnode dirty if inode dirty? */
	280	hammer_unlock(&ip->lock);
	281	if (vp->v_type == VREG)
	282	vinitvmio(vp, ip->ino_data.size);
	283	break;
	284	}
	285
	286	/*
	287	* loop if the vget fails (aka races), or if the vp
	288	* no longer matches ip->vp.
	289	*/
	290	if (vget(vp, LK_EXCLUSIVE) == 0) {
	291	if (vp == ip->vp)
	292	break;
	293	vput(vp);
	294	}
	295	}
	296	*vpp = vp;
	297	return(error);
	298	}
	299
	300	/*
	301	* Locate all copies of the inode for obj_id compatible with the specified
	302	* asof, reference, and issue the related call-back. This routine is used
	303	* for direct-io invalidation and does not create any new inodes.
	304	*/
	305	void
	306	hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
	307	int (callback)(hammer_inode_t ip, void data),
	308	void *data)
	309	{
	310	hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
	311	hammer_inode_info_cmp_all_history,
	312	callback, iinfo);
	313	}
	314
	315	/*
	316	* Acquire a HAMMER inode. The returned inode is not locked. These functions
	317	* do not attach or detach the related vnode (use hammer_get_vnode() for
	318	* that).
	319	*
	320	* The flags argument is only applied for newly created inodes, and only
	321	* certain flags are inherited.
	322	*
	323	* Called from the frontend.
	324	*/
	325	struct hammer_inode *
	326	hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
	327	u_int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
	328	int flags, int *errorp)
	329	{
	330	hammer_mount_t hmp = trans->hmp;
	331	struct hammer_inode_info iinfo;
	332	struct hammer_cursor cursor;
	333	struct hammer_inode *ip;
	334
	335
	336	/*
	337	* Determine if we already have an inode cached. If we do then
	338	* we are golden.
	339	*/
	340	iinfo.obj_id = obj_id;
	341	iinfo.obj_asof = asof;
	342	iinfo.obj_localization = localization;
	343	loop:
	344	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	345	if (ip) {
	346	hammer_ref(&ip->lock);
	347	*errorp = 0;
	348	return(ip);
	349	}
	350
	351	/*
	352	* Allocate a new inode structure and deal with races later.
	353	*/
	354	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK\|M_ZERO);
	355	++hammer_count_inodes;
	356	++hmp->count_inodes;
	357	ip->obj_id = obj_id;
	358	ip->obj_asof = iinfo.obj_asof;
	359	ip->obj_localization = localization;
	360	ip->hmp = hmp;
	361	ip->flags = flags & HAMMER_INODE_RO;
	362	ip->cache[0].ip = ip;
	363	ip->cache[1].ip = ip;
	364	if (hmp->ronly)
	365	ip->flags \|= HAMMER_INODE_RO;
	366	ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
	367	0x7FFFFFFFFFFFFFFFLL;
	368	RB_INIT(&ip->rec_tree);
	369	TAILQ_INIT(&ip->target_list);
	370	hammer_ref(&ip->lock);
	371
	372	/*
	373	* Locate the on-disk inode.
	374	*/
	375	retry:
	376	hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
	377	cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
	378	cursor.key_beg.obj_id = ip->obj_id;
	379	cursor.key_beg.key = 0;
	380	cursor.key_beg.create_tid = 0;
	381	cursor.key_beg.delete_tid = 0;
	382	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	383	cursor.key_beg.obj_type = 0;
	384	cursor.asof = iinfo.obj_asof;
	385	cursor.flags = HAMMER_CURSOR_GET_LEAF \| HAMMER_CURSOR_GET_DATA \|
	386	HAMMER_CURSOR_ASOF;
	387
	388	*errorp = hammer_btree_lookup(&cursor);
	389	if (*errorp == EDEADLK) {
	390	hammer_done_cursor(&cursor);
	391	goto retry;
	392	}
	393
	394	/*
	395	* On success the B-Tree lookup will hold the appropriate
	396	* buffer cache buffers and provide a pointer to the requested
	397	* information. Copy the information to the in-memory inode
	398	* and cache the B-Tree node to improve future operations.
	399	*/
	400	if (*errorp == 0) {
	401	ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
	402	ip->ino_data = cursor.data->inode;
	403
	404	/*
	405	* cache[0] tries to cache the location of the object inode.
	406	* The assumption is that it is near the directory inode.
	407	*
	408	* cache[1] tries to cache the location of the object data.
	409	* The assumption is that it is near the directory data.
	410	*/
	411	hammer_cache_node(&ip->cache[0], cursor.node);
	412	if (dip && dip->cache[1].node)
	413	hammer_cache_node(&ip->cache[1], dip->cache[1].node);
	414
	415	/*
	416	* The file should not contain any data past the file size
	417	* stored in the inode. Setting save_trunc_off to the
	418	* file size instead of max reduces B-Tree lookup overheads
	419	* on append by allowing the flusher to avoid checking for
	420	* record overwrites.
	421	*/
	422	ip->save_trunc_off = ip->ino_data.size;
	423
	424	/*
	425	* Locate and assign the pseudofs management structure to
	426	* the inode.
	427	*/
	428	if (dip && dip->obj_localization == ip->obj_localization) {
	429	ip->pfsm = dip->pfsm;
	430	hammer_ref(&ip->pfsm->lock);
	431	} else {
	432	*errorp = hammer_load_pseudofs(trans, ip);
	433	}
	434	}
	435
	436	/*
	437	* The inode is placed on the red-black tree and will be synced to
	438	* the media when flushed or by the filesystem sync. If this races
	439	* another instantiation/lookup the insertion will fail.
	440	*/
	441	if (*errorp == 0) {
	442	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
	443	hammer_free_inode(ip);
	444	hammer_done_cursor(&cursor);
	445	goto loop;
	446	}
	447	ip->flags \|= HAMMER_INODE_ONDISK;
	448	} else {
	449	if (ip->flags & HAMMER_INODE_RSV_INODES) {
	450	ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
	451	--hmp->rsv_inodes;
	452	}
	453
	454	hammer_free_inode(ip);
	455	ip = NULL;
	456	}
	457	hammer_done_cursor(&cursor);
	458	return (ip);
	459	}
	460
	461	/*
	462	* Create a new filesystem object, returning the inode in *ipp. The
	463	* returned inode will be referenced.
	464	*
	465	* The inode is created in-memory.
	466	*/
	467	int
	468	hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
	469	struct ucred *cred, hammer_inode_t dip,
	470	int pseudofs, struct hammer_inode **ipp)
	471	{
	472	hammer_mount_t hmp;
	473	hammer_inode_t ip;
	474	uid_t xuid;
	475	u_int32_t localization;
	476	int error;
	477
	478	hmp = trans->hmp;
	479
	480	/*
	481	* Assign the localization domain. If if dip is NULL we are creating
	482	* a pseudo-fs and must locate an unused localization domain.
	483	*/
	484	if (pseudofs) {
	485	for (localization = HAMMER_DEF_LOCALIZATION;
	486	localization < HAMMER_LOCALIZE_PSEUDOFS_MASK;
	487	localization += HAMMER_LOCALIZE_PSEUDOFS_INC) {
	488	ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
	489	hmp->asof, localization,
	490	0, &error);
	491	if (ip == NULL) {
	492	if (error != ENOENT)
	493	return(error);
	494	break;
	495	}
	496	if (ip)
	497	hammer_rel_inode(ip, 0);
	498	}
	499	} else {
	500	localization = dip->obj_localization;
	501	}
	502
	503	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK\|M_ZERO);
	504	++hammer_count_inodes;
	505	++hmp->count_inodes;
	506
	507	/*
	508	* Allocate a new object id. If creating a new pseudo-fs the
	509	* obj_id is 1.
	510	*/
	511	if (pseudofs)
	512	ip->obj_id = HAMMER_OBJID_ROOT;
	513	else
	514	ip->obj_id = hammer_alloc_objid(hmp, dip);
	515	ip->obj_localization = localization;
	516
	517	KKASSERT(ip->obj_id != 0);
	518	ip->obj_asof = hmp->asof;
	519	ip->hmp = hmp;
	520	ip->flush_state = HAMMER_FST_IDLE;
	521	ip->flags = HAMMER_INODE_DDIRTY \|
	522	HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME;
	523	ip->cache[0].ip = ip;
	524	ip->cache[1].ip = ip;
	525
	526	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	527	/* ip->save_trunc_off = 0; (already zero) */
	528	RB_INIT(&ip->rec_tree);
	529	TAILQ_INIT(&ip->target_list);
	530
	531	ip->ino_data.atime = trans->time;
	532	ip->ino_data.mtime = trans->time;
	533	ip->ino_data.size = 0;
	534	ip->ino_data.nlinks = 0;
	535
	536	/*
	537	* A nohistory designator on the parent directory is inherited by
	538	* the child. We will do this even for pseudo-fs creation... the
	539	* sysad can turn it off.
	540	*/
	541	ip->ino_data.uflags = dip->ino_data.uflags &
	542	(SF_NOHISTORY\|UF_NOHISTORY\|UF_NODUMP);
	543
	544	ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	545	ip->ino_leaf.base.localization = ip->obj_localization +
	546	HAMMER_LOCALIZE_INODE;
	547	ip->ino_leaf.base.obj_id = ip->obj_id;
	548	ip->ino_leaf.base.key = 0;
	549	ip->ino_leaf.base.create_tid = 0;
	550	ip->ino_leaf.base.delete_tid = 0;
	551	ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
	552	ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
	553
	554	ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
	555	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	556	ip->ino_data.mode = vap->va_mode;
	557	ip->ino_data.ctime = trans->time;
	558
	559	/*
	560	* Setup the ".." pointer. This only needs to be done for directories
	561	* but we do it for all objects as a recovery aid.
	562	*
	563	* The parent_obj_localization field only applies to pseudo-fs roots.
	564	*/
	565	ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
	566	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
	567	ip->obj_id == HAMMER_OBJID_ROOT) {
	568	ip->ino_data.ext.obj.parent_obj_localization =
	569	dip->obj_localization;
	570	}
	571
	572	switch(ip->ino_leaf.base.obj_type) {
	573	case HAMMER_OBJTYPE_CDEV:
	574	case HAMMER_OBJTYPE_BDEV:
	575	ip->ino_data.rmajor = vap->va_rmajor;
	576	ip->ino_data.rminor = vap->va_rminor;
	577	break;
	578	default:
	579	break;
	580	}
	581
	582	/*
	583	* Calculate default uid/gid and overwrite with information from
	584	* the vap.
	585	*/
	586	xuid = hammer_to_unix_xid(&dip->ino_data.uid);
	587	xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
	588	&vap->va_mode);
	589	ip->ino_data.mode = vap->va_mode;
	590
	591	if (vap->va_vaflags & VA_UID_UUID_VALID)
	592	ip->ino_data.uid = vap->va_uid_uuid;
	593	else if (vap->va_uid != (uid_t)VNOVAL)
	594	hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
	595	else
	596	hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
	597
	598	if (vap->va_vaflags & VA_GID_UUID_VALID)
	599	ip->ino_data.gid = vap->va_gid_uuid;
	600	else if (vap->va_gid != (gid_t)VNOVAL)
	601	hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
	602	else
	603	ip->ino_data.gid = dip->ino_data.gid;
	604
	605	hammer_ref(&ip->lock);
	606
	607	if (dip->obj_localization == ip->obj_localization) {
	608	ip->pfsm = dip->pfsm;
	609	hammer_ref(&ip->pfsm->lock);
	610	error = 0;
	611	} else {
	612	error = hammer_load_pseudofs(trans, ip);
	613	}
	614
	615	if (error) {
	616	hammer_free_inode(ip);
	617	ip = NULL;
	618	} else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
	619	panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
	620	/* not reached */
	621	hammer_free_inode(ip);
	622	}
	623	*ipp = ip;
	624	return(error);
	625	}
	626
	627	/*
	628	* Final cleanup / freeing of an inode structure
	629	*/
	630	static void
	631	hammer_free_inode(hammer_inode_t ip)
	632	{
	633	KKASSERT(ip->lock.refs == 1);
	634	hammer_uncache_node(&ip->cache[0]);
	635	hammer_uncache_node(&ip->cache[1]);
	636	hammer_inode_wakereclaims(ip);
	637	if (ip->objid_cache)
	638	hammer_clear_objid(ip);
	639	--hammer_count_inodes;
	640	--ip->hmp->count_inodes;
	641	if (ip->pfsm) {
	642	hammer_rel_pseudofs(ip->hmp, ip->pfsm);
	643	ip->pfsm = NULL;
	644	}
	645	kfree(ip, M_HAMMER);
	646	ip = NULL;
	647	}
	648
	649	/*
	650	* Retrieve pseudo-fs data.
	651	*/
	652	int
	653	hammer_load_pseudofs(hammer_transaction_t trans, hammer_inode_t ip)
	654	{
	655	hammer_mount_t hmp = trans->hmp;
	656	hammer_pseudofs_inmem_t pfsm;
	657	struct hammer_cursor cursor;
	658	int error;
	659	int bytes;
	660
	661	retry:
	662	pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root,
	663	ip->obj_localization);
	664	if (pfsm) {
	665	KKASSERT(ip->pfsm == NULL);
	666	ip->pfsm = pfsm;
	667	hammer_ref(&pfsm->lock);
	668	return(0);
	669	}
	670
	671	pfsm = kmalloc(sizeof(*pfsm), M_HAMMER, M_WAITOK \| M_ZERO);
	672	pfsm->localization = ip->obj_localization;
	673
	674	hammer_init_cursor(trans, &cursor, NULL, NULL);
	675	cursor.key_beg.localization = ip->obj_localization +
	676	HAMMER_LOCALIZE_MISC;
	677	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
	678	cursor.key_beg.create_tid = 0;
	679	cursor.key_beg.delete_tid = 0;
	680	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	681	cursor.key_beg.obj_type = 0;
	682	cursor.key_beg.key = HAMMER_FIXKEY_PSEUDOFS;
	683	cursor.asof = HAMMER_MAX_TID;
	684	cursor.flags \|= HAMMER_CURSOR_ASOF;
	685
	686	error = hammer_btree_lookup(&cursor);
	687	if (error == 0) {
	688	error = hammer_btree_extract(&cursor, HAMMER_CURSOR_GET_DATA);
	689	if (error == 0) {
	690	bytes = cursor.leaf->data_len;
	691	if (bytes > sizeof(pfsm->pfsd))
	692	bytes = sizeof(pfsm->pfsd);
	693	bcopy(cursor.data, &pfsm->pfsd, bytes);
	694	}
	695	} else if (error == ENOENT) {
	696	error = 0;
	697	}
	698
	699	hammer_done_cursor(&cursor);
	700
	701	if (error == 0) {
	702	hammer_ref(&pfsm->lock);
	703	if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
	704	kfree(pfsm, M_HAMMER);
	705	goto retry;
	706	}
	707	ip->pfsm = pfsm;
	708
	709	/*
	710	* Certain aspects of the pseudofs configuration are reflected
	711	* in the inode.
	712	*/
	713	if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
	714	ip->flags \|= HAMMER_INODE_RO;
	715	ip->flags \|= HAMMER_INODE_PFSD;
	716	if (ip->obj_asof > pfsm->pfsd.sync_beg_tid)
	717	ip->obj_asof = pfsm->pfsd.sync_beg_tid;
	718	} else if (pfsm->pfsd.master_id >= 0) {
	719	ip->flags \|= HAMMER_INODE_PFSD;
	720	}
	721	} else {
	722	kprintf("cannot load pfsm error %d\n", error);
	723	kfree(pfsm, M_HAMMER);
	724	}
	725	return(error);
	726	}
	727
	728	/*
	729	* Store pseudo-fs data. The backend will automatically delete any prior
	730	* on-disk pseudo-fs data but we have to delete in-memory versions.
	731	*/
	732	int
	733	hammer_save_pseudofs(hammer_transaction_t trans, hammer_inode_t ip)
	734	{
	735	struct hammer_cursor cursor;
	736	hammer_pseudofs_inmem_t pfsm;
	737	hammer_record_t record;
	738	int error;
	739
	740	retry:
	741	pfsm = ip->pfsm;
	742	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
	743	cursor.key_beg.localization = ip->obj_localization +
	744	HAMMER_LOCALIZE_MISC;
	745	cursor.key_beg.obj_id = ip->obj_id;
	746	cursor.key_beg.create_tid = 0;
	747	cursor.key_beg.delete_tid = 0;
	748	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	749	cursor.key_beg.obj_type = 0;
	750	cursor.key_beg.key = HAMMER_FIXKEY_PSEUDOFS;
	751	cursor.asof = HAMMER_MAX_TID;
	752	cursor.flags \|= HAMMER_CURSOR_ASOF;
	753
	754	error = hammer_ip_lookup(&cursor);
	755	if (error == 0 && hammer_cursor_inmem(&cursor)) {
	756	record = cursor.iprec;
	757	if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
	758	KKASSERT(cursor.deadlk_rec == NULL);
	759	hammer_ref(&record->lock);
	760	cursor.deadlk_rec = record;
	761	error = EDEADLK;
	762	} else {
	763	record->flags \|= HAMMER_RECF_DELETED_FE;
	764	error = 0;
	765	}
	766	}
	767	if (error == 0 \|\| error == ENOENT) {
	768	record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
	769	record->type = HAMMER_MEM_RECORD_GENERAL;
	770
	771	record->leaf.base.localization = ip->obj_localization +
	772	HAMMER_LOCALIZE_MISC;
	773	record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
	774	record->leaf.base.key = HAMMER_FIXKEY_PSEUDOFS;
	775	record->leaf.data_len = sizeof(pfsm->pfsd);
	776	bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
	777	error = hammer_ip_add_record(trans, record);
	778	}
	779	hammer_done_cursor(&cursor);
	780	if (error == EDEADLK)
	781	goto retry;
	782	if (error == 0) {
	783	/*
	784	* Certain aspects of the pseudofs configuration are reflected
	785	* in the inode. Note that we cannot mess with the as-of or
	786	* clear the read-only state.
	787	*
	788	* If this inode represented a slave snapshot its asof will
	789	* be set to a snapshot tid. When clearing slave mode any
	790	* re-access of the inode via the parent directory will
	791	* wind up using a different asof and thus will instantiate
	792	* a new inode.
	793	*/
	794	if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
	795	ip->flags \|= HAMMER_INODE_RO;
	796	ip->flags \|= HAMMER_INODE_PFSD;
	797	} else if (pfsm->pfsd.master_id >= 0) {
	798	ip->flags \|= HAMMER_INODE_PFSD;
	799	} else {
	800	ip->flags &= ~HAMMER_INODE_PFSD;
	801	}
	802	}
	803	return(error);
	804	}
	805
	806	void
	807	hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
	808	{
	809	hammer_unref(&pfsm->lock);
	810	if (pfsm->lock.refs == 0) {
	811	RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
	812	kfree(pfsm, M_HAMMER);
	813	}
	814	}
	815
	816	/*
	817	* Called by hammer_sync_inode().
	818	*/
	819	static int
	820	hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
	821	{
	822	hammer_transaction_t trans = cursor->trans;
	823	hammer_record_t record;
	824	int error;
	825	int redirty;
	826
	827	retry:
	828	error = 0;
	829
	830	/*
	831	* If the inode has a presence on-disk then locate it and mark
	832	* it deleted, setting DELONDISK.
	833	*
	834	* The record may or may not be physically deleted, depending on
	835	* the retention policy.
	836	*/
	837	if ((ip->flags & (HAMMER_INODE_ONDISK\|HAMMER_INODE_DELONDISK)) ==
	838	HAMMER_INODE_ONDISK) {
	839	hammer_normalize_cursor(cursor);
	840	cursor->key_beg.localization = ip->obj_localization +
	841	HAMMER_LOCALIZE_INODE;
	842	cursor->key_beg.obj_id = ip->obj_id;
	843	cursor->key_beg.key = 0;
	844	cursor->key_beg.create_tid = 0;
	845	cursor->key_beg.delete_tid = 0;
	846	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	847	cursor->key_beg.obj_type = 0;
	848	cursor->asof = ip->obj_asof;
	849	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	850	cursor->flags \|= HAMMER_CURSOR_GET_LEAF \| HAMMER_CURSOR_ASOF;
	851	cursor->flags \|= HAMMER_CURSOR_BACKEND;
	852
	853	error = hammer_btree_lookup(cursor);
	854	if (hammer_debug_inode)
	855	kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
	856	if (error) {
	857	kprintf("error %d\n", error);
	858	Debugger("hammer_update_inode");
	859	}
	860
	861	if (error == 0) {
	862	error = hammer_ip_delete_record(cursor, ip, trans->tid);
	863	if (hammer_debug_inode)
	864	kprintf(" error %d\n", error);
	865	if (error && error != EDEADLK) {
	866	kprintf("error %d\n", error);
	867	Debugger("hammer_update_inode2");
	868	}
	869	if (error == 0) {
	870	ip->flags \|= HAMMER_INODE_DELONDISK;
	871	}
	872	if (cursor->node)
	873	hammer_cache_node(&ip->cache[0], cursor->node);
	874	}
	875	if (error == EDEADLK) {
	876	hammer_done_cursor(cursor);
	877	error = hammer_init_cursor(trans, cursor,
	878	&ip->cache[0], ip);
	879	if (hammer_debug_inode)
	880	kprintf("IPDED %p %d\n", ip, error);
	881	if (error == 0)
	882	goto retry;
	883	}
	884	}
	885
	886	/*
	887	* Ok, write out the initial record or a new record (after deleting
	888	* the old one), unless the DELETED flag is set. This routine will
	889	* clear DELONDISK if it writes out a record.
	890	*
	891	* Update our inode statistics if this is the first application of
	892	* the inode on-disk.
	893	*/
	894	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
	895	/*
	896	* Generate a record and write it to the media
	897	*/
	898	record = hammer_alloc_mem_record(ip, 0);
	899	record->type = HAMMER_MEM_RECORD_INODE;
	900	record->flush_state = HAMMER_FST_FLUSH;
	901	record->leaf = ip->sync_ino_leaf;
	902	record->leaf.base.create_tid = trans->tid;
	903	record->leaf.data_len = sizeof(ip->sync_ino_data);
	904	record->leaf.create_ts = trans->time32;
	905	record->data = (void *)&ip->sync_ino_data;
	906	record->flags \|= HAMMER_RECF_INTERLOCK_BE;
	907
	908	/*
	909	* If this flag is set we cannot sync the new file size
	910	* because we haven't finished related truncations. The
	911	* inode will be flushed in another flush group to finish
	912	* the job.
	913	*/
	914	if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
	915	ip->sync_ino_data.size != ip->ino_data.size) {
	916	redirty = 1;
	917	ip->sync_ino_data.size = ip->ino_data.size;
	918	} else {
	919	redirty = 0;
	920	}
	921
	922	for (;;) {
	923	error = hammer_ip_sync_record_cursor(cursor, record);
	924	if (hammer_debug_inode)
	925	kprintf("GENREC %p rec %08x %d\n",
	926	ip, record->flags, error);
	927	if (error != EDEADLK)
	928	break;
	929	hammer_done_cursor(cursor);
	930	error = hammer_init_cursor(trans, cursor,
	931	&ip->cache[0], ip);
	932	if (hammer_debug_inode)
	933	kprintf("GENREC reinit %d\n", error);
	934	if (error)
	935	break;
	936	}
	937	if (error) {
	938	kprintf("error %d\n", error);
	939	Debugger("hammer_update_inode3");
	940	}
	941
	942	/*
	943	* The record isn't managed by the inode's record tree,
	944	* destroy it whether we succeed or fail.
	945	*/
	946	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	947	record->flags \|= HAMMER_RECF_DELETED_FE;
	948	record->flush_state = HAMMER_FST_IDLE;
	949	hammer_rel_mem_record(record);
	950
	951	/*
	952	* Finish up.
	953	*/
	954	if (error == 0) {
	955	if (hammer_debug_inode)
	956	kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
	957	ip->sync_flags &= ~(HAMMER_INODE_DDIRTY \|
	958	HAMMER_INODE_ATIME \|
	959	HAMMER_INODE_MTIME);
	960	ip->flags &= ~HAMMER_INODE_DELONDISK;
	961	if (redirty)
	962	ip->sync_flags \|= HAMMER_INODE_DDIRTY;
	963
	964	/*
	965	* Root volume count of inodes
	966	*/
	967	if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
	968	hammer_modify_volume_field(trans,
	969	trans->rootvol,
	970	vol0_stat_inodes);
	971	++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
	972	hammer_modify_volume_done(trans->rootvol);
	973	ip->flags \|= HAMMER_INODE_ONDISK;
	974	if (hammer_debug_inode)
	975	kprintf("NOWONDISK %p\n", ip);
	976	}
	977	}
	978	}
	979
	980	/*
	981	* If the inode has been destroyed, clean out any left-over flags
	982	* that may have been set by the frontend.
	983	*/
	984	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
	985	ip->sync_flags &= ~(HAMMER_INODE_DDIRTY \|
	986	HAMMER_INODE_ATIME \|
	987	HAMMER_INODE_MTIME);
	988	}
	989	return(error);
	990	}
	991
	992	/*
	993	* Update only the itimes fields.
	994	*
	995	* ATIME can be updated without generating any UNDO. MTIME is updated
	996	* with UNDO so it is guaranteed to be synchronized properly in case of
	997	* a crash.
	998	*
	999	* Neither field is included in the B-Tree leaf element's CRC, which is how
	1000	* we can get away with updating ATIME the way we do.
	1001	*/
	1002	static int
	1003	hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
	1004	{
	1005	hammer_transaction_t trans = cursor->trans;
	1006	int error;
	1007
	1008	retry:
	1009	if ((ip->flags & (HAMMER_INODE_ONDISK\|HAMMER_INODE_DELONDISK)) !=
	1010	HAMMER_INODE_ONDISK) {
	1011	return(0);
	1012	}
	1013
	1014	hammer_normalize_cursor(cursor);
	1015	cursor->key_beg.localization = ip->obj_localization +
	1016	HAMMER_LOCALIZE_INODE;
	1017	cursor->key_beg.obj_id = ip->obj_id;
	1018	cursor->key_beg.key = 0;
	1019	cursor->key_beg.create_tid = 0;
	1020	cursor->key_beg.delete_tid = 0;
	1021	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	1022	cursor->key_beg.obj_type = 0;
	1023	cursor->asof = ip->obj_asof;
	1024	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	1025	cursor->flags \|= HAMMER_CURSOR_ASOF;
	1026	cursor->flags \|= HAMMER_CURSOR_GET_LEAF;
	1027	cursor->flags \|= HAMMER_CURSOR_GET_DATA;
	1028	cursor->flags \|= HAMMER_CURSOR_BACKEND;
	1029
	1030	error = hammer_btree_lookup(cursor);
	1031	if (error) {
	1032	kprintf("error %d\n", error);
	1033	Debugger("hammer_update_itimes1");
	1034	}
	1035	if (error == 0) {
	1036	hammer_cache_node(&ip->cache[0], cursor->node);
	1037	if (ip->sync_flags & HAMMER_INODE_MTIME) {
	1038	/*
	1039	* Updating MTIME requires an UNDO. Just cover
	1040	* both atime and mtime.
	1041	*/
	1042	hammer_modify_buffer(trans, cursor->data_buffer,
	1043	HAMMER_ITIMES_BASE(&cursor->data->inode),
	1044	HAMMER_ITIMES_BYTES);
	1045	cursor->data->inode.atime = ip->sync_ino_data.atime;
	1046	cursor->data->inode.mtime = ip->sync_ino_data.mtime;
	1047	hammer_modify_buffer_done(cursor->data_buffer);
	1048	} else if (ip->sync_flags & HAMMER_INODE_ATIME) {
	1049	/*
	1050	* Updating atime only can be done in-place with
	1051	* no UNDO.
	1052	*/
	1053	hammer_modify_buffer(trans, cursor->data_buffer,
	1054	NULL, 0);
	1055	cursor->data->inode.atime = ip->sync_ino_data.atime;
	1056	hammer_modify_buffer_done(cursor->data_buffer);
	1057	}
	1058	ip->sync_flags &= ~(HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME);
	1059	}
	1060	if (error == EDEADLK) {
	1061	hammer_done_cursor(cursor);
	1062	error = hammer_init_cursor(trans, cursor,
	1063	&ip->cache[0], ip);
	1064	if (error == 0)
	1065	goto retry;
	1066	}
	1067	return(error);
	1068	}
	1069
	1070	/*
	1071	* Release a reference on an inode, flush as requested.
	1072	*
	1073	* On the last reference we queue the inode to the flusher for its final
	1074	* disposition.
	1075	*/
	1076	void
	1077	hammer_rel_inode(struct hammer_inode *ip, int flush)
	1078	{
	1079	hammer_mount_t hmp = ip->hmp;
	1080
	1081	/*
	1082	* Handle disposition when dropping the last ref.
	1083	*/
	1084	for (;;) {
	1085	if (ip->lock.refs == 1) {
	1086	/*
	1087	* Determine whether on-disk action is needed for
	1088	* the inode's final disposition.
	1089	*/
	1090	KKASSERT(ip->vp == NULL);
	1091	hammer_inode_unloadable_check(ip, 0);
	1092	if (ip->flags & HAMMER_INODE_MODMASK) {
	1093	if (hmp->rsv_inodes > desiredvnodes) {
	1094	hammer_flush_inode(ip,
	1095	HAMMER_FLUSH_SIGNAL);
	1096	} else {
	1097	hammer_flush_inode(ip, 0);
	1098	}
	1099	} else if (ip->lock.refs == 1) {
	1100	hammer_unload_inode(ip);
	1101	break;
	1102	}
	1103	} else {
	1104	if (flush)
	1105	hammer_flush_inode(ip, 0);
	1106
	1107	/*
	1108	* The inode still has multiple refs, try to drop
	1109	* one ref.
	1110	*/
	1111	KKASSERT(ip->lock.refs >= 1);
	1112	if (ip->lock.refs > 1) {
	1113	hammer_unref(&ip->lock);
	1114	break;
	1115	}
	1116	}
	1117	}
	1118	}
	1119
	1120	/*
	1121	* Unload and destroy the specified inode. Must be called with one remaining
	1122	* reference. The reference is disposed of.
	1123	*
	1124	* This can only be called in the context of the flusher.
	1125	*/
	1126	static int
	1127	hammer_unload_inode(struct hammer_inode *ip)
	1128	{
	1129	hammer_mount_t hmp = ip->hmp;
	1130
	1131	KASSERT(ip->lock.refs == 1,
	1132	("hammer_unload_inode: %d refs\n", ip->lock.refs));
	1133	KKASSERT(ip->vp == NULL);
	1134	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	1135	KKASSERT(ip->cursor_ip_refs == 0);
	1136	KKASSERT(ip->lock.lockcount == 0);
	1137	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
	1138
	1139	KKASSERT(RB_EMPTY(&ip->rec_tree));
	1140	KKASSERT(TAILQ_EMPTY(&ip->target_list));
	1141
	1142	RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
	1143
	1144	hammer_free_inode(ip);
	1145	return(0);
	1146	}
	1147
	1148	/*
	1149	* Called on mount -u when switching from RW to RO or vise-versa. Adjust
	1150	* the read-only flag for cached inodes.
	1151	*
	1152	* This routine is called from a RB_SCAN().
	1153	*/
	1154	int
	1155	hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
	1156	{
	1157	hammer_mount_t hmp = ip->hmp;
	1158
	1159	if (hmp->ronly \|\| hmp->asof != HAMMER_MAX_TID)
	1160	ip->flags \|= HAMMER_INODE_RO;
	1161	else
	1162	ip->flags &= ~HAMMER_INODE_RO;
	1163	return(0);
	1164	}
	1165
	1166	/*
	1167	* A transaction has modified an inode, requiring updates as specified by
	1168	* the passed flags.
	1169	*
	1170	* HAMMER_INODE_DDIRTY: Inode data has been updated
	1171	* HAMMER_INODE_XDIRTY: Dirty in-memory records
	1172	* HAMMER_INODE_BUFS: Dirty buffer cache buffers
	1173	* HAMMER_INODE_DELETED: Inode record/data must be deleted
	1174	* HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
	1175	*/
	1176	void
	1177	hammer_modify_inode(hammer_inode_t ip, int flags)
	1178	{
	1179	KKASSERT(ip->hmp->ronly == 0 \|\|
	1180	(flags & (HAMMER_INODE_DDIRTY \| HAMMER_INODE_XDIRTY \|
	1181	HAMMER_INODE_BUFS \| HAMMER_INODE_DELETED \|
	1182	HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME)) == 0);
	1183	if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
	1184	ip->flags \|= HAMMER_INODE_RSV_INODES;
	1185	++ip->hmp->rsv_inodes;
	1186	}
	1187
	1188	ip->flags \|= flags;
	1189	}
	1190
	1191	/*
	1192	* Request that an inode be flushed. This whole mess cannot block and may
	1193	* recurse (if not synchronous). Once requested HAMMER will attempt to
	1194	* actively flush the inode until the flush can be done.
	1195	*
	1196	* The inode may already be flushing, or may be in a setup state. We can
	1197	* place the inode in a flushing state if it is currently idle and flag it
	1198	* to reflush if it is currently flushing.
	1199	*
	1200	* If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
	1201	* flush the indoe synchronously using the caller's context.
	1202	*/
	1203	void
	1204	hammer_flush_inode(hammer_inode_t ip, int flags)
	1205	{
	1206	int good;
	1207
	1208	/*
	1209	* Trivial 'nothing to flush' case. If the inode is ina SETUP
	1210	* state we have to put it back into an IDLE state so we can
	1211	* drop the extra ref.
	1212	*/
	1213	if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
	1214	if (ip->flush_state == HAMMER_FST_SETUP) {
	1215	ip->flush_state = HAMMER_FST_IDLE;
	1216	hammer_rel_inode(ip, 0);
	1217	}
	1218	return;
	1219	}
	1220
	1221	/*
	1222	* Our flush action will depend on the current state.
	1223	*/
	1224	switch(ip->flush_state) {
	1225	case HAMMER_FST_IDLE:
	1226	/*
	1227	* We have no dependancies and can flush immediately. Some
	1228	* our children may not be flushable so we have to re-test
	1229	* with that additional knowledge.
	1230	*/
	1231	hammer_flush_inode_core(ip, flags);
	1232	break;
	1233	case HAMMER_FST_SETUP:
	1234	/*
	1235	* Recurse upwards through dependancies via target_list
	1236	* and start their flusher actions going if possible.
	1237	*
	1238	* 'good' is our connectivity. -1 means we have none and
	1239	* can't flush, 0 means there weren't any dependancies, and
	1240	* 1 means we have good connectivity.
	1241	*/
	1242	good = hammer_setup_parent_inodes(ip);
	1243
	1244	/*
	1245	* We can continue if good >= 0. Determine how many records
	1246	* under our inode can be flushed (and mark them).
	1247	*/
	1248	if (good >= 0) {
	1249	hammer_flush_inode_core(ip, flags);
	1250	} else {
	1251	ip->flags \|= HAMMER_INODE_REFLUSH;
	1252	if (flags & HAMMER_FLUSH_SIGNAL) {
	1253	ip->flags \|= HAMMER_INODE_RESIGNAL;
	1254	hammer_flusher_async(ip->hmp);
	1255	}
	1256	}
	1257	break;
	1258	default:
	1259	/*
	1260	* We are already flushing, flag the inode to reflush
	1261	* if needed after it completes its current flush.
	1262	*/
	1263	if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
	1264	ip->flags \|= HAMMER_INODE_REFLUSH;
	1265	if (flags & HAMMER_FLUSH_SIGNAL) {
	1266	ip->flags \|= HAMMER_INODE_RESIGNAL;
	1267	hammer_flusher_async(ip->hmp);
	1268	}
	1269	break;
	1270	}
	1271	}
	1272
	1273	/*
	1274	* Scan ip->target_list, which is a list of records owned by PARENTS to our
	1275	* ip which reference our ip.
	1276	*
	1277	* XXX This is a huge mess of recursive code, but not one bit of it blocks
	1278	* so for now do not ref/deref the structures. Note that if we use the
	1279	* ref/rel code later, the rel CAN block.
	1280	*/
	1281	static int
	1282	hammer_setup_parent_inodes(hammer_inode_t ip)
	1283	{
	1284	hammer_record_t depend;
	1285	#if 0
	1286	hammer_record_t next;
	1287	hammer_inode_t pip;
	1288	#endif
	1289	int good;
	1290	int r;
	1291
	1292	good = 0;
	1293	TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
	1294	r = hammer_setup_parent_inodes_helper(depend);
	1295	KKASSERT(depend->target_ip == ip);
	1296	if (r < 0 && good == 0)
	1297	good = -1;
	1298	if (r > 0)
	1299	good = 1;
	1300	}
	1301	return(good);
	1302
	1303	#if 0
	1304	retry:
	1305	good = 0;
	1306	next = TAILQ_FIRST(&ip->target_list);
	1307	if (next) {
	1308	hammer_ref(&next->lock);
	1309	hammer_ref(&next->ip->lock);
	1310	}
	1311	while ((depend = next) != NULL) {
	1312	if (depend->target_ip == NULL) {
	1313	pip = depend->ip;
	1314	hammer_rel_mem_record(depend);
	1315	hammer_rel_inode(pip, 0);
	1316	goto retry;
	1317	}
	1318	KKASSERT(depend->target_ip == ip);
	1319	next = TAILQ_NEXT(depend, target_entry);
	1320	if (next) {
	1321	hammer_ref(&next->lock);
	1322	hammer_ref(&next->ip->lock);
	1323	}
	1324	r = hammer_setup_parent_inodes_helper(depend);
	1325	if (r < 0 && good == 0)
	1326	good = -1;
	1327	if (r > 0)
	1328	good = 1;
	1329	pip = depend->ip;
	1330	hammer_rel_mem_record(depend);
	1331	hammer_rel_inode(pip, 0);
	1332	}
	1333	return(good);
	1334	#endif
	1335	}
	1336
	1337	/*
	1338	* This helper function takes a record representing the dependancy between
	1339	* the parent inode and child inode.
	1340	*
	1341	* record->ip = parent inode
	1342	* record->target_ip = child inode
	1343	*
	1344	* We are asked to recurse upwards and convert the record from SETUP
	1345	* to FLUSH if possible.
	1346	*
	1347	* Return 1 if the record gives us connectivity
	1348	*
	1349	* Return 0 if the record is not relevant
	1350	*
	1351	* Return -1 if we can't resolve the dependancy and there is no connectivity.
	1352	*/
	1353	static int
	1354	hammer_setup_parent_inodes_helper(hammer_record_t record)
	1355	{
	1356	hammer_mount_t hmp;
	1357	hammer_inode_t pip;
	1358	int good;
	1359
	1360	KKASSERT(record->flush_state != HAMMER_FST_IDLE);
	1361	pip = record->ip;
	1362	hmp = pip->hmp;
	1363
	1364	/*
	1365	* If the record is already flushing, is it in our flush group?
	1366	*
	1367	* If it is in our flush group but it is a general record or a
	1368	* delete-on-disk, it does not improve our connectivity (return 0),
	1369	* and if the target inode is not trying to destroy itself we can't
	1370	* allow the operation yet anyway (the second return -1).
	1371	*/
	1372	if (record->flush_state == HAMMER_FST_FLUSH) {
	1373	if (record->flush_group != hmp->flusher.next) {
	1374	pip->flags \|= HAMMER_INODE_REFLUSH;
	1375	return(-1);
	1376	}
	1377	if (record->type == HAMMER_MEM_RECORD_ADD)
	1378	return(1);
	1379	/* GENERAL or DEL */
	1380	return(0);
	1381	}
	1382
	1383	/*
	1384	* It must be a setup record. Try to resolve the setup dependancies
	1385	* by recursing upwards so we can place ip on the flush list.
	1386	*/
	1387	KKASSERT(record->flush_state == HAMMER_FST_SETUP);
	1388
	1389	good = hammer_setup_parent_inodes(pip);
	1390
	1391	/*
	1392	* We can't flush ip because it has no connectivity (XXX also check
	1393	* nlinks for pre-existing connectivity!). Flag it so any resolution
	1394	* recurses back down.
	1395	*/
	1396	if (good < 0) {
	1397	pip->flags \|= HAMMER_INODE_REFLUSH;
	1398	return(good);
	1399	}
	1400
	1401	/*
	1402	* We are go, place the parent inode in a flushing state so we can
	1403	* place its record in a flushing state. Note that the parent
	1404	* may already be flushing. The record must be in the same flush
	1405	* group as the parent.
	1406	*/
	1407	if (pip->flush_state != HAMMER_FST_FLUSH)
	1408	hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
	1409	KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
	1410	KKASSERT(record->flush_state == HAMMER_FST_SETUP);
	1411
	1412	#if 0
	1413	if (record->type == HAMMER_MEM_RECORD_DEL &&
	1414	(record->target_ip->flags & (HAMMER_INODE_DELETED\|HAMMER_INODE_DELONDISK)) == 0) {
	1415	/*
	1416	* Regardless of flushing state we cannot sync this path if the
	1417	* record represents a delete-on-disk but the target inode
	1418	* is not ready to sync its own deletion.
	1419	*
	1420	* XXX need to count effective nlinks to determine whether
	1421	* the flush is ok, otherwise removing a hardlink will
	1422	* just leave the DEL record to rot.
	1423	*/
	1424	record->target_ip->flags \|= HAMMER_INODE_REFLUSH;
	1425	return(-1);
	1426	} else
	1427	#endif
	1428	if (pip->flush_group == pip->hmp->flusher.next) {
	1429	/*
	1430	* This is the record we wanted to synchronize. If the
	1431	* record went into a flush state while we blocked it
	1432	* had better be in the correct flush group.
	1433	*/
	1434	if (record->flush_state != HAMMER_FST_FLUSH) {
	1435	record->flush_state = HAMMER_FST_FLUSH;
	1436	record->flush_group = pip->flush_group;
	1437	hammer_ref(&record->lock);
	1438	} else {
	1439	KKASSERT(record->flush_group == pip->flush_group);
	1440	}
	1441	if (record->type == HAMMER_MEM_RECORD_ADD)
	1442	return(1);
	1443
	1444	/*
	1445	* A general or delete-on-disk record does not contribute
	1446	* to our visibility. We can still flush it, however.
	1447	*/
	1448	return(0);
	1449	} else {
	1450	/*
	1451	* We couldn't resolve the dependancies, request that the
	1452	* inode be flushed when the dependancies can be resolved.
	1453	*/
	1454	pip->flags \|= HAMMER_INODE_REFLUSH;
	1455	return(-1);
	1456	}
	1457	}
	1458
	1459	/*
	1460	* This is the core routine placing an inode into the FST_FLUSH state.
	1461	*/
	1462	static void
	1463	hammer_flush_inode_core(hammer_inode_t ip, int flags)
	1464	{
	1465	int go_count;
	1466
	1467	/*
	1468	* Set flush state and prevent the flusher from cycling into
	1469	* the next flush group. Do not place the ip on the list yet.
	1470	* Inodes not in the idle state get an extra reference.
	1471	*/
	1472	KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
	1473	if (ip->flush_state == HAMMER_FST_IDLE)
	1474	hammer_ref(&ip->lock);
	1475	ip->flush_state = HAMMER_FST_FLUSH;
	1476	ip->flush_group = ip->hmp->flusher.next;
	1477	++ip->hmp->flusher.group_lock;
	1478	++ip->hmp->count_iqueued;
	1479	++hammer_count_iqueued;
	1480
	1481	/*
	1482	* We need to be able to vfsync/truncate from the backend.
	1483	*/
	1484	KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
	1485	if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
	1486	ip->flags \|= HAMMER_INODE_VHELD;
	1487	vref(ip->vp);
	1488	}
	1489
	1490	/*
	1491	* Figure out how many in-memory records we can actually flush
	1492	* (not including inode meta-data, buffers, etc).
	1493	*
	1494	* Do not add new records to the flush if this is a recursion or
	1495	* if we must still complete a flush from the previous flush cycle.
	1496	*/
	1497	if (flags & HAMMER_FLUSH_RECURSION) {
	1498	go_count = 1;
	1499	} else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
	1500	go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
	1501	hammer_syncgrp_child_callback, NULL);
	1502	go_count = 1;
	1503	} else {
	1504	go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
	1505	hammer_setup_child_callback, NULL);
	1506	}
	1507
	1508	/*
	1509	* This is a more involved test that includes go_count. If we
	1510	* can't flush, flag the inode and return. If go_count is 0 we
	1511	* were are unable to flush any records in our rec_tree and
	1512	* must ignore the XDIRTY flag.
	1513	*/
	1514	if (go_count == 0) {
	1515	if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
	1516	ip->flags \|= HAMMER_INODE_REFLUSH;
	1517
	1518	--ip->hmp->count_iqueued;
	1519	--hammer_count_iqueued;
	1520
	1521	ip->flush_state = HAMMER_FST_SETUP;
	1522	if (ip->flags & HAMMER_INODE_VHELD) {
	1523	ip->flags &= ~HAMMER_INODE_VHELD;
	1524	vrele(ip->vp);
	1525	}
	1526	if (flags & HAMMER_FLUSH_SIGNAL) {
	1527	ip->flags \|= HAMMER_INODE_RESIGNAL;
	1528	hammer_flusher_async(ip->hmp);
	1529	}
	1530	if (--ip->hmp->flusher.group_lock == 0)
	1531	wakeup(&ip->hmp->flusher.group_lock);
	1532	return;
	1533	}
	1534	}
	1535
	1536	/*
	1537	* Snapshot the state of the inode for the backend flusher.
	1538	*
	1539	* We continue to retain save_trunc_off even when all truncations
	1540	* have been resolved as an optimization to determine if we can
	1541	* skip the B-Tree lookup for overwrite deletions.
	1542	*
	1543	* NOTE: The DELETING flag is a mod flag, but it is also sticky,
	1544	* and stays in ip->flags. Once set, it stays set until the
	1545	* inode is destroyed.
	1546	*
	1547	* NOTE: If a truncation from a previous flush cycle had to be
	1548	* continued into this one, the TRUNCATED flag will still be
	1549	* set in sync_flags as will WOULDBLOCK. When this occurs
	1550	* we CANNOT safely integrate a new truncation from the front-end
	1551	* because there may be data records in-memory assigned a flush
	1552	* state from the previous cycle that are supposed to be flushed
	1553	* before the next frontend truncation.
	1554	*/
	1555	if ((ip->flags & (HAMMER_INODE_TRUNCATED \| HAMMER_INODE_WOULDBLOCK)) ==
	1556	HAMMER_INODE_TRUNCATED) {
	1557	KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
	1558	ip->sync_trunc_off = ip->trunc_off;
	1559	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	1560	ip->flags &= ~HAMMER_INODE_TRUNCATED;
	1561	ip->sync_flags \|= HAMMER_INODE_TRUNCATED;
	1562
	1563	/*
	1564	* The save_trunc_off used to cache whether the B-Tree
	1565	* holds any records past that point is not used until
	1566	* after the truncation has succeeded, so we can safely
	1567	* set it now.
	1568	*/
	1569	if (ip->save_trunc_off > ip->sync_trunc_off)
	1570	ip->save_trunc_off = ip->sync_trunc_off;
	1571	}
	1572	ip->sync_flags \|= (ip->flags & HAMMER_INODE_MODMASK &
	1573	~HAMMER_INODE_TRUNCATED);
	1574	ip->sync_ino_leaf = ip->ino_leaf;
	1575	ip->sync_ino_data = ip->ino_data;
	1576	ip->flags &= ~HAMMER_INODE_MODMASK \| HAMMER_INODE_TRUNCATED;
	1577	#ifdef DEBUG_TRUNCATE
	1578	if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
	1579	kprintf("truncateS %016llx\n", ip->sync_trunc_off);
	1580	#endif
	1581
	1582	/*
	1583	* The flusher list inherits our inode and reference.
	1584	*/
	1585	TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
	1586	if (--ip->hmp->flusher.group_lock == 0)
	1587	wakeup(&ip->hmp->flusher.group_lock);
	1588
	1589	if (flags & HAMMER_FLUSH_SIGNAL) {
	1590	hammer_flusher_async(ip->hmp);
	1591	}
	1592	}
	1593
	1594	/*
	1595	* Callback for scan of ip->rec_tree. Try to include each record in our
	1596	* flush. ip->flush_group has been set but the inode has not yet been
	1597	* moved into a flushing state.
	1598	*
	1599	* If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
	1600	* both inodes.
	1601	*
	1602	* We return 1 for any record placed or found in FST_FLUSH, which prevents
	1603	* the caller from shortcutting the flush.
	1604	*/
	1605	static int
	1606	hammer_setup_child_callback(hammer_record_t rec, void *data)
	1607	{
	1608	hammer_inode_t target_ip;
	1609	hammer_inode_t ip;
	1610	int r;
	1611
	1612	/*
	1613	* Deleted records are ignored. Note that the flush detects deleted
	1614	* front-end records at multiple points to deal with races. This is
	1615	* just the first line of defense. The only time DELETED_FE cannot
	1616	* be set is when HAMMER_RECF_INTERLOCK_BE is set.
	1617	*
	1618	* Don't get confused between record deletion and, say, directory
	1619	* entry deletion. The deletion of a directory entry that is on
	1620	* the media has nothing to do with the record deletion flags.
	1621	*
	1622	* The flush_group for a record already in a flush state must
	1623	* be updated. This case can only occur if the inode deleting
	1624	* too many records had to be moved to the next flush group.
	1625	*/
	1626	if (rec->flags & (HAMMER_RECF_DELETED_FE\|HAMMER_RECF_DELETED_BE)) {
	1627	if (rec->flush_state == HAMMER_FST_FLUSH) {
	1628	KKASSERT(rec->ip->flags & HAMMER_INODE_WOULDBLOCK);
	1629	rec->flush_group = rec->ip->flush_group;
	1630	r = 1;
	1631	} else {
	1632	r = 0;
	1633	}
	1634	return(r);
	1635	}
	1636
	1637	/*
	1638	* If the record is in an idle state it has no dependancies and
	1639	* can be flushed.
	1640	*/
	1641	ip = rec->ip;
	1642	r = 0;
	1643
	1644	switch(rec->flush_state) {
	1645	case HAMMER_FST_IDLE:
	1646	/*
	1647	* Record has no setup dependancy, we can flush it.
	1648	*/
	1649	KKASSERT(rec->target_ip == NULL);
	1650	rec->flush_state = HAMMER_FST_FLUSH;
	1651	rec->flush_group = ip->flush_group;
	1652	hammer_ref(&rec->lock);
	1653	r = 1;
	1654	break;
	1655	case HAMMER_FST_SETUP:
	1656	/*
	1657	* Record has a setup dependancy. Try to include the
	1658	* target ip in the flush.
	1659	*
	1660	* We have to be careful here, if we do not do the right
	1661	* thing we can lose track of dirty inodes and the system
	1662	* will lockup trying to allocate buffers.
	1663	*/
	1664	target_ip = rec->target_ip;
	1665	KKASSERT(target_ip != NULL);
	1666	KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
	1667	if (target_ip->flush_state == HAMMER_FST_FLUSH) {
	1668	/*
	1669	* If the target IP is already flushing in our group
	1670	* we are golden, otherwise make sure the target
	1671	* reflushes.
	1672	*/
	1673	if (target_ip->flush_group == ip->flush_group) {
	1674	rec->flush_state = HAMMER_FST_FLUSH;
	1675	rec->flush_group = ip->flush_group;
	1676	hammer_ref(&rec->lock);
	1677	r = 1;
	1678	} else {
	1679	target_ip->flags \|= HAMMER_INODE_REFLUSH;
	1680	}
	1681	} else if (rec->type == HAMMER_MEM_RECORD_ADD) {
	1682	/*
	1683	* If the target IP is not flushing we can force
	1684	* it to flush, even if it is unable to write out
	1685	* any of its own records we have at least one in
	1686	* hand that we CAN deal with.
	1687	*/
	1688	rec->flush_state = HAMMER_FST_FLUSH;
	1689	rec->flush_group = ip->flush_group;
	1690	hammer_ref(&rec->lock);
	1691	hammer_flush_inode_core(target_ip,
	1692	HAMMER_FLUSH_RECURSION);
	1693	r = 1;
	1694	} else {
	1695	/*
	1696	* General or delete-on-disk record.
	1697	*
	1698	* XXX this needs help. If a delete-on-disk we could
	1699	* disconnect the target. If the target has its own
	1700	* dependancies they really need to be flushed.
	1701	*
	1702	* XXX
	1703	*/
	1704	rec->flush_state = HAMMER_FST_FLUSH;
	1705	rec->flush_group = ip->flush_group;
	1706	hammer_ref(&rec->lock);
	1707	hammer_flush_inode_core(target_ip,
	1708	HAMMER_FLUSH_RECURSION);
	1709	r = 1;
	1710	}
	1711	break;
	1712	case HAMMER_FST_FLUSH:
	1713	/*
	1714	* If the WOULDBLOCK flag is set records may have been left
	1715	* over from a previous flush attempt and should be moved
	1716	* to the current flush group. If it is not set then all
	1717	* such records had better have been flushed already or
	1718	* already associated with the current flush group.
	1719	*/
	1720	if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
	1721	rec->flush_group = ip->flush_group;
	1722	} else {
	1723	KKASSERT(rec->flush_group == ip->flush_group);
	1724	}
	1725	r = 1;
	1726	break;
	1727	}
	1728	return(r);
	1729	}
	1730
	1731	/*
	1732	* This version just moves records already in a flush state to the new
	1733	* flush group and that is it.
	1734	*/
	1735	static int
	1736	hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
	1737	{
	1738	hammer_inode_t ip = rec->ip;
	1739
	1740	switch(rec->flush_state) {
	1741	case HAMMER_FST_FLUSH:
	1742	if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
	1743	rec->flush_group = ip->flush_group;
	1744	} else {
	1745	KKASSERT(rec->flush_group == ip->flush_group);
	1746	}
	1747	break;
	1748	default:
	1749	break;
	1750	}
	1751	return(0);
	1752	}
	1753
	1754	/*
	1755	* Wait for a previously queued flush to complete. Not only do we need to
	1756	* wait for the inode to sync out, we also may have to run the flusher again
	1757	* to get it past the UNDO position pertaining to the flush so a crash does
	1758	* not 'undo' our flush.
	1759	*/
	1760	void
	1761	hammer_wait_inode(hammer_inode_t ip)
	1762	{
	1763	hammer_mount_t hmp = ip->hmp;
	1764	int sync_group;
	1765	int waitcount;
	1766
	1767	sync_group = ip->flush_group;
	1768	waitcount = (ip->flags & HAMMER_INODE_REFLUSH) ? 2 : 1;
	1769
	1770	if (ip->flush_state == HAMMER_FST_SETUP) {
	1771	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	1772	}
	1773	/* XXX can we make this != FST_IDLE ? check SETUP depends */
	1774	while (ip->flush_state == HAMMER_FST_FLUSH &&
	1775	(ip->flush_group - sync_group) < waitcount) {
	1776	ip->flags \|= HAMMER_INODE_FLUSHW;
	1777	tsleep(&ip->flags, 0, "hmrwin", 0);
	1778	}
	1779	while (hmp->flusher.done - sync_group < waitcount) {
	1780	kprintf("Y");
	1781	hammer_flusher_sync(hmp);
	1782	}
	1783	}
	1784
	1785	/*
	1786	* Called by the backend code when a flush has been completed.
	1787	* The inode has already been removed from the flush list.
	1788	*
	1789	* A pipelined flush can occur, in which case we must re-enter the
	1790	* inode on the list and re-copy its fields.
	1791	*/
	1792	void
	1793	hammer_flush_inode_done(hammer_inode_t ip)
	1794	{
	1795	hammer_mount_t hmp;
	1796	int dorel;
	1797
	1798	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
	1799
	1800	hmp = ip->hmp;
	1801
	1802	/*
	1803	* Merge left-over flags back into the frontend and fix the state.
	1804	* Incomplete truncations are retained by the backend.
	1805	*/
	1806	ip->flags \|= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
	1807	ip->sync_flags &= HAMMER_INODE_TRUNCATED;
	1808
	1809	/*
	1810	* The backend may have adjusted nlinks, so if the adjusted nlinks
	1811	* does not match the fronttend set the frontend's RDIRTY flag again.
	1812	*/
	1813	if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
	1814	ip->flags \|= HAMMER_INODE_DDIRTY;
	1815
	1816	/*
	1817	* Fix up the dirty buffer status.
	1818	*/
	1819	if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
	1820	ip->flags \|= HAMMER_INODE_BUFS;
	1821	}
	1822
	1823	/*
	1824	* Re-set the XDIRTY flag if some of the inode's in-memory records
	1825	* could not be flushed.
	1826	*/
	1827	KKASSERT((RB_EMPTY(&ip->rec_tree) &&
	1828	(ip->flags & HAMMER_INODE_XDIRTY) == 0) \|\|
	1829	(!RB_EMPTY(&ip->rec_tree) &&
	1830	(ip->flags & HAMMER_INODE_XDIRTY) != 0));
	1831
	1832	/*
	1833	* Do not lose track of inodes which no longer have vnode
	1834	* assocations, otherwise they may never get flushed again.
	1835	*/
	1836	if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
	1837	ip->flags \|= HAMMER_INODE_REFLUSH;
	1838
	1839	/*
	1840	* Clean up the vnode ref
	1841	*/
	1842	if (ip->flags & HAMMER_INODE_VHELD) {
	1843	ip->flags &= ~HAMMER_INODE_VHELD;
	1844	vrele(ip->vp);
	1845	}
	1846
	1847	/*
	1848	* Adjust flush_state. The target state (idle or setup) shouldn't
	1849	* be terribly important since we will reflush if we really need
	1850	* to do anything.
	1851	*
	1852	* If the WOULDBLOCK flag is set we must re-flush immediately
	1853	* to continue a potentially large deletion. The flag also causes
	1854	* the hammer_setup_child_callback() to move records in the old
	1855	* flush group to the new one.
	1856	*/
	1857	if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
	1858	ip->flush_state = HAMMER_FST_IDLE;
	1859	hammer_flush_inode_core(ip, HAMMER_FLUSH_SIGNAL);
	1860	ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
	1861	dorel = 1;
	1862	} else if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
	1863	ip->flush_state = HAMMER_FST_IDLE;
	1864	dorel = 1;
	1865	} else {
	1866	ip->flush_state = HAMMER_FST_SETUP;
	1867	dorel = 0;
	1868	}
	1869
	1870	--hmp->count_iqueued;
	1871	--hammer_count_iqueued;
	1872
	1873	/*
	1874	* If the frontend made more changes and requested another flush,
	1875	* then try to get it running.
	1876	*/
	1877	if (ip->flags & HAMMER_INODE_REFLUSH) {
	1878	ip->flags &= ~HAMMER_INODE_REFLUSH;
	1879	if (ip->flags & HAMMER_INODE_RESIGNAL) {
	1880	ip->flags &= ~HAMMER_INODE_RESIGNAL;
	1881	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	1882	} else {
	1883	hammer_flush_inode(ip, 0);
	1884	}
	1885	}
	1886
	1887	/*
	1888	* If the inode is now clean drop the space reservation.
	1889	*/
	1890	if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	1891	(ip->flags & HAMMER_INODE_RSV_INODES)) {
	1892	ip->flags &= ~HAMMER_INODE_RSV_INODES;
	1893	--hmp->rsv_inodes;
	1894	}
	1895
	1896	/*
	1897	* Finally, if the frontend is waiting for a flush to complete,
	1898	* wake it up.
	1899	*/
	1900	if (ip->flush_state != HAMMER_FST_FLUSH) {
	1901	if (ip->flags & HAMMER_INODE_FLUSHW) {
	1902	ip->flags &= ~HAMMER_INODE_FLUSHW;
	1903	wakeup(&ip->flags);
	1904	}
	1905	}
	1906	if (dorel)
	1907	hammer_rel_inode(ip, 0);
	1908	}
	1909
	1910	/*
	1911	* Called from hammer_sync_inode() to synchronize in-memory records
	1912	* to the media.
	1913	*/
	1914	static int
	1915	hammer_sync_record_callback(hammer_record_t record, void *data)
	1916	{
	1917	hammer_cursor_t cursor = data;
	1918	hammer_transaction_t trans = cursor->trans;
	1919	int error;
	1920
	1921	/*
	1922	* Skip records that do not belong to the current flush.
	1923	*/
	1924	++hammer_stats_record_iterations;
	1925	if (record->flush_state != HAMMER_FST_FLUSH)
	1926	return(0);
	1927
	1928	#if 1
	1929	if (record->flush_group != record->ip->flush_group) {
	1930	kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
	1931	Debugger("blah2");
	1932	return(0);
	1933	}
	1934	#endif
	1935	KKASSERT(record->flush_group == record->ip->flush_group);
	1936
	1937	/*
	1938	* Interlock the record using the BE flag. Once BE is set the
	1939	* frontend cannot change the state of FE.
	1940	*
	1941	* NOTE: If FE is set prior to us setting BE we still sync the
	1942	* record out, but the flush completion code converts it to
	1943	* a delete-on-disk record instead of destroying it.
	1944	*/
	1945	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	1946	record->flags \|= HAMMER_RECF_INTERLOCK_BE;
	1947
	1948	/*
	1949	* The backend may have already disposed of the record.
	1950	*/
	1951	if (record->flags & HAMMER_RECF_DELETED_BE) {
	1952	error = 0;
	1953	goto done;
	1954	}
	1955
	1956	/*
	1957	* If the whole inode is being deleting all on-disk records will
	1958	* be deleted very soon, we can't sync any new records to disk
	1959	* because they will be deleted in the same transaction they were
	1960	* created in (delete_tid == create_tid), which will assert.
	1961	*
	1962	* XXX There may be a case with RECORD_ADD with DELETED_FE set
	1963	* that we currently panic on.
	1964	*/
	1965	if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
	1966	switch(record->type) {
	1967	case HAMMER_MEM_RECORD_DATA:
	1968	/*
	1969	* We don't have to do anything, if the record was
	1970	* committed the space will have been accounted for
	1971	* in the blockmap.
	1972	*/
	1973	/* fall through */
	1974	case HAMMER_MEM_RECORD_GENERAL:
	1975	record->flags \|= HAMMER_RECF_DELETED_FE;
	1976	record->flags \|= HAMMER_RECF_DELETED_BE;
	1977	error = 0;
	1978	goto done;
	1979	case HAMMER_MEM_RECORD_ADD:
	1980	panic("hammer_sync_record_callback: illegal add "
	1981	"during inode deletion record %p", record);
	1982	break; /* NOT REACHED */
	1983	case HAMMER_MEM_RECORD_INODE:
	1984	panic("hammer_sync_record_callback: attempt to "
	1985	"sync inode record %p?", record);
	1986	break; /* NOT REACHED */
	1987	case HAMMER_MEM_RECORD_DEL:
	1988	/*
	1989	* Follow through and issue the on-disk deletion
	1990	*/
	1991	break;
	1992	}
	1993	}
	1994
	1995	/*
	1996	* If DELETED_FE is set special handling is needed for directory
	1997	* entries. Dependant pieces related to the directory entry may
	1998	* have already been synced to disk. If this occurs we have to
	1999	* sync the directory entry and then change the in-memory record
	2000	* from an ADD to a DELETE to cover the fact that it's been
	2001	* deleted by the frontend.
	2002	*
	2003	* A directory delete covering record (MEM_RECORD_DEL) can never
	2004	* be deleted by the frontend.
	2005	*
	2006	* Any other record type (aka DATA) can be deleted by the frontend.
	2007	* XXX At the moment the flusher must skip it because there may
	2008	* be another data record in the flush group for the same block,
	2009	* meaning that some frontend data changes can leak into the backend's
	2010	* synchronization point.
	2011	*/
	2012	if (record->flags & HAMMER_RECF_DELETED_FE) {
	2013	if (record->type == HAMMER_MEM_RECORD_ADD) {
	2014	record->flags \|= HAMMER_RECF_CONVERT_DELETE;
	2015	} else {
	2016	KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
	2017	record->flags \|= HAMMER_RECF_DELETED_BE;
	2018	error = 0;
	2019	goto done;
	2020	}
	2021	}
	2022
	2023	/*
	2024	* Assign the create_tid for new records. Deletions already
	2025	* have the record's entire key properly set up.
	2026	*/
	2027	if (record->type != HAMMER_MEM_RECORD_DEL)
	2028	record->leaf.base.create_tid = trans->tid;
	2029	record->leaf.create_ts = trans->time32;
	2030	for (;;) {
	2031	error = hammer_ip_sync_record_cursor(cursor, record);
	2032	if (error != EDEADLK)
	2033	break;
	2034	hammer_done_cursor(cursor);
	2035	error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
	2036	record->ip);
	2037	if (error)
	2038	break;
	2039	}
	2040	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
	2041
	2042	if (error) {
	2043	error = -error;
	2044	if (error != -ENOSPC) {
	2045	kprintf("hammer_sync_record_callback: sync failed rec "
	2046	"%p, error %d\n", record, error);
	2047	Debugger("sync failed rec");
	2048	}
	2049	}
	2050	done:
	2051	hammer_flush_record_done(record, error);
	2052	return(error);
	2053	}
	2054
	2055	/*
	2056	* XXX error handling
	2057	*/
	2058	int
	2059	hammer_sync_inode(hammer_inode_t ip)
	2060	{
	2061	struct hammer_transaction trans;
	2062	struct hammer_cursor cursor;
	2063	hammer_node_t tmp_node;
	2064	hammer_record_t depend;
	2065	hammer_record_t next;
	2066	int error, tmp_error;
	2067	u_int64_t nlinks;
	2068
	2069	if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
	2070	return(0);
	2071
	2072	hammer_start_transaction_fls(&trans, ip->hmp);
	2073	error = hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	2074	if (error)
	2075	goto done;
	2076
	2077	/*
	2078	* Any directory records referencing this inode which are not in
	2079	* our current flush group must adjust our nlink count for the
	2080	* purposes of synchronization to disk.
	2081	*
	2082	* Records which are in our flush group can be unlinked from our
	2083	* inode now, potentially allowing the inode to be physically
	2084	* deleted.
	2085	*
	2086	* This cannot block.
	2087	*/
	2088	nlinks = ip->ino_data.nlinks;
	2089	next = TAILQ_FIRST(&ip->target_list);
	2090	while ((depend = next) != NULL) {
	2091	next = TAILQ_NEXT(depend, target_entry);
	2092	if (depend->flush_state == HAMMER_FST_FLUSH &&
	2093	depend->flush_group == ip->hmp->flusher.act) {
	2094	/*
	2095	* If this is an ADD that was deleted by the frontend
	2096	* the frontend nlinks count will have already been
	2097	* decremented, but the backend is going to sync its
	2098	* directory entry and must account for it. The
	2099	* record will be converted to a delete-on-disk when
	2100	* it gets synced.
	2101	*
	2102	* If the ADD was not deleted by the frontend we
	2103	* can remove the dependancy from our target_list.
	2104	*/
	2105	if (depend->flags & HAMMER_RECF_DELETED_FE) {
	2106	++nlinks;
	2107	} else {
	2108	TAILQ_REMOVE(&ip->target_list, depend,
	2109	target_entry);
	2110	depend->target_ip = NULL;
	2111	}
	2112	} else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
	2113	/*
	2114	* Not part of our flush group
	2115	*/
	2116	KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
	2117	switch(depend->type) {
	2118	case HAMMER_MEM_RECORD_ADD:
	2119	--nlinks;
	2120	break;
	2121	case HAMMER_MEM_RECORD_DEL:
	2122	++nlinks;
	2123	break;
	2124	default:
	2125	break;
	2126	}
	2127	}
	2128	}
	2129
	2130	/*
	2131	* Set dirty if we had to modify the link count.
	2132	*/
	2133	if (ip->sync_ino_data.nlinks != nlinks) {
	2134	KKASSERT((int64_t)nlinks >= 0);
	2135	ip->sync_ino_data.nlinks = nlinks;
	2136	ip->sync_flags \|= HAMMER_INODE_DDIRTY;
	2137	}
	2138
	2139	/*
	2140	* If there is a trunction queued destroy any data past the (aligned)
	2141	* truncation point. Userland will have dealt with the buffer
	2142	* containing the truncation point for us.
	2143	*
	2144	* We don't flush pending frontend data buffers until after we've
	2145	* dealt with the truncation.
	2146	*/
	2147	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	2148	/*
	2149	* Interlock trunc_off. The VOP front-end may continue to
	2150	* make adjustments to it while we are blocked.
	2151	*/
	2152	off_t trunc_off;
	2153	off_t aligned_trunc_off;
	2154	int blkmask;
	2155
	2156	trunc_off = ip->sync_trunc_off;
	2157	blkmask = hammer_blocksize(trunc_off) - 1;
	2158	aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
	2159
	2160	/*
	2161	* Delete any whole blocks on-media. The front-end has
	2162	* already cleaned out any partial block and made it
	2163	* pending. The front-end may have updated trunc_off
	2164	* while we were blocked so we only use sync_trunc_off.
	2165	*
	2166	* This operation can blow out the buffer cache, EWOULDBLOCK
	2167	* means we were unable to complete the deletion. The
	2168	* deletion will update sync_trunc_off in that case.
	2169	*/
	2170	error = hammer_ip_delete_range(&cursor, ip,
	2171	aligned_trunc_off,
	2172	0x7FFFFFFFFFFFFFFFLL, 2);
	2173	if (error == EWOULDBLOCK) {
	2174	ip->flags \|= HAMMER_INODE_WOULDBLOCK;
	2175	error = 0;
	2176	goto defer_buffer_flush;
	2177	}
	2178
	2179	if (error)
	2180	Debugger("hammer_ip_delete_range errored");
	2181
	2182	/*
	2183	* Clear the truncation flag on the backend after we have
	2184	* complete the deletions. Backend data is now good again
	2185	* (including new records we are about to sync, below).
	2186	*
	2187	* Leave sync_trunc_off intact. As we write additional
	2188	* records the backend will update sync_trunc_off. This
	2189	* tells the backend whether it can skip the overwrite
	2190	* test. This should work properly even when the backend
	2191	* writes full blocks where the truncation point straddles
	2192	* the block because the comparison is against the base
	2193	* offset of the record.
	2194	*/
	2195	ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
	2196	/* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
	2197	} else {
	2198	error = 0;
	2199	}
	2200
	2201	/*
	2202	* Now sync related records. These will typically be directory
	2203	* entries or delete-on-disk records.
	2204	*
	2205	* Not all records will be flushed, but clear XDIRTY anyway. We
	2206	* will set it again in the frontend hammer_flush_inode_done()
	2207	* if records remain.
	2208	*/
	2209	if (error == 0) {
	2210	tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
	2211	hammer_sync_record_callback, &cursor);
	2212	if (tmp_error < 0)
	2213	tmp_error = -error;
	2214	if (tmp_error)
	2215	error = tmp_error;
	2216	}
	2217	hammer_cache_node(&ip->cache[1], cursor.node);
	2218
	2219	/*
	2220	* Re-seek for inode update, assuming our cache hasn't been ripped
	2221	* out from under us.
	2222	*/
	2223	if (error == 0) {
	2224	tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
	2225	if (tmp_node) {
	2226	hammer_cursor_downgrade(&cursor);
	2227	hammer_lock_sh(&tmp_node->lock);
	2228	if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
	2229	hammer_cursor_seek(&cursor, tmp_node, 0);
	2230	hammer_unlock(&tmp_node->lock);
	2231	hammer_rel_node(tmp_node);
	2232	}
	2233	error = 0;
	2234	}
	2235
	2236	/*
	2237	* If we are deleting the inode the frontend had better not have
	2238	* any active references on elements making up the inode.
	2239	*
	2240	* The call to hammer_ip_delete_clean() cleans up auxillary records
	2241	* but not DB or DATA records. Those must have already been deleted
	2242	* by the normal truncation mechanic.
	2243	*/
	2244	if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
	2245	RB_EMPTY(&ip->rec_tree) &&
	2246	(ip->sync_flags & HAMMER_INODE_DELETING) &&
	2247	(ip->flags & HAMMER_INODE_DELETED) == 0) {
	2248	int count1 = 0;
	2249
	2250	error = hammer_ip_delete_clean(&cursor, ip, &count1);
	2251	if (error == 0) {
	2252	ip->flags \|= HAMMER_INODE_DELETED;
	2253	ip->sync_flags &= ~HAMMER_INODE_DELETING;
	2254	ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
	2255	KKASSERT(RB_EMPTY(&ip->rec_tree));
	2256
	2257	/*
	2258	* Set delete_tid in both the frontend and backend
	2259	* copy of the inode record. The DELETED flag handles
	2260	* this, do not set RDIRTY.
	2261	*/
	2262	ip->ino_leaf.base.delete_tid = trans.tid;
	2263	ip->sync_ino_leaf.base.delete_tid = trans.tid;
	2264	ip->ino_leaf.delete_ts = trans.time32;
	2265	ip->sync_ino_leaf.delete_ts = trans.time32;
	2266
	2267
	2268	/*
	2269	* Adjust the inode count in the volume header
	2270	*/
	2271	if (ip->flags & HAMMER_INODE_ONDISK) {
	2272	hammer_modify_volume_field(&trans,
	2273	trans.rootvol,
	2274	vol0_stat_inodes);
	2275	--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
	2276	hammer_modify_volume_done(trans.rootvol);
	2277	}
	2278	} else {
	2279	Debugger("hammer_ip_delete_clean errored");
	2280	}
	2281	}
	2282
	2283	ip->sync_flags &= ~HAMMER_INODE_BUFS;
	2284
	2285	if (error)
	2286	Debugger("RB_SCAN errored");
	2287
	2288	defer_buffer_flush:
	2289	/*
	2290	* Now update the inode's on-disk inode-data and/or on-disk record.
	2291	* DELETED and ONDISK are managed only in ip->flags.
	2292	*
	2293	* In the case of a defered buffer flush we still update the on-disk
	2294	* inode to satisfy visibility requirements if there happen to be
	2295	* directory dependancies.
	2296	*/
	2297	switch(ip->flags & (HAMMER_INODE_DELETED \| HAMMER_INODE_ONDISK)) {
	2298	case HAMMER_INODE_DELETED\|HAMMER_INODE_ONDISK:
	2299	/*
	2300	* If deleted and on-disk, don't set any additional flags.
	2301	* the delete flag takes care of things.
	2302	*
	2303	* Clear flags which may have been set by the frontend.
	2304	*/
	2305	ip->sync_flags &= ~(HAMMER_INODE_DDIRTY \| HAMMER_INODE_XDIRTY \|
	2306	HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME \|
	2307	HAMMER_INODE_DELETING);
	2308	break;
	2309	case HAMMER_INODE_DELETED:
	2310	/*
	2311	* Take care of the case where a deleted inode was never
	2312	* flushed to the disk in the first place.
	2313	*
	2314	* Clear flags which may have been set by the frontend.
	2315	*/
	2316	ip->sync_flags &= ~(HAMMER_INODE_DDIRTY \| HAMMER_INODE_XDIRTY \|
	2317	HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME \|
	2318	HAMMER_INODE_DELETING);
	2319	while (RB_ROOT(&ip->rec_tree)) {
	2320	hammer_record_t record = RB_ROOT(&ip->rec_tree);
	2321	hammer_ref(&record->lock);
	2322	KKASSERT(record->lock.refs == 1);
	2323	record->flags \|= HAMMER_RECF_DELETED_FE;
	2324	record->flags \|= HAMMER_RECF_DELETED_BE;
	2325	hammer_rel_mem_record(record);
	2326	}
	2327	break;
	2328	case HAMMER_INODE_ONDISK:
	2329	/*
	2330	* If already on-disk, do not set any additional flags.
	2331	*/
	2332	break;
	2333	default:
	2334	/*
	2335	* If not on-disk and not deleted, set DDIRTY to force
	2336	* an initial record to be written.
	2337	*
	2338	* Also set the create_tid in both the frontend and backend
	2339	* copy of the inode record.
	2340	*/
	2341	ip->ino_leaf.base.create_tid = trans.tid;
	2342	ip->ino_leaf.create_ts = trans.time32;
	2343	ip->sync_ino_leaf.base.create_tid = trans.tid;
	2344	ip->sync_ino_leaf.create_ts = trans.time32;
	2345	ip->sync_flags \|= HAMMER_INODE_DDIRTY;
	2346	break;
	2347	}
	2348
	2349	/*
	2350	* If RDIRTY or DDIRTY is set, write out a new record. If the inode
	2351	* is already on-disk the old record is marked as deleted.
	2352	*
	2353	* If DELETED is set hammer_update_inode() will delete the existing
	2354	* record without writing out a new one.
	2355	*
	2356	* If ONLY the ITIMES flag is set we can update the record in-place.
	2357	*/
	2358	if (ip->flags & HAMMER_INODE_DELETED) {
	2359	error = hammer_update_inode(&cursor, ip);
	2360	} else
	2361	if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
	2362	(ip->sync_flags & (HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME))) {
	2363	error = hammer_update_itimes(&cursor, ip);
	2364	} else
	2365	if (ip->sync_flags & (HAMMER_INODE_DDIRTY \| HAMMER_INODE_ATIME \| HAMMER_INODE_MTIME)) {
	2366	error = hammer_update_inode(&cursor, ip);
	2367	}
	2368	if (error)
	2369	Debugger("hammer_update_itimes/inode errored");
	2370	done:
	2371	/*
	2372	* Save the TID we used to sync the inode with to make sure we
	2373	* do not improperly reuse it.
	2374	*/
	2375	hammer_done_cursor(&cursor);
	2376	hammer_done_transaction(&trans);
	2377	return(error);
	2378	}
	2379
	2380	/*
	2381	* This routine is called when the OS is no longer actively referencing
	2382	* the inode (but might still be keeping it cached), or when releasing
	2383	* the last reference to an inode.
	2384	*
	2385	* At this point if the inode's nlinks count is zero we want to destroy
	2386	* it, which may mean destroying it on-media too.
	2387	*/
	2388	void
	2389	hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
	2390	{
	2391	struct vnode *vp;
	2392
	2393	/*
	2394	* Set the DELETING flag when the link count drops to 0 and the
	2395	* OS no longer has any opens on the inode.
	2396	*
	2397	* The backend will clear DELETING (a mod flag) and set DELETED
	2398	* (a state flag) when it is actually able to perform the
	2399	* operation.
	2400	*/
	2401	if (ip->ino_data.nlinks == 0 &&
	2402	(ip->flags & (HAMMER_INODE_DELETING\|HAMMER_INODE_DELETED)) == 0) {
	2403	ip->flags \|= HAMMER_INODE_DELETING;
	2404	ip->flags \|= HAMMER_INODE_TRUNCATED;
	2405	ip->trunc_off = 0;
	2406	vp = NULL;
	2407	if (getvp) {
	2408	if (hammer_get_vnode(ip, &vp) != 0)
	2409	return;
	2410	}
	2411
	2412	/*
	2413	* Final cleanup
	2414	*/
	2415	if (ip->vp) {
	2416	vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
	2417	vnode_pager_setsize(ip->vp, 0);
	2418	}
	2419	if (getvp) {
	2420	vput(vp);
	2421	}
	2422	}
	2423	}
	2424
	2425	/*
	2426	* Re-test an inode when a dependancy had gone away to see if we
	2427	* can chain flush it.
	2428	*/
	2429	void
	2430	hammer_test_inode(hammer_inode_t ip)
	2431	{
	2432	if (ip->flags & HAMMER_INODE_REFLUSH) {
	2433	ip->flags &= ~HAMMER_INODE_REFLUSH;
	2434	hammer_ref(&ip->lock);
	2435	if (ip->flags & HAMMER_INODE_RESIGNAL) {
	2436	ip->flags &= ~HAMMER_INODE_RESIGNAL;
	2437	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	2438	} else {
	2439	hammer_flush_inode(ip, 0);
	2440	}
	2441	hammer_rel_inode(ip, 0);
	2442	}
	2443	}
	2444
	2445	/*
	2446	* Clear the RECLAIM flag on an inode. This occurs when the inode is
	2447	* reassociated with a vp or just before it gets freed.
	2448	*
	2449	* Wakeup one thread blocked waiting on reclaims to complete. Note that
	2450	* the inode the thread is waiting on behalf of is a different inode then
	2451	* the inode we are called with. This is to create a pipeline.
	2452	*/
	2453	static void
	2454	hammer_inode_wakereclaims(hammer_inode_t ip)
	2455	{
	2456	struct hammer_reclaim *reclaim;
	2457	hammer_mount_t hmp = ip->hmp;
	2458
	2459	if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
	2460	return;
	2461
	2462	--hammer_count_reclaiming;
	2463	--hmp->inode_reclaims;
	2464	ip->flags &= ~HAMMER_INODE_RECLAIM;
	2465
	2466	if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
	2467	TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
	2468	reclaim->okydoky = 1;
	2469	wakeup(reclaim);
	2470	}
	2471	}
	2472
	2473	/*
	2474	* Setup our reclaim pipeline. We only let so many detached (and dirty)
	2475	* inodes build up before we start blocking.
	2476	*
	2477	* When we block we don't care which inode has finished reclaiming,
	2478	* as lone as one does. This is somewhat heuristical... we also put a
	2479	* cap on how long we are willing to wait.
	2480	*/
	2481	void
	2482	hammer_inode_waitreclaims(hammer_mount_t hmp)
	2483	{
	2484	struct hammer_reclaim reclaim;
	2485	int delay;
	2486
	2487	if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT) {
	2488	reclaim.okydoky = 0;
	2489	TAILQ_INSERT_TAIL(&hmp->reclaim_list,
	2490	&reclaim, entry);
	2491	} else {
	2492	reclaim.okydoky = 1;
	2493	}
	2494
	2495	if (reclaim.okydoky == 0) {
	2496	delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
	2497	HAMMER_RECLAIM_WAIT;
	2498	if (delay >= 0)
	2499	tsleep(&reclaim, 0, "hmrrcm", delay + 1);
	2500	if (reclaim.okydoky == 0)
	2501	TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
	2502	}
	2503	}
	2504