HAMMER 56C/Many: Performance tuning - MEDIA STRUCTURES CHANGED!
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.78 2008/06/20 05:38:26 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <vm/vm_extern.h>
39#include <sys/buf.h>
40#include <sys/buf2.h>
41
42static int hammer_unload_inode(struct hammer_inode *ip);
43static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
44static int hammer_setup_child_callback(hammer_record_t rec, void *data);
45static int hammer_setup_parent_inodes(hammer_inode_t ip);
46static int hammer_setup_parent_inodes_helper(hammer_record_t record);
47static void hammer_inode_wakereclaims(hammer_inode_t ip);
48
49#ifdef DEBUG_TRUNCATE
50extern struct hammer_inode *HammerTruncIp;
51#endif
52
53/*
54 * The kernel is not actively referencing this vnode but is still holding
55 * it cached.
56 *
57 * This is called from the frontend.
58 */
59int
60hammer_vop_inactive(struct vop_inactive_args *ap)
61{
62 struct hammer_inode *ip = VTOI(ap->a_vp);
63
64 /*
65 * Degenerate case
66 */
67 if (ip == NULL) {
68 vrecycle(ap->a_vp);
69 return(0);
70 }
71
72 /*
73 * If the inode no longer has visibility in the filesystem try to
74 * recycle it immediately, even if the inode is dirty. Recycling
75 * it quickly allows the system to reclaim buffer cache and VM
76 * resources which can matter a lot in a heavily loaded system.
77 *
78 * This can deadlock in vfsync() if we aren't careful.
79 *
80 * Do not queue the inode to the flusher if we still have visibility,
81 * otherwise namespace calls such as chmod will unnecessarily generate
82 * multiple inode updates.
83 */
84 hammer_inode_unloadable_check(ip, 0);
85 if (ip->ino_data.nlinks == 0) {
86 if (ip->flags & HAMMER_INODE_MODMASK)
87 hammer_flush_inode(ip, 0);
88 vrecycle(ap->a_vp);
89 }
90 return(0);
91}
92
93/*
94 * Release the vnode association. This is typically (but not always)
95 * the last reference on the inode.
96 *
97 * Once the association is lost we are on our own with regards to
98 * flushing the inode.
99 */
100int
101hammer_vop_reclaim(struct vop_reclaim_args *ap)
102{
103 struct hammer_inode *ip;
104 hammer_mount_t hmp;
105 struct vnode *vp;
106
107 vp = ap->a_vp;
108
109 if ((ip = vp->v_data) != NULL) {
110 hmp = ip->hmp;
111 vp->v_data = NULL;
112 ip->vp = NULL;
113
114 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
115 ++hammer_count_reclaiming;
116 ++hmp->inode_reclaims;
117 ip->flags |= HAMMER_INODE_RECLAIM;
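			/*
			 * Once the reclaim backlog exceeds
			 * HAMMER_RECLAIM_FLUSH, kick the flusher for every
			 * 256 inodes entering the reclaim state so dirty,
			 * detached inodes get written out and can be freed.
			 */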
118 if (hmp->inode_reclaims > HAMMER_RECLAIM_FLUSH &&
119 (hmp->inode_reclaims & 255) == 0) {
120 hammer_flusher_async(hmp);
121 }
122 }
123 hammer_rel_inode(ip, 1);
124 }
125 return(0);
126}
127
128/*
129 * Return a locked vnode for the specified inode. The inode must be
130 * referenced but NOT LOCKED on entry and will remain referenced on
131 * return.
132 *
133 * Called from the frontend.
134 */
135int
136hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
137{
138 hammer_mount_t hmp;
139 struct vnode *vp;
140 int error = 0;
141
142 hmp = ip->hmp;
143
144 for (;;) {
145 if ((vp = ip->vp) == NULL) {
146 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
147 if (error)
148 break;
149 hammer_lock_ex(&ip->lock);
150 if (ip->vp != NULL) {
151 hammer_unlock(&ip->lock);
152 vp->v_type = VBAD;
153 vx_put(vp);
154 continue;
155 }
156 hammer_ref(&ip->lock);
157 vp = *vpp;
158 ip->vp = vp;
159 vp->v_type =
160 hammer_get_vnode_type(ip->ino_data.obj_type);
161
162 hammer_inode_wakereclaims(ip);
163
164 switch(ip->ino_data.obj_type) {
165 case HAMMER_OBJTYPE_CDEV:
166 case HAMMER_OBJTYPE_BDEV:
167 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
168 addaliasu(vp, ip->ino_data.rmajor,
169 ip->ino_data.rminor);
170 break;
171 case HAMMER_OBJTYPE_FIFO:
172 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
173 break;
174 default:
175 break;
176 }
177
178 /*
179 * Only mark as the root vnode if the ip is not
180 * historical, otherwise the VFS cache will get
181 * confused. The other half of the special handling
182 * is in hammer_vop_nlookupdotdot().
183 */
184 if (ip->obj_id == HAMMER_OBJID_ROOT &&
185 ip->obj_asof == hmp->asof) {
186 vp->v_flag |= VROOT;
187 }
188
189 vp->v_data = (void *)ip;
190 /* vnode locked by getnewvnode() */
191 /* make related vnode dirty if inode dirty? */
192 hammer_unlock(&ip->lock);
193 if (vp->v_type == VREG)
194 vinitvmio(vp, ip->ino_data.size);
195 break;
196 }
197
198 /*
199 * loop if the vget fails (aka races), or if the vp
200 * no longer matches ip->vp.
201 */
202 if (vget(vp, LK_EXCLUSIVE) == 0) {
203 if (vp == ip->vp)
204 break;
205 vput(vp);
206 }
207 }
208 *vpp = vp;
209 return(error);
210}
211
212/*
213 * Acquire a HAMMER inode. The returned inode is not locked. These functions
214 * do not attach or detach the related vnode (use hammer_get_vnode() for
215 * that).
216 *
217 * The flags argument is only applied for newly created inodes, and only
218 * certain flags are inherited.
219 *
220 * Called from the frontend.
221 */
222struct hammer_inode *
223hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
224 u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
225{
226 hammer_mount_t hmp = trans->hmp;
227 struct hammer_inode_info iinfo;
228 struct hammer_cursor cursor;
229 struct hammer_inode *ip;
230
231 /*
232 * Determine if we already have an inode cached. If we do then
233 * we are golden.
234 */
235 iinfo.obj_id = obj_id;
236 iinfo.obj_asof = asof;
237loop:
238 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
239 if (ip) {
240 hammer_ref(&ip->lock);
241 *errorp = 0;
242 return(ip);
243 }
244
245 /*
246 * Allocate a new inode structure and deal with races later.
247 */
248 ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
249 ++hammer_count_inodes;
250 ++hmp->count_inodes;
251 ip->obj_id = obj_id;
252 ip->obj_asof = iinfo.obj_asof;
253 ip->hmp = hmp;
254 ip->flags = flags & HAMMER_INODE_RO;
255 ip->cache[0].ip = ip;
256 ip->cache[1].ip = ip;
257 if (hmp->ronly)
258 ip->flags |= HAMMER_INODE_RO;
259 ip->sync_trunc_off = ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
260 RB_INIT(&ip->rec_tree);
261 TAILQ_INIT(&ip->target_list);
262
263 /*
264 * Locate the on-disk inode.
265 */
266retry:
267 hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
268 cursor.key_beg.localization = HAMMER_LOCALIZE_INODE;
269 cursor.key_beg.obj_id = ip->obj_id;
270 cursor.key_beg.key = 0;
271 cursor.key_beg.create_tid = 0;
272 cursor.key_beg.delete_tid = 0;
273 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
274 cursor.key_beg.obj_type = 0;
275 cursor.asof = iinfo.obj_asof;
276 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
277 HAMMER_CURSOR_ASOF;
278
279 *errorp = hammer_btree_lookup(&cursor);
280 if (*errorp == EDEADLK) {
281 hammer_done_cursor(&cursor);
282 goto retry;
283 }
284
285 /*
286 * On success the B-Tree lookup will hold the appropriate
287 * buffer cache buffers and provide a pointer to the requested
288 * information. Copy the information to the in-memory inode
289 * and cache the B-Tree node to improve future operations.
290 */
291 if (*errorp == 0) {
292 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
293 ip->ino_data = cursor.data->inode;
294
295 /*
296 * cache[0] tries to cache the location of the object inode.
297 * The assumption is that it is near the directory inode.
298 *
299 * cache[1] tries to cache the location of the object data.
300 * The assumption is that it is near the directory data.
301 */
302 hammer_cache_node(&ip->cache[0], cursor.node);
303 if (dip && dip->cache[1].node)
304 hammer_cache_node(&ip->cache[1], dip->cache[1].node);
305
306 /*
307 * The file should not contain any data past the file size
308 * stored in the inode. Setting sync_trunc_off to the
309 * file size instead of max reduces B-Tree lookup overheads
310 * on append by allowing the flusher to avoid checking for
311 * record overwrites.
312 */
313 ip->sync_trunc_off = ip->ino_data.size;
314 }
315
316 /*
317 * The inode is placed on the red-black tree and will be synced to
318 * the media when flushed or by the filesystem sync. If this races
319 * another instantiation/lookup the insertion will fail.
320 */
321 if (*errorp == 0) {
322 hammer_ref(&ip->lock);
323 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
324 hammer_uncache_node(&ip->cache[0]);
325 hammer_uncache_node(&ip->cache[1]);
326 KKASSERT(ip->lock.refs == 1);
327 --hammer_count_inodes;
328 --hmp->count_inodes;
329 kfree(ip, M_HAMMER);
330 hammer_done_cursor(&cursor);
331 goto loop;
332 }
333 ip->flags |= HAMMER_INODE_ONDISK;
334 } else {
335 /*
336 * Do not panic on read-only accesses which fail, particularly
337 * historical accesses where the snapshot might not have
338 * complete connectivity.
339 */
340 if ((flags & HAMMER_INODE_RO) == 0) {
341 kprintf("hammer_get_inode: failed ip %p obj_id %016llx cursor %p error %d\n",
342 ip, ip->obj_id, &cursor, *errorp);
343 Debugger("x");
344 }
345 if (ip->flags & HAMMER_INODE_RSV_INODES) {
346 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
347 --hmp->rsv_inodes;
348 }
349 hmp->rsv_databufs -= ip->rsv_databufs;
350 ip->rsv_databufs = 0; /* sanity */
351
352 --hammer_count_inodes;
353 --hmp->count_inodes;
354 kfree(ip, M_HAMMER);
355 ip = NULL;
356 }
357 hammer_done_cursor(&cursor);
358 return (ip);
359}
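
/*
 * Illustrative usage sketch, not taken from the HAMMER sources: a typical
 * frontend lookup acquires the in-memory inode first and then attaches a
 * vnode to it.  Only functions defined in this file are used; the caller
 * is assumed to have already set up a transaction and to hold a reference
 * on the directory inode "dip".
 */
#if 0
static int
example_lookup(hammer_transaction_t trans, hammer_inode_t dip,
	       u_int64_t obj_id, struct vnode **vpp)
{
	struct hammer_inode *ip;
	int error;

	ip = hammer_get_inode(trans, dip, obj_id, trans->hmp->asof, 0, &error);
	if (ip == NULL)
		return (error);
	error = hammer_get_vnode(ip, vpp);	/* *vpp is returned locked */
	hammer_rel_inode(ip, 0);		/* vp association holds its own ref */
	return (error);
}
#endif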
360
361/*
362 * Create a new filesystem object, returning the inode in *ipp. The
363 * returned inode will be referenced.
364 *
365 * The inode is created in-memory.
366 */
367int
368hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
369 struct ucred *cred, hammer_inode_t dip,
370 struct hammer_inode **ipp)
371{
372 hammer_mount_t hmp;
373 hammer_inode_t ip;
374 uid_t xuid;
375
376 hmp = trans->hmp;
377 ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
378 ++hammer_count_inodes;
379 ++hmp->count_inodes;
380 ip->obj_id = hammer_alloc_objid(trans, dip);
381 KKASSERT(ip->obj_id != 0);
382 ip->obj_asof = hmp->asof;
383 ip->hmp = hmp;
384 ip->flush_state = HAMMER_FST_IDLE;
385 ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES;
386 ip->cache[0].ip = ip;
387 ip->cache[1].ip = ip;
388
389 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
390 RB_INIT(&ip->rec_tree);
391 TAILQ_INIT(&ip->target_list);
392
393 ip->ino_data.atime = trans->time;
394 ip->ino_data.mtime = trans->time;
395 ip->ino_data.size = 0;
396 ip->ino_data.nlinks = 0;
397
398 /*
399 * A nohistory designator on the parent directory is inherited by
400 * the child.
401 */
402 ip->ino_data.uflags = dip->ino_data.uflags &
403 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
404
405 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
406 ip->ino_leaf.base.localization = HAMMER_LOCALIZE_INODE;
407 ip->ino_leaf.base.obj_id = ip->obj_id;
408 ip->ino_leaf.base.key = 0;
409 ip->ino_leaf.base.create_tid = 0;
410 ip->ino_leaf.base.delete_tid = 0;
411 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
412 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
413
414 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
415 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
416 ip->ino_data.mode = vap->va_mode;
417 ip->ino_data.ctime = trans->time;
418 ip->ino_data.parent_obj_id = (dip) ? dip->ino_leaf.base.obj_id : 0;
419
420 switch(ip->ino_leaf.base.obj_type) {
421 case HAMMER_OBJTYPE_CDEV:
422 case HAMMER_OBJTYPE_BDEV:
423 ip->ino_data.rmajor = vap->va_rmajor;
424 ip->ino_data.rminor = vap->va_rminor;
425 break;
426 default:
427 break;
428 }
429
430 /*
431 * Calculate default uid/gid and overwrite with information from
432 * the vap.
433 */
434 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
435 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
436 &vap->va_mode);
437 ip->ino_data.mode = vap->va_mode;
438
439 if (vap->va_vaflags & VA_UID_UUID_VALID)
440 ip->ino_data.uid = vap->va_uid_uuid;
441 else if (vap->va_uid != (uid_t)VNOVAL)
442 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
443 else
444 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
445
446 if (vap->va_vaflags & VA_GID_UUID_VALID)
447 ip->ino_data.gid = vap->va_gid_uuid;
448 else if (vap->va_gid != (gid_t)VNOVAL)
449 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
450 else
451 ip->ino_data.gid = dip->ino_data.gid;
452
453 hammer_ref(&ip->lock);
454 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
455 hammer_unref(&ip->lock);
456 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
457 }
458 *ipp = ip;
459 return(0);
460}
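
/*
 * Illustrative usage sketch, not taken from the HAMMER sources: a
 * create-style VOP builds the in-memory inode, adds a directory entry
 * referencing it (the directory-record code lives outside this file),
 * and only then attaches a vnode.  "trans", "vap", "cred", "dip", "nip"
 * and "vp" are assumed to be set up or declared by the caller.
 */
#if 0
	error = hammer_create_inode(&trans, vap, cred, dip, &nip);
	if (error == 0) {
		/* ...add a directory entry referencing nip (not shown)... */
		error = hammer_get_vnode(nip, &vp);
		hammer_rel_inode(nip, 0);
	}
#endif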
461
462/*
463 * Called by hammer_sync_inode().
464 */
465static int
466hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
467{
468 hammer_transaction_t trans = cursor->trans;
469 hammer_record_t record;
470 int error;
471
472retry:
473 error = 0;
474
475 /*
476 * If the inode has a presence on-disk then locate it and mark
477 * it deleted, setting DELONDISK.
478 *
479 * The record may or may not be physically deleted, depending on
480 * the retention policy.
481 */
482 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
483 HAMMER_INODE_ONDISK) {
484 hammer_normalize_cursor(cursor);
485 cursor->key_beg.localization = HAMMER_LOCALIZE_INODE;
486 cursor->key_beg.obj_id = ip->obj_id;
487 cursor->key_beg.key = 0;
488 cursor->key_beg.create_tid = 0;
489 cursor->key_beg.delete_tid = 0;
490 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
491 cursor->key_beg.obj_type = 0;
492 cursor->asof = ip->obj_asof;
493 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
494 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
495 cursor->flags |= HAMMER_CURSOR_BACKEND;
496
497 error = hammer_btree_lookup(cursor);
498 if (hammer_debug_inode)
499 kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
500 if (error) {
501 kprintf("error %d\n", error);
502 Debugger("hammer_update_inode");
503 }
504
505 if (error == 0) {
506 error = hammer_ip_delete_record(cursor, ip, trans->tid);
507 if (hammer_debug_inode)
508 kprintf(" error %d\n", error);
509 if (error && error != EDEADLK) {
510 kprintf("error %d\n", error);
511 Debugger("hammer_update_inode2");
512 }
513 if (error == 0) {
514 ip->flags |= HAMMER_INODE_DELONDISK;
515 }
516 if (cursor->node)
517 hammer_cache_node(&ip->cache[0], cursor->node);
518 }
519 if (error == EDEADLK) {
520 hammer_done_cursor(cursor);
521 error = hammer_init_cursor(trans, cursor,
522 &ip->cache[0], ip);
523 if (hammer_debug_inode)
524 kprintf("IPDED %p %d\n", ip, error);
525 if (error == 0)
526 goto retry;
527 }
528 }
529
530 /*
531 * Ok, write out the initial record or a new record (after deleting
532 * the old one), unless the DELETED flag is set. This routine will
533 * clear DELONDISK if it writes out a record.
534 *
535 * Update our inode statistics if this is the first application of
536 * the inode on-disk.
537 */
538 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
539 /*
540 * Generate a record and write it to the media
541 */
542 record = hammer_alloc_mem_record(ip, 0);
543 record->type = HAMMER_MEM_RECORD_INODE;
544 record->flush_state = HAMMER_FST_FLUSH;
545 record->leaf = ip->sync_ino_leaf;
546 record->leaf.base.create_tid = trans->tid;
547 record->leaf.data_len = sizeof(ip->sync_ino_data);
548 record->data = (void *)&ip->sync_ino_data;
549 record->flags |= HAMMER_RECF_INTERLOCK_BE;
550 for (;;) {
551 error = hammer_ip_sync_record_cursor(cursor, record);
552 if (hammer_debug_inode)
553 kprintf("GENREC %p rec %08x %d\n",
554 ip, record->flags, error);
555 if (error != EDEADLK)
556 break;
557 hammer_done_cursor(cursor);
558 error = hammer_init_cursor(trans, cursor,
559 &ip->cache[0], ip);
560 if (hammer_debug_inode)
561 kprintf("GENREC reinit %d\n", error);
562 if (error)
563 break;
564 }
565 if (error) {
566 kprintf("error %d\n", error);
567 Debugger("hammer_update_inode3");
568 }
569
570 /*
571 * The record isn't managed by the inode's record tree,
572 * destroy it whether we succeed or fail.
573 */
574 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
575 record->flags |= HAMMER_RECF_DELETED_FE;
576 record->flush_state = HAMMER_FST_IDLE;
577 hammer_rel_mem_record(record);
578
579 /*
580 * Finish up.
581 */
582 if (error == 0) {
583 if (hammer_debug_inode)
584 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
585 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
586 HAMMER_INODE_ITIMES);
587 ip->flags &= ~HAMMER_INODE_DELONDISK;
588
589 /*
590 * Root volume count of inodes
591 */
592 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
593 hammer_modify_volume_field(trans,
594 trans->rootvol,
595 vol0_stat_inodes);
596 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
597 hammer_modify_volume_done(trans->rootvol);
598 ip->flags |= HAMMER_INODE_ONDISK;
599 if (hammer_debug_inode)
600 kprintf("NOWONDISK %p\n", ip);
601 }
602 }
603 }
604
605 /*
606 * If the inode has been destroyed, clean out any left-over flags
607 * that may have been set by the frontend.
608 */
609 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
610 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
611 HAMMER_INODE_ITIMES);
612 }
613 return(error);
614}
615
616/*
617 * Update only the itimes fields. This is done non-historically. The
618 * record is updated in-place on the disk.
619 */
620static int
621hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
622{
623 hammer_transaction_t trans = cursor->trans;
624 struct hammer_btree_leaf_elm *leaf;
625 int error;
626
627retry:
628 error = 0;
629 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
630 HAMMER_INODE_ONDISK) {
631 hammer_normalize_cursor(cursor);
632 cursor->key_beg.localization = HAMMER_LOCALIZE_INODE;
633 cursor->key_beg.obj_id = ip->obj_id;
634 cursor->key_beg.key = 0;
635 cursor->key_beg.create_tid = 0;
636 cursor->key_beg.delete_tid = 0;
637 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
638 cursor->key_beg.obj_type = 0;
639 cursor->asof = ip->obj_asof;
640 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
641 cursor->flags |= HAMMER_CURSOR_ASOF;
642 cursor->flags |= HAMMER_CURSOR_GET_LEAF;
643 cursor->flags |= HAMMER_CURSOR_GET_DATA;
644 cursor->flags |= HAMMER_CURSOR_BACKEND;
645
646 error = hammer_btree_lookup(cursor);
647 if (error) {
648 kprintf("error %d\n", error);
649 Debugger("hammer_update_itimes1");
650 }
651 if (error == 0) {
652 /*
653 * atime/mtime updates can be done in place, but
654 * they are nasty because we also have to update the
655 * data_crc in the B-Tree leaf, which means we
656 * ALSO have to generate UNDO records.
657 */
658 hammer_modify_buffer(trans, cursor->data_buffer,
659 HAMMER_ITIMES_BASE(&cursor->data->inode),
660 HAMMER_ITIMES_BYTES);
661 cursor->data->inode.atime = ip->sync_ino_data.atime;
662 cursor->data->inode.mtime = ip->sync_ino_data.mtime;
663 hammer_modify_buffer_done(cursor->data_buffer);
664
665 leaf = cursor->leaf;
666 hammer_modify_node(trans, cursor->node,
667 &leaf->data_crc,
668 sizeof(leaf->data_crc));
669 leaf->data_crc = crc32(cursor->data, leaf->data_len);
670 hammer_modify_node_done(cursor->node);
671
672 ip->sync_flags &= ~HAMMER_INODE_ITIMES;
673 /* XXX recalculate crc */
674 hammer_cache_node(&ip->cache[0], cursor->node);
675 }
676 if (error == EDEADLK) {
677 hammer_done_cursor(cursor);
678 error = hammer_init_cursor(trans, cursor,
679 &ip->cache[0], ip);
680 if (error == 0)
681 goto retry;
682 }
683 }
684 return(error);
685}
686
687/*
688 * Release a reference on an inode, flush as requested.
689 *
690 * On the last reference we queue the inode to the flusher for its final
691 * disposition.
692 */
693void
694hammer_rel_inode(struct hammer_inode *ip, int flush)
695{
696 hammer_mount_t hmp = ip->hmp;
697
698 /*
699 * Handle disposition when dropping the last ref.
700 */
701 for (;;) {
702 if (ip->lock.refs == 1) {
703 /*
704 * Determine whether on-disk action is needed for
705 * the inode's final disposition.
706 */
707 KKASSERT(ip->vp == NULL);
708 hammer_inode_unloadable_check(ip, 0);
709 if (ip->flags & HAMMER_INODE_MODMASK) {
710 if (hmp->rsv_inodes > desiredvnodes) {
711 hammer_flush_inode(ip,
712 HAMMER_FLUSH_SIGNAL);
713 } else {
714 hammer_flush_inode(ip, 0);
715 }
716 } else if (ip->lock.refs == 1) {
717 hammer_unload_inode(ip);
718 break;
719 }
720 } else {
721 if (flush)
722 hammer_flush_inode(ip, 0);
723
724 /*
725 * The inode still has multiple refs, try to drop
726 * one ref.
727 */
728 KKASSERT(ip->lock.refs >= 1);
729 if (ip->lock.refs > 1) {
730 hammer_unref(&ip->lock);
731 break;
732 }
733 }
734 }
735}
736
737/*
738 * Unload and destroy the specified inode. Must be called with one remaining
739 * reference. The reference is disposed of.
740 *
741 * This can only be called in the context of the flusher.
742 */
743static int
744hammer_unload_inode(struct hammer_inode *ip)
745{
746 hammer_mount_t hmp = ip->hmp;
747
748 KASSERT(ip->lock.refs == 1,
749 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
750 KKASSERT(ip->vp == NULL);
751 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
752 KKASSERT(ip->cursor_ip_refs == 0);
753 KKASSERT(ip->lock.lockcount == 0);
754 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
755
756 KKASSERT(RB_EMPTY(&ip->rec_tree));
757 KKASSERT(TAILQ_EMPTY(&ip->target_list));
758
759 RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
760
761 hammer_uncache_node(&ip->cache[0]);
762 hammer_uncache_node(&ip->cache[1]);
763 if (ip->objid_cache)
764 hammer_clear_objid(ip);
765 --hammer_count_inodes;
766 --hmp->count_inodes;
767
768 hammer_inode_wakereclaims(ip);
769 kfree(ip, M_HAMMER);
770
771 return(0);
772}
773
774/*
775 * Called on mount -u when switching from RW to RO or vice-versa. Adjust
776 * the read-only flag for cached inodes.
777 *
778 * This routine is called from a RB_SCAN().
779 */
780int
781hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
782{
783 hammer_mount_t hmp = ip->hmp;
784
785 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
786 ip->flags |= HAMMER_INODE_RO;
787 else
788 ip->flags &= ~HAMMER_INODE_RO;
789 return(0);
790}
791
792/*
793 * A transaction has modified an inode, requiring updates as specified by
794 * the passed flags.
795 *
796 * HAMMER_INODE_DDIRTY: Inode data has been updated
797 * HAMMER_INODE_XDIRTY: Dirty in-memory records
798 * HAMMER_INODE_BUFS: Dirty buffer cache buffers
799 * HAMMER_INODE_DELETED: Inode record/data must be deleted
800 * HAMMER_INODE_ITIMES: mtime/atime has been updated
801 */
802void
803hammer_modify_inode(hammer_inode_t ip, int flags)
804{
805 KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 ||
806 (flags & (HAMMER_INODE_DDIRTY |
807 HAMMER_INODE_XDIRTY | HAMMER_INODE_BUFS |
808 HAMMER_INODE_DELETED | HAMMER_INODE_ITIMES)) == 0);
809 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
810 ip->flags |= HAMMER_INODE_RSV_INODES;
811 ++ip->hmp->rsv_inodes;
812 }
813
814 ip->flags |= flags;
815}
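
/*
 * Illustrative usage sketch, not taken from the HAMMER sources: a frontend
 * operation that changes inode meta-data (a chmod-like path, for example)
 * updates the in-memory copy and then marks the inode dirty so the flusher
 * eventually writes out a new inode record.  "new_mode" is a hypothetical
 * caller-supplied value.
 */
#if 0
	ip->ino_data.mode = new_mode;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
#endif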
816
817/*
818 * Request that an inode be flushed. This whole mess cannot block and may
819 * recurse (if not synchronous). Once requested HAMMER will attempt to
820 * actively flush the inode until the flush can be done.
821 *
822 * The inode may already be flushing, or may be in a setup state. We can
823 * place the inode in a flushing state if it is currently idle and flag it
824 * to reflush if it is currently flushing.
825 *
826 * If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
827 * flush the inode synchronously using the caller's context.
828 */
829void
830hammer_flush_inode(hammer_inode_t ip, int flags)
831{
832 int good;
833
834 /*
835 * Trivial 'nothing to flush' case. If the inode is in a SETUP
836 * state we have to put it back into an IDLE state so we can
837 * drop the extra ref.
838 */
839 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
840 if (ip->flush_state == HAMMER_FST_SETUP) {
841 ip->flush_state = HAMMER_FST_IDLE;
842 hammer_rel_inode(ip, 0);
843 }
844 return;
845 }
846
847 /*
848 * Our flush action will depend on the current state.
849 */
850 switch(ip->flush_state) {
851 case HAMMER_FST_IDLE:
852 /*
853 * We have no dependencies and can flush immediately. Some of
854 * our children may not be flushable so we have to re-test
855 * with that additional knowledge.
856 */
857 hammer_flush_inode_core(ip, flags);
858 break;
859 case HAMMER_FST_SETUP:
860 /*
861 * Recurse upwards through dependencies via target_list
862 * and start their flusher actions going if possible.
863 *
864 * 'good' is our connectivity. -1 means we have none and
865 * can't flush, 0 means there weren't any dependencies, and
866 * 1 means we have good connectivity.
867 */
868 good = hammer_setup_parent_inodes(ip);
869
870 /*
871 * We can continue if good >= 0. Determine how many records
872 * under our inode can be flushed (and mark them).
873 */
874 if (good >= 0) {
875 hammer_flush_inode_core(ip, flags);
876 } else {
877 ip->flags |= HAMMER_INODE_REFLUSH;
878 if (flags & HAMMER_FLUSH_SIGNAL) {
879 ip->flags |= HAMMER_INODE_RESIGNAL;
880 hammer_flusher_async(ip->hmp);
881 }
882 }
883 break;
884 default:
885 /*
886 * We are already flushing, flag the inode to reflush
887 * if needed after it completes its current flush.
888 */
889 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
890 ip->flags |= HAMMER_INODE_REFLUSH;
891 if (flags & HAMMER_FLUSH_SIGNAL) {
892 ip->flags |= HAMMER_INODE_RESIGNAL;
893 hammer_flusher_async(ip->hmp);
894 }
895 break;
896 }
897}
898
899/*
900 * Scan ip->target_list, which is a list of records owned by PARENT inodes
901 * that reference our ip (record->target_ip == ip).
902 *
903 * XXX This is a huge mess of recursive code, but not one bit of it blocks
904 * so for now do not ref/deref the structures. Note that if we use the
905 * ref/rel code later, the rel CAN block.
906 */
907static int
908hammer_setup_parent_inodes(hammer_inode_t ip)
909{
910 hammer_record_t depend;
911#if 0
912 hammer_record_t next;
913 hammer_inode_t pip;
914#endif
915 int good;
916 int r;
917
918 good = 0;
919 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
920 r = hammer_setup_parent_inodes_helper(depend);
921 KKASSERT(depend->target_ip == ip);
922 if (r < 0 && good == 0)
923 good = -1;
924 if (r > 0)
925 good = 1;
926 }
927 return(good);
928
929#if 0
930retry:
931 good = 0;
932 next = TAILQ_FIRST(&ip->target_list);
933 if (next) {
934 hammer_ref(&next->lock);
935 hammer_ref(&next->ip->lock);
936 }
937 while ((depend = next) != NULL) {
938 if (depend->target_ip == NULL) {
939 pip = depend->ip;
940 hammer_rel_mem_record(depend);
941 hammer_rel_inode(pip, 0);
942 goto retry;
943 }
944 KKASSERT(depend->target_ip == ip);
945 next = TAILQ_NEXT(depend, target_entry);
946 if (next) {
947 hammer_ref(&next->lock);
948 hammer_ref(&next->ip->lock);
949 }
950 r = hammer_setup_parent_inodes_helper(depend);
951 if (r < 0 && good == 0)
952 good = -1;
953 if (r > 0)
954 good = 1;
955 pip = depend->ip;
956 hammer_rel_mem_record(depend);
957 hammer_rel_inode(pip, 0);
958 }
959 return(good);
960#endif
961}
962
963/*
964 * This helper function takes a record representing the dependency between
965 * the parent inode and child inode.
966 *
967 * record->ip = parent inode
968 * record->target_ip = child inode
969 *
970 * We are asked to recurse upwards and convert the record from SETUP
971 * to FLUSH if possible.
972 *
973 * Return 1 if the record gives us connectivity
974 *
975 * Return 0 if the record is not relevant
976 *
977 * Return -1 if we can't resolve the dependency and there is no connectivity.
978 */
979static int
980hammer_setup_parent_inodes_helper(hammer_record_t record)
981{
982 hammer_mount_t hmp;
983 hammer_inode_t pip;
984 int good;
985
986 KKASSERT(record->flush_state != HAMMER_FST_IDLE);
987 pip = record->ip;
988 hmp = pip->hmp;
989
990 /*
991 * If the record is already flushing, is it in our flush group?
992 *
993 * If it is in our flush group but it is a general record or a
994 * delete-on-disk, it does not improve our connectivity (return 0),
995 * and if the target inode is not trying to destroy itself we can't
996 * allow the operation yet anyway (the second return -1).
997 */
998 if (record->flush_state == HAMMER_FST_FLUSH) {
999 if (record->flush_group != hmp->flusher.next) {
1000 pip->flags |= HAMMER_INODE_REFLUSH;
1001 return(-1);
1002 }
1003 if (record->type == HAMMER_MEM_RECORD_ADD)
1004 return(1);
1005 /* GENERAL or DEL */
1006 return(0);
1007 }
1008
1009 /*
1010 * It must be a setup record. Try to resolve the setup dependencies
1011 * by recursing upwards so we can place ip on the flush list.
1012 */
1013 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1014
1015 good = hammer_setup_parent_inodes(pip);
1016
1017 /*
1018 * We can't flush ip because it has no connectivity (XXX also check
1019 * nlinks for pre-existing connectivity!). Flag it so any resolution
1020 * recurses back down.
1021 */
1022 if (good < 0) {
1023 pip->flags |= HAMMER_INODE_REFLUSH;
1024 return(good);
1025 }
1026
1027 /*
1028 * We are go, place the parent inode in a flushing state so we can
1029 * place its record in a flushing state. Note that the parent
1030 * may already be flushing. The record must be in the same flush
1031 * group as the parent.
1032 */
1033 if (pip->flush_state != HAMMER_FST_FLUSH)
1034 hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
1035 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1036 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1037
1038#if 0
1039 if (record->type == HAMMER_MEM_RECORD_DEL &&
1040 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1041 /*
1042 * Regardless of flushing state we cannot sync this path if the
1043 * record represents a delete-on-disk but the target inode
1044 * is not ready to sync its own deletion.
1045 *
1046 * XXX need to count effective nlinks to determine whether
1047 * the flush is ok, otherwise removing a hardlink will
1048 * just leave the DEL record to rot.
1049 */
1050 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1051 return(-1);
1052 } else
1053#endif
1054 if (pip->flush_group == pip->hmp->flusher.next) {
1055 /*
1056 * This is the record we wanted to synchronize. If the
1057 * record went into a flush state while we blocked it
1058 * had better be in the correct flush group.
1059 */
1060 if (record->flush_state != HAMMER_FST_FLUSH) {
1061 record->flush_state = HAMMER_FST_FLUSH;
1062 record->flush_group = pip->flush_group;
1063 hammer_ref(&record->lock);
1064 } else {
1065 KKASSERT(record->flush_group == pip->flush_group);
1066 }
1067 if (record->type == HAMMER_MEM_RECORD_ADD)
1068 return(1);
1069
1070 /*
1071 * A general or delete-on-disk record does not contribute
1072 * to our visibility. We can still flush it, however.
1073 */
1074 return(0);
1075 } else {
1076 /*
1077 * We couldn't resolve the dependencies, so request that the
1078 * inode be flushed when the dependencies can be resolved.
1079 */
1080 pip->flags |= HAMMER_INODE_REFLUSH;
1081 return(-1);
1082 }
1083}
1084
1085/*
1086 * This is the core routine placing an inode into the FST_FLUSH state.
1087 */
1088static void
1089hammer_flush_inode_core(hammer_inode_t ip, int flags)
1090{
1091 int go_count;
1092
1093 /*
1094 * Set flush state and prevent the flusher from cycling into
1095 * the next flush group. Do not place the ip on the list yet.
1096 * Inodes not in the idle state get an extra reference.
1097 */
1098 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1099 if (ip->flush_state == HAMMER_FST_IDLE)
1100 hammer_ref(&ip->lock);
1101 ip->flush_state = HAMMER_FST_FLUSH;
1102 ip->flush_group = ip->hmp->flusher.next;
1103 ++ip->hmp->flusher.group_lock;
1104 ++ip->hmp->count_iqueued;
1105 ++hammer_count_iqueued;
1106
1107 /*
1108 * We need to be able to vfsync/truncate from the backend.
1109 */
1110 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1111 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1112 ip->flags |= HAMMER_INODE_VHELD;
1113 vref(ip->vp);
1114 }
1115
1116 /*
1117 * Figure out how many in-memory records we can actually flush
1118 * (not including inode meta-data, buffers, etc).
1119 */
1120 if (flags & HAMMER_FLUSH_RECURSION) {
1121 go_count = 1;
1122 } else {
1123 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1124 hammer_setup_child_callback, NULL);
1125 }
1126
1127 /*
1128 * This is a more involved test that includes go_count. If we
1129 * can't flush, flag the inode and return. If go_count is 0 we
1130 * are unable to flush any records in our rec_tree and
1131 * must ignore the XDIRTY flag.
1132 */
1133 if (go_count == 0) {
1134 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1135 ip->flags |= HAMMER_INODE_REFLUSH;
1136
1137 --ip->hmp->count_iqueued;
1138 --hammer_count_iqueued;
1139
1140 ip->flush_state = HAMMER_FST_SETUP;
1141 if (ip->flags & HAMMER_INODE_VHELD) {
1142 ip->flags &= ~HAMMER_INODE_VHELD;
1143 vrele(ip->vp);
1144 }
1145 if (flags & HAMMER_FLUSH_SIGNAL) {
1146 ip->flags |= HAMMER_INODE_RESIGNAL;
1147 hammer_flusher_async(ip->hmp);
1148 }
1149 if (--ip->hmp->flusher.group_lock == 0)
1150 wakeup(&ip->hmp->flusher.group_lock);
1151 return;
1152 }
1153 }
1154
1155 /*
1156 * Snapshot the state of the inode for the backend flusher.
1157 *
1158 * The truncation must be retained in the frontend until after
1159 * we've actually performed the record deletion.
1160 *
1161 * We continue to retain sync_trunc_off even when all truncations
1162 * have been resolved as an optimization to determine if we can
1163 * skip the B-Tree lookup for overwrite deletions.
1164 *
1165 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
1166 * and stays in ip->flags. Once set, it stays set until the
1167 * inode is destroyed.
1168 */
1169 ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
1170 if (ip->sync_flags & HAMMER_INODE_TRUNCATED)
1171 ip->sync_trunc_off = ip->trunc_off;
1172 ip->sync_ino_leaf = ip->ino_leaf;
1173 ip->sync_ino_data = ip->ino_data;
1174 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1175 ip->flags &= ~HAMMER_INODE_MODMASK;
1176#ifdef DEBUG_TRUNCATE
1177 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
1178 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
1179#endif
1180
1181 /*
1182 * The flusher list inherits our inode and reference.
1183 */
1184 TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
1185 if (--ip->hmp->flusher.group_lock == 0)
1186 wakeup(&ip->hmp->flusher.group_lock);
1187
1188 if (flags & HAMMER_FLUSH_SIGNAL) {
1189 hammer_flusher_async(ip->hmp);
1190 }
1191}
1192
1193/*
1194 * Callback for scan of ip->rec_tree. Try to include each record in our
1195 * flush. ip->flush_group has been set but the inode has not yet been
1196 * moved into a flushing state.
1197 *
1198 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
1199 * both inodes.
1200 *
1201 * We return 1 for any record placed or found in FST_FLUSH, which prevents
1202 * the caller from shortcutting the flush.
1203 */
1204static int
1205hammer_setup_child_callback(hammer_record_t rec, void *data)
1206{
1207 hammer_inode_t target_ip;
1208 hammer_inode_t ip;
1209 int r;
1210
1211 /*
1212 * Deleted records are ignored. Note that the flush detects deleted
1213 * front-end records at multiple points to deal with races. This is
1214 * just the first line of defense. The only time DELETED_FE cannot
1215 * be set is when HAMMER_RECF_INTERLOCK_BE is set.
1216 *
1217 * Don't get confused between record deletion and, say, directory
1218 * entry deletion. The deletion of a directory entry that is on
1219 * the media has nothing to do with the record deletion flags.
1220 */
1221 if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE))
1222 return(0);
1223
1224 /*
1225 * If the record is in an idle state it has no dependencies and
1226 * can be flushed.
1227 */
1228 ip = rec->ip;
1229 r = 0;
1230
1231 switch(rec->flush_state) {
1232 case HAMMER_FST_IDLE:
1233 /*
1234 * Record has no setup dependency, we can flush it.
1235 */
1236 KKASSERT(rec->target_ip == NULL);
1237 rec->flush_state = HAMMER_FST_FLUSH;
1238 rec->flush_group = ip->flush_group;
1239 hammer_ref(&rec->lock);
1240 r = 1;
1241 break;
1242 case HAMMER_FST_SETUP:
1243 /*
1244 * Record has a setup dependency. Try to include the
1245 * target ip in the flush.
1246 *
1247 * We have to be careful here, if we do not do the right
1248 * thing we can lose track of dirty inodes and the system
1249 * will lock up trying to allocate buffers.
1250 */
1251 target_ip = rec->target_ip;
1252 KKASSERT(target_ip != NULL);
1253 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
1254 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
1255 /*
1256 * If the target IP is already flushing in our group
1257 * we are golden, otherwise make sure the target
1258 * reflushes.
1259 */
1260 if (target_ip->flush_group == ip->flush_group) {
1261 rec->flush_state = HAMMER_FST_FLUSH;
1262 rec->flush_group = ip->flush_group;
1263 hammer_ref(&rec->lock);
1264 r = 1;
1265 } else {
1266 target_ip->flags |= HAMMER_INODE_REFLUSH;
1267 }
1268 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
1269 /*
1270 * If the target IP is not flushing we can force
1271 * it to flush. Even if it is unable to write out
1272 * any of its own records, we have at least one in
1273 * hand that we CAN deal with.
1274 */
1275 rec->flush_state = HAMMER_FST_FLUSH;
1276 rec->flush_group = ip->flush_group;
1277 hammer_ref(&rec->lock);
1278 hammer_flush_inode_core(target_ip,
1279 HAMMER_FLUSH_RECURSION);
1280 r = 1;
1281 } else {
1282 /*
1283 * General or delete-on-disk record.
1284 *
1285 * XXX this needs help. If a delete-on-disk we could
1286 * disconnect the target. If the target has its own
1287 * dependencies they really need to be flushed.
1288 *
1289 * XXX
1290 */
1291 rec->flush_state = HAMMER_FST_FLUSH;
1292 rec->flush_group = ip->flush_group;
1293 hammer_ref(&rec->lock);
1294 hammer_flush_inode_core(target_ip,
1295 HAMMER_FLUSH_RECURSION);
1296 r = 1;
1297 }
1298 break;
1299 case HAMMER_FST_FLUSH:
1300 /*
1301 * Record already associated with a flush group. It had
1302 * better be ours.
1303 */
1304 KKASSERT(rec->flush_group == ip->flush_group);
1305 r = 1;
1306 break;
1307 }
1308 return(r);
1309}
1310
1311/*
1312 * Wait for a previously queued flush to complete
1313 */
1314void
1315hammer_wait_inode(hammer_inode_t ip)
1316{
1317 while (ip->flush_state != HAMMER_FST_IDLE) {
1318 if (ip->flush_state == HAMMER_FST_SETUP) {
1319 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1320 } else {
1321 ip->flags |= HAMMER_INODE_FLUSHW;
1322 tsleep(&ip->flags, 0, "hmrwin", 0);
1323 }
1324 }
1325}
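
/*
 * Illustrative usage sketch, not taken from the HAMMER sources: a
 * synchronous flush from the frontend (an fsync-like path, for example)
 * signals the flusher and then waits for the inode to return to the
 * idle state.
 */
#if 0
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	hammer_wait_inode(ip);
#endif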
1326
1327/*
1328 * Called by the backend code when a flush has been completed.
1329 * The inode has already been removed from the flush list.
1330 *
1331 * A pipelined flush can occur, in which case we must re-enter the
1332 * inode on the list and re-copy its fields.
1333 */
1334void
1335hammer_flush_inode_done(hammer_inode_t ip)
1336{
1337 hammer_mount_t hmp;
1338 int dorel;
1339
1340 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1341
1342 hmp = ip->hmp;
1343
1344 /*
1345 * Merge left-over flags back into the frontend and fix the state.
1346 */
1347 ip->flags |= ip->sync_flags;
1348
1349 /*
1350 * The backend may have adjusted nlinks, so if the adjusted nlinks
1351 * does not match the frontend, set the frontend's DDIRTY flag again.
1352 */
1353 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
1354 ip->flags |= HAMMER_INODE_DDIRTY;
1355
1356 /*
1357 * Fix up the dirty buffer status. IO completions will also
1358 * try to clean up rsv_databufs.
1359 */
1360 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
1361 ip->flags |= HAMMER_INODE_BUFS;
1362 } else {
1363 hmp->rsv_databufs -= ip->rsv_databufs;
1364 ip->rsv_databufs = 0;
1365 }
1366
1367 /*
1368 * Re-set the XDIRTY flag if some of the inode's in-memory records
1369 * could not be flushed.
1370 */
1371 KKASSERT((RB_EMPTY(&ip->rec_tree) &&
1372 (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
1373 (!RB_EMPTY(&ip->rec_tree) &&
1374 (ip->flags & HAMMER_INODE_XDIRTY) != 0));
1375
1376 /*
1377 * Do not lose track of inodes which no longer have vnode
1378 * associations, otherwise they may never get flushed again.
1379 */
1380 if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
1381 ip->flags |= HAMMER_INODE_REFLUSH;
1382
1383 /*
1384 * Adjust flush_state. The target state (idle or setup) shouldn't
1385 * be terribly important since we will reflush if we really need
1386 * to do anything. XXX
1387 */
1388 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
1389 ip->flush_state = HAMMER_FST_IDLE;
1390 dorel = 1;
1391 } else {
1392 ip->flush_state = HAMMER_FST_SETUP;
1393 dorel = 0;
1394 }
1395
1396 --hmp->count_iqueued;
1397 --hammer_count_iqueued;
1398
1399 /*
1400 * Clean up the vnode ref
1401 */
1402 if (ip->flags & HAMMER_INODE_VHELD) {
1403 ip->flags &= ~HAMMER_INODE_VHELD;
1404 vrele(ip->vp);
1405 }
1406
1407 /*
1408 * If the frontend made more changes and requested another flush,
1409 * then try to get it running.
1410 */
1411 if (ip->flags & HAMMER_INODE_REFLUSH) {
1412 ip->flags &= ~HAMMER_INODE_REFLUSH;
1413 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1414 ip->flags &= ~HAMMER_INODE_RESIGNAL;
1415 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1416 } else {
1417 hammer_flush_inode(ip, 0);
1418 }
1419 }
1420
1421 /*
1422 * If the inode is now clean drop the space reservation.
1423 */
1424 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1425 (ip->flags & HAMMER_INODE_RSV_INODES)) {
1426 ip->flags &= ~HAMMER_INODE_RSV_INODES;
1427 --hmp->rsv_inodes;
1428 }
1429
1430 /*
1431 * Finally, if the frontend is waiting for a flush to complete,
1432 * wake it up.
1433 */
1434 if (ip->flush_state != HAMMER_FST_FLUSH) {
1435 if (ip->flags & HAMMER_INODE_FLUSHW) {
1436 ip->flags &= ~HAMMER_INODE_FLUSHW;
1437 wakeup(&ip->flags);
1438 }
1439 }
1440 if (dorel)
1441 hammer_rel_inode(ip, 0);
1442}
1443
1444/*
1445 * Called from hammer_sync_inode() to synchronize in-memory records
1446 * to the media.
1447 */
1448static int
1449hammer_sync_record_callback(hammer_record_t record, void *data)
1450{
1451 hammer_cursor_t cursor = data;
1452 hammer_transaction_t trans = cursor->trans;
1453 int error;
1454
1455 /*
1456 * Skip records that do not belong to the current flush.
1457 */
1458 ++hammer_stats_record_iterations;
1459 if (record->flush_state != HAMMER_FST_FLUSH)
1460 return(0);
1461
1462#if 1
1463 if (record->flush_group != record->ip->flush_group) {
1464 kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group, record->ip->flush_group);
1465 Debugger("blah2");
1466 return(0);
1467 }
1468#endif
1469 KKASSERT(record->flush_group == record->ip->flush_group);
1470
1471 /*
1472 * Interlock the record using the BE flag. Once BE is set the
1473 * frontend cannot change the state of FE.
1474 *
1475 * NOTE: If FE is set prior to us setting BE we still sync the
1476 * record out, but the flush completion code converts it to
1477 * a delete-on-disk record instead of destroying it.
1478 */
1479 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1480 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1481
1482 /*
1483 * The backend may have already disposed of the record.
1484 */
1485 if (record->flags & HAMMER_RECF_DELETED_BE) {
1486 error = 0;
1487 goto done;
1488 }
1489
1490 /*
1491 * If the whole inode is being deleted, all on-disk records will
1492 * be deleted very soon; we can't sync any new records to disk
1493 * because they will be deleted in the same transaction they were
1494 * created in (delete_tid == create_tid), which will assert.
1495 *
1496 * XXX There may be a case with RECORD_ADD with DELETED_FE set
1497 * that we currently panic on.
1498 */
1499 if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
1500 switch(record->type) {
1501 case HAMMER_MEM_RECORD_DATA:
1502 /*
1503 * We don't have to do anything, if the record was
1504 * committed the space will have been accounted for
1505 * in the blockmap.
1506 */
1507 /* fall through */
1508 case HAMMER_MEM_RECORD_GENERAL:
1509 record->flags |= HAMMER_RECF_DELETED_FE;
1510 record->flags |= HAMMER_RECF_DELETED_BE;
1511 error = 0;
1512 goto done;
1513 case HAMMER_MEM_RECORD_ADD:
1514 panic("hammer_sync_record_callback: illegal add "
1515 "during inode deletion record %p", record);
1516 break; /* NOT REACHED */
1517 case HAMMER_MEM_RECORD_INODE:
1518 panic("hammer_sync_record_callback: attempt to "
1519 "sync inode record %p?", record);
1520 break; /* NOT REACHED */
1521 case HAMMER_MEM_RECORD_DEL:
1522 /*
1523 * Follow through and issue the on-disk deletion
1524 */
1525 break;
1526 }
1527 }
1528
1529 /*
1530 * If DELETED_FE is set special handling is needed for directory
1531 * entries. Dependent pieces related to the directory entry may
1532 * have already been synced to disk. If this occurs we have to
1533 * sync the directory entry and then change the in-memory record
1534 * from an ADD to a DELETE to cover the fact that it's been
1535 * deleted by the frontend.
1536 *
1537 * A directory delete covering record (MEM_RECORD_DEL) can never
1538 * be deleted by the frontend.
1539 *
1540 * Any other record type (aka DATA) can be deleted by the frontend.
1541 * XXX At the moment the flusher must skip it because there may
1542 * be another data record in the flush group for the same block,
1543 * meaning that some frontend data changes can leak into the backend's
1544 * synchronization point.
1545 */
1546 if (record->flags & HAMMER_RECF_DELETED_FE) {
1547 if (record->type == HAMMER_MEM_RECORD_ADD) {
1548 record->flags |= HAMMER_RECF_CONVERT_DELETE;
1549 } else {
1550 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
1551 record->flags |= HAMMER_RECF_DELETED_BE;
1552 error = 0;
1553 goto done;
1554 }
1555 }
1556
1557 /*
1558 * Assign the create_tid for new records. Deletions already
1559 * have the record's entire key properly set up.
1560 */
1561 if (record->type != HAMMER_MEM_RECORD_DEL)
1562 record->leaf.base.create_tid = trans->tid;
1563 for (;;) {
1564 error = hammer_ip_sync_record_cursor(cursor, record);
1565 if (error != EDEADLK)
1566 break;
1567 hammer_done_cursor(cursor);
1568 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
1569 record->ip);
1570 if (error)
1571 break;
1572 }
1573 record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
1574
1575 if (error) {
1576 error = -error;
1577 if (error != -ENOSPC) {
1578 kprintf("hammer_sync_record_callback: sync failed rec "
1579 "%p, error %d\n", record, error);
1580 Debugger("sync failed rec");
1581 }
1582 }
1583done:
1584 hammer_flush_record_done(record, error);
1585 return(error);
1586}
1587
1588/*
1589 * XXX error handling
1590 */
1591int
1592hammer_sync_inode(hammer_inode_t ip)
1593{
1594 struct hammer_transaction trans;
1595 struct hammer_cursor cursor;
1596 hammer_node_t tmp_node;
1597 hammer_record_t depend;
1598 hammer_record_t next;
1599 int error, tmp_error;
1600 u_int64_t nlinks;
1601
1602 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
1603 return(0);
1604
1605 hammer_start_transaction_fls(&trans, ip->hmp);
1606 error = hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1607 if (error)
1608 goto done;
1609
1610 /*
1611 * Any directory records referencing this inode which are not in
1612 * our current flush group must adjust our nlink count for the
1613 * purposes of synchronization to disk.
1614 *
1615 * Records which are in our flush group can be unlinked from our
1616 * inode now, potentially allowing the inode to be physically
1617 * deleted.
1618 *
1619 * This cannot block.
1620 */
1621 nlinks = ip->ino_data.nlinks;
1622 next = TAILQ_FIRST(&ip->target_list);
1623 while ((depend = next) != NULL) {
1624 next = TAILQ_NEXT(depend, target_entry);
1625 if (depend->flush_state == HAMMER_FST_FLUSH &&
1626 depend->flush_group == ip->hmp->flusher.act) {
1627 /*
1628 * If this is an ADD that was deleted by the frontend
1629 * the frontend nlinks count will have already been
1630 * decremented, but the backend is going to sync its
1631 * directory entry and must account for it. The
1632 * record will be converted to a delete-on-disk when
1633 * it gets synced.
1634 *
1635 * If the ADD was not deleted by the frontend we
1636 * can remove the dependency from our target_list.
1637 */
1638 if (depend->flags & HAMMER_RECF_DELETED_FE) {
1639 ++nlinks;
1640 } else {
1641 TAILQ_REMOVE(&ip->target_list, depend,
1642 target_entry);
1643 depend->target_ip = NULL;
1644 }
1645 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
1646 /*
1647 * Not part of our flush group
1648 */
1649 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
1650 switch(depend->type) {
1651 case HAMMER_MEM_RECORD_ADD:
1652 --nlinks;
1653 break;
1654 case HAMMER_MEM_RECORD_DEL:
1655 ++nlinks;
1656 break;
1657 default:
1658 break;
1659 }
1660 }
1661 }
1662
1663 /*
1664 * Set dirty if we had to modify the link count.
1665 */
1666 if (ip->sync_ino_data.nlinks != nlinks) {
1667 KKASSERT((int64_t)nlinks >= 0);
1668 ip->sync_ino_data.nlinks = nlinks;
1669 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1670 }
1671
1672 /*
1673 * If there is a truncation queued, destroy any data past the (aligned)
1674 * truncation point. Userland will have dealt with the buffer
1675 * containing the truncation point for us.
1676 *
1677 * We don't flush pending frontend data buffers until after we've
1678 * dealt with the truncation.
1679 */
1680 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1681 /*
1682 * Interlock trunc_off. The VOP front-end may continue to
1683 * make adjustments to it while we are blocked.
1684 */
1685 off_t trunc_off;
1686 off_t aligned_trunc_off;
1687 int blkmask;
1688
1689 trunc_off = ip->sync_trunc_off;
1690 blkmask = hammer_blocksize(trunc_off) - 1;
1691 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
1692
1693 /*
1694 * Delete any whole blocks on-media. The front-end has
1695 * already cleaned out any partial block and made it
1696 * pending. The front-end may have updated trunc_off
1697 * while we were blocked so we only use sync_trunc_off.
1698 */
1699 error = hammer_ip_delete_range(&cursor, ip,
1700 aligned_trunc_off,
1701 0x7FFFFFFFFFFFFFFFLL, 1);
1702 if (error)
1703 Debugger("hammer_ip_delete_range errored");
1704
1705 /*
1706 * Clear the truncation flag on the backend after we have
1707 * completed the deletions. Backend data is now good again
1708 * (including new records we are about to sync, below).
1709 *
1710 * Leave sync_trunc_off intact. As we write additional
1711 * records the backend will update sync_trunc_off. This
1712 * tells the backend whether it can skip the overwrite
1713 * test. This should work properly even when the backend
1714 * writes full blocks where the truncation point straddles
1715 * the block because the comparison is against the base
1716 * offset of the record.
1717 */
1718 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1719 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
1720 } else {
1721 error = 0;
1722 }
1723
1724 /*
1725 * Now sync related records. These will typically be directory
1726 * entries or delete-on-disk records.
1727 *
1728 * Not all records will be flushed, but clear XDIRTY anyway. We
1729 * will set it again in the frontend hammer_flush_inode_done()
1730 * if records remain.
1731 */
1732 if (error == 0) {
1733 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1734 hammer_sync_record_callback, &cursor);
1735 if (tmp_error < 0)
1736 tmp_error = -tmp_error;
1737 if (tmp_error)
1738 error = tmp_error;
1739 }
1740 hammer_cache_node(&ip->cache[1], cursor.node);
1741
1742 /*
1743 * Re-seek for inode update.
1744 */
1745 if (error == 0) {
1746 tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
1747 if (tmp_node) {
1748 hammer_cursor_seek(&cursor, tmp_node, 0);
1749 hammer_rel_node(tmp_node);
1750 }
1751 error = 0;
1752 }
1753
1754 /*
1755 * If we are deleting the inode the frontend had better not have
1756 * any active references on elements making up the inode.
1757 */
1758 if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
1759 RB_EMPTY(&ip->rec_tree) &&
1760 (ip->sync_flags & HAMMER_INODE_DELETING) &&
1761 (ip->flags & HAMMER_INODE_DELETED) == 0) {
1762 int count1 = 0;
1763
1764 ip->flags |= HAMMER_INODE_DELETED;
1765 error = hammer_ip_delete_range_all(&cursor, ip, &count1);
1766 if (error == 0) {
1767 ip->sync_flags &= ~HAMMER_INODE_DELETING;
1768 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1769 KKASSERT(RB_EMPTY(&ip->rec_tree));
1770
1771 /*
1772 * Set delete_tid in both the frontend and backend
1773 * copy of the inode record. The DELETED flag handles
1774 * this, do not set RDIRTY.
1775 */
1776 ip->ino_leaf.base.delete_tid = trans.tid;
1777 ip->sync_ino_leaf.base.delete_tid = trans.tid;
1778
1779 /*
1780 * Adjust the inode count in the volume header
1781 */
1782 if (ip->flags & HAMMER_INODE_ONDISK) {
1783 hammer_modify_volume_field(&trans,
1784 trans.rootvol,
1785 vol0_stat_inodes);
1786 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1787 hammer_modify_volume_done(trans.rootvol);
1788 }
1789 } else {
1790 ip->flags &= ~HAMMER_INODE_DELETED;
1791 Debugger("hammer_ip_delete_range_all errored");
1792 }
1793 }
1794
1795 ip->sync_flags &= ~HAMMER_INODE_BUFS;
1796
1797 if (error)
1798 Debugger("RB_SCAN errored");
1799
1800 /*
1801 * Now update the inode's on-disk inode-data and/or on-disk record.
1802 * DELETED and ONDISK are managed only in ip->flags.
1803 */
1804 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
1805 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
1806 /*
1807 * If deleted and on-disk, don't set any additional flags.
1808 * The delete flag takes care of things.
1809 *
1810 * Clear flags which may have been set by the frontend.
1811 */
1812 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY|
1813 HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1814 HAMMER_INODE_DELETING);
1815 break;
1816 case HAMMER_INODE_DELETED:
1817 /*
1818 * Take care of the case where a deleted inode was never
1819 * flushed to the disk in the first place.
1820 *
1821 * Clear flags which may have been set by the frontend.
1822 */
1823 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY|
1824 HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1825 HAMMER_INODE_DELETING);
1826 while (RB_ROOT(&ip->rec_tree)) {
1827 hammer_record_t record = RB_ROOT(&ip->rec_tree);
1828 hammer_ref(&record->lock);
1829 KKASSERT(record->lock.refs == 1);
1830 record->flags |= HAMMER_RECF_DELETED_FE;
1831 record->flags |= HAMMER_RECF_DELETED_BE;
1832 hammer_rel_mem_record(record);
1833 }
1834 break;
1835 case HAMMER_INODE_ONDISK:
1836 /*
1837 * If already on-disk, do not set any additional flags.
1838 */
1839 break;
1840 default:
1841 /*
1842 * If not on-disk and not deleted, set both dirty flags
1843 * to force an initial record to be written. Also set
1844 * the create_tid for the inode.
1845 *
1846 * Set create_tid in both the frontend and backend
1847 * copy of the inode record.
1848 */
1849 ip->ino_leaf.base.create_tid = trans.tid;
1850 ip->sync_ino_leaf.base.create_tid = trans.tid;
1851 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1852 break;
1853 }
1854
1855 /*
1856 * If RDIRTY or DDIRTY is set, write out a new record. If the inode
1857 * is already on-disk the old record is marked as deleted.
1858 *
1859 * If DELETED is set hammer_update_inode() will delete the existing
1860 * record without writing out a new one.
1861 *
1862 * If *ONLY* the ITIMES flag is set we can update the record in-place.
1863 */
1864 if (ip->flags & HAMMER_INODE_DELETED) {
1865 error = hammer_update_inode(&cursor, ip);
1866 } else
1867 if ((ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES)) ==
1868 HAMMER_INODE_ITIMES) {
1869 error = hammer_update_itimes(&cursor, ip);
1870 } else
1871 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES)) {
1872 error = hammer_update_inode(&cursor, ip);
1873 }
1874 if (error)
1875 Debugger("hammer_update_itimes/inode errored");
1876done:
1877 /*
1878 * Save the TID we used to sync the inode with to make sure we
1879 * do not improperly reuse it.
1880 */
1881 hammer_done_cursor(&cursor);
1882 hammer_done_transaction(&trans);
1883 return(error);
1884}
1885
1886/*
1887 * This routine is called when the OS is no longer actively referencing
1888 * the inode (but might still be keeping it cached), or when releasing
1889 * the last reference to an inode.
1890 *
1891 * At this point if the inode's nlinks count is zero we want to destroy
1892 * it, which may mean destroying it on-media too.
1893 */
1894void
1895hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
1896{
1897 struct vnode *vp;
1898
1899 /*
1900 * Set the DELETING flag when the link count drops to 0 and the
1901 * OS no longer has any opens on the inode.
1902 *
1903 * The backend will clear DELETING (a mod flag) and set DELETED
1904 * (a state flag) when it is actually able to perform the
1905 * operation.
1906 */
1907 if (ip->ino_data.nlinks == 0 &&
1908 (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
1909 ip->flags |= HAMMER_INODE_DELETING;
1910 ip->flags |= HAMMER_INODE_TRUNCATED;
1911 ip->trunc_off = 0;
1912 vp = NULL;
1913 if (getvp) {
1914 if (hammer_get_vnode(ip, &vp) != 0)
1915 return;
1916 }
1917
1918 /*
1919 * Final cleanup
1920 */
1921 if (ip->vp) {
1922 vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
1923 vnode_pager_setsize(ip->vp, 0);
1924 }
1925 if (getvp) {
1926 vput(vp);
1927 }
1928 }
1929}
1930
1931/*
1932 * Re-test an inode when a dependency has gone away to see if we
1933 * can chain flush it.
1934 */
1935void
1936hammer_test_inode(hammer_inode_t ip)
1937{
1938 if (ip->flags & HAMMER_INODE_REFLUSH) {
1939 ip->flags &= ~HAMMER_INODE_REFLUSH;
1940 hammer_ref(&ip->lock);
1941 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1942 ip->flags &= ~HAMMER_INODE_RESIGNAL;
1943 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1944 } else {
1945 hammer_flush_inode(ip, 0);
1946 }
1947 hammer_rel_inode(ip, 0);
1948 }
1949}
1950
1951/*
1952 * Clear the RECLAIM flag on an inode. This occurs when the inode is
1953 * reassociated with a vp or just before it gets freed.
1954 *
1955 * Wake up one thread blocked waiting on reclaims to complete. Note that
1956 * the inode the thread is waiting on behalf of is a different inode than
1957 * the inode we are called with. This is to create a pipeline.
1958 */
1959static void
1960hammer_inode_wakereclaims(hammer_inode_t ip)
1961{
1962 struct hammer_reclaim *reclaim;
1963 hammer_mount_t hmp = ip->hmp;
1964
1965 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
1966 return;
1967
1968 --hammer_count_reclaiming;
1969 --hmp->inode_reclaims;
1970 ip->flags &= ~HAMMER_INODE_RECLAIM;
1971
1972 if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
1973 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
1974 reclaim->okydoky = 1;
1975 wakeup(reclaim);
1976 }
1977}
1978
1979/*
1980 * Set up our reclaim pipeline. We only let so many detached (and dirty)
1981 * inodes build up before we start blocking.
1982 *
1983 * When we block we don't care *which* inode has finished reclaiming,
1984 * as long as one does. This is somewhat heuristic... we also put a
1985 * cap on how long we are willing to wait.
1986 */
1987void
1988hammer_inode_waitreclaims(hammer_mount_t hmp)
1989{
1990 struct hammer_reclaim reclaim;
1991 int delay;
1992
1993 if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT) {
1994 reclaim.okydoky = 0;
1995 TAILQ_INSERT_TAIL(&hmp->reclaim_list,
1996 &reclaim, entry);
1997 } else {
1998 reclaim.okydoky = 1;
1999 }
2000
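	/*
	 * The sleep below grows linearly with the reclaim backlog.  For
	 * example, a backlog of 2 * HAMMER_RECLAIM_WAIT detached inodes
	 * yields a delay of hz ticks (roughly one second), unless
	 * hammer_inode_wakereclaims() wakes us sooner.
	 */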
2001 if (reclaim.okydoky == 0) {
2002 delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
2003 HAMMER_RECLAIM_WAIT;
2004 if (delay >= 0)
2005 tsleep(&reclaim, 0, "hmrrcm", delay + 1);
2006 if (reclaim.okydoky == 0)
2007 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
2008 }
2009}
2010