gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.47 2008/05/03 05:28:55 dillon Exp $
	35	*/
	36
	37	#include "hammer.h"
	38	#include <vm/vm_extern.h>
	39	#include <sys/buf.h>
	40	#include <sys/buf2.h>
	41
		/* Forward declarations of file-local (static) helper functions. */
	42	static int hammer_unload_inode(struct hammer_inode *ip);
	43	static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
	44	static int hammer_setup_child_callback(hammer_record_t rec, void *data);
	45	static void hammer_inode_unloadable_check(hammer_inode_t ip);
	46	static int hammer_setup_parent_inodes(hammer_record_t record);
	47
	48	/*
	49	* The kernel is not actively referencing this vnode but is still holding
	50	* it cached.
	51	*
	52	* This is called from the frontend.
	53	*/
	54	int
	55	hammer_vop_inactive(struct vop_inactive_args *ap)
	56	{
	57	struct hammer_inode *ip = VTOI(ap->a_vp);
	58
	59	/*
	60	* Degenerate case
	61	*/
	62	if (ip == NULL) {
	63	vrecycle(ap->a_vp);
	64	return(0);
	65	}
	66
	67	/*
	68	* If the inode no longer has visibility in the filesystem and is
	69	* fairly clean, try to recycle it immediately. This can deadlock
	70	* in vfsync() if we aren't careful.
	71	*/
	72	hammer_inode_unloadable_check(ip);
	73	if (ip->flags & HAMMER_INODE_MODMASK)
	74	hammer_flush_inode(ip, 0);
	75	else if (ip->ino_rec.ino_nlinks == 0)
	76	vrecycle(ap->a_vp);
	77	return(0);
	78	}
	79
	80	/*
	81	* Release the vnode association. This is typically (but not always)
	82	* the last reference on the inode.
	83	*
	84	* Once the association is lost we are on our own with regards to
	85	* flushing the inode.
	86	*/
	87	int
	88	hammer_vop_reclaim(struct vop_reclaim_args *ap)
	89	{
	90	struct hammer_inode *ip;
	91	struct vnode *vp;
	92
	93	vp = ap->a_vp;
	94
	95	if ((ip = vp->v_data) != NULL) {
	96	vp->v_data = NULL;
	97	ip->vp = NULL;
	98	hammer_rel_inode(ip, 1);
	99	}
	100	return(0);
	101	}
	102
	103	/*
	104	* Return a locked vnode for the specified inode. The inode must be
	105	* referenced but NOT LOCKED on entry and will remain referenced on
	106	* return.
	107	*
	108	* Called from the frontend.
	109	*/
	110	int
	111	hammer_get_vnode(struct hammer_inode ip, int lktype, struct vnode *vpp)
	112	{
	113	struct vnode *vp;
	114	int error = 0;
	115
	116	for (;;) {
	117	if ((vp = ip->vp) == NULL) {
	118	error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
	119	if (error)
	120	break;
	121	hammer_lock_ex(&ip->lock);
	122	if (ip->vp != NULL) {
	123	hammer_unlock(&ip->lock);
	124	vp->v_type = VBAD;
	125	vx_put(vp);
	126	continue;
	127	}
	128	hammer_ref(&ip->lock);
	129	vp = *vpp;
	130	ip->vp = vp;
	131	vp->v_type = hammer_get_vnode_type(
	132	ip->ino_rec.base.base.obj_type);
	133
	134	switch(ip->ino_rec.base.base.obj_type) {
	135	case HAMMER_OBJTYPE_CDEV:
	136	case HAMMER_OBJTYPE_BDEV:
	137	vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
	138	addaliasu(vp, ip->ino_data.rmajor,
	139	ip->ino_data.rminor);
	140	break;
	141	case HAMMER_OBJTYPE_FIFO:
	142	vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
	143	break;
	144	default:
	145	break;
	146	}
	147
	148	/*
	149	* Only mark as the root vnode if the ip is not
	150	* historical, otherwise the VFS cache will get
	151	* confused. The other half of the special handling
	152	* is in hammer_vop_nlookupdotdot().
	153	*/
	154	if (ip->obj_id == HAMMER_OBJID_ROOT &&
	155	ip->obj_asof == ip->hmp->asof) {
	156	vp->v_flag \|= VROOT;
	157	}
	158
	159	vp->v_data = (void *)ip;
	160	/* vnode locked by getnewvnode() */
	161	/* make related vnode dirty if inode dirty? */
	162	hammer_unlock(&ip->lock);
	163	if (vp->v_type == VREG)
	164	vinitvmio(vp, ip->ino_rec.ino_size);
	165	break;
	166	}
	167
	168	/*
	169	* loop if the vget fails (aka races), or if the vp
	170	* no longer matches ip->vp.
	171	*/
	172	if (vget(vp, LK_EXCLUSIVE) == 0) {
	173	if (vp == ip->vp)
	174	break;
	175	vput(vp);
	176	}
	177	}
	178	*vpp = vp;
	179	return(error);
	180	}
	181
	182	/*
	183	* Acquire a HAMMER inode. The returned inode is not locked. These functions
	184	* do not attach or detach the related vnode (use hammer_get_vnode() for
	185	* that).
	186	*
	187	* The flags argument is only applied for newly created inodes, and only
	188	* certain flags are inherited.
	189	*
	190	* Called from the frontend.
	191	*/
	192	struct hammer_inode *
	193	hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
	194	u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
	195	{
	196	hammer_mount_t hmp = trans->hmp;
	197	struct hammer_inode_info iinfo;
	198	struct hammer_cursor cursor;
	199	struct hammer_inode *ip;
	200
	201	/*
	202	* Determine if we already have an inode cached. If we do then
	203	* we are golden.
	204	*/
	205	iinfo.obj_id = obj_id;
	206	iinfo.obj_asof = asof;
	207	loop:
	208	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	209	if (ip) {
	210	hammer_ref(&ip->lock);
	211	*errorp = 0;
	212	return(ip);
	213	}
	214
	215	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK\|M_ZERO);
	216	++hammer_count_inodes;
	217	ip->obj_id = obj_id;
	218	ip->obj_asof = iinfo.obj_asof;
	219	ip->hmp = hmp;
	220	ip->flags = flags & HAMMER_INODE_RO;
	221	if (hmp->ronly)
	222	ip->flags \|= HAMMER_INODE_RO;
	223	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	224	RB_INIT(&ip->rec_tree);
	225	TAILQ_INIT(&ip->bio_list);
	226	TAILQ_INIT(&ip->bio_alt_list);
	227	TAILQ_INIT(&ip->target_list);
	228
	229	/*
	230	* Locate the on-disk inode.
	231	*/
	232	retry:
	233	hammer_init_cursor(trans, &cursor, cache, NULL);
	234	cursor.key_beg.obj_id = ip->obj_id;
	235	cursor.key_beg.key = 0;
	236	cursor.key_beg.create_tid = 0;
	237	cursor.key_beg.delete_tid = 0;
	238	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	239	cursor.key_beg.obj_type = 0;
	240	cursor.asof = iinfo.obj_asof;
	241	cursor.flags = HAMMER_CURSOR_GET_RECORD \| HAMMER_CURSOR_GET_DATA \|
	242	HAMMER_CURSOR_ASOF;
	243
	244	*errorp = hammer_btree_lookup(&cursor);
	245	if (*errorp == EDEADLK) {
	246	hammer_done_cursor(&cursor);
	247	goto retry;
	248	}
	249
	250	/*
	251	* On success the B-Tree lookup will hold the appropriate
	252	* buffer cache buffers and provide a pointer to the requested
	253	* information. Copy the information to the in-memory inode
	254	* and cache the B-Tree node to improve future operations.
	255	*/
	256	if (*errorp == 0) {
	257	ip->ino_rec = cursor.record->inode;
	258	ip->ino_data = cursor.data->inode;
	259	hammer_cache_node(cursor.node, &ip->cache[0]);
	260	if (cache)
	261	hammer_cache_node(cursor.node, cache);
	262	}
	263
	264	/*
	265	* On success load the inode's record and data and insert the
	266	* inode into the B-Tree. It is possible to race another lookup
	267	* insertion of the same inode so deal with that condition too.
	268	*
	269	* The cursor's locked node interlocks against others creating and
	270	* destroying ip while we were blocked.
	271	*/
	272	if (*errorp == 0) {
	273	hammer_ref(&ip->lock);
	274	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
	275	hammer_uncache_node(&ip->cache[0]);
	276	hammer_uncache_node(&ip->cache[1]);
	277	KKASSERT(ip->lock.refs == 1);
	278	--hammer_count_inodes;
	279	kfree(ip, M_HAMMER);
	280	hammer_done_cursor(&cursor);
	281	goto loop;
	282	}
	283	ip->flags \|= HAMMER_INODE_ONDISK;
	284	} else {
	285	--hammer_count_inodes;
	286	kfree(ip, M_HAMMER);
	287	ip = NULL;
	288	}
	289	hammer_done_cursor(&cursor);
	290	return (ip);
	291	}
	292
	293	/*
	294	* Create a new filesystem object, returning the inode in *ipp. The
	295	* returned inode will be referenced.
	296	*
	297	* The inode is created in-memory.
	298	*/
	299	int
	300	hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
	301	struct ucred *cred, hammer_inode_t dip,
	302	struct hammer_inode **ipp)
	303	{
	304	hammer_mount_t hmp;
	305	hammer_inode_t ip;
	306	uid_t xuid;
	307
	308	hmp = trans->hmp;
	309	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK\|M_ZERO);
	310	++hammer_count_inodes;
	311	ip->obj_id = hammer_alloc_objid(trans, dip);
	312	KKASSERT(ip->obj_id != 0);
	313	ip->obj_asof = hmp->asof;
	314	ip->hmp = hmp;
	315	ip->flush_state = HAMMER_FST_IDLE;
	316	ip->flags = HAMMER_INODE_DDIRTY \| HAMMER_INODE_RDIRTY \|
	317	HAMMER_INODE_ITIMES;
	318
	319	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	320	RB_INIT(&ip->rec_tree);
	321	TAILQ_INIT(&ip->bio_list);
	322	TAILQ_INIT(&ip->bio_alt_list);
	323	TAILQ_INIT(&ip->target_list);
	324
	325	ip->ino_rec.ino_atime = trans->time;
	326	ip->ino_rec.ino_mtime = trans->time;
	327	ip->ino_rec.ino_size = 0;
	328	ip->ino_rec.ino_nlinks = 0;
	329	/* XXX */
	330	ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	331	ip->ino_rec.base.base.obj_id = ip->obj_id;
	332	ip->ino_rec.base.base.key = 0;
	333	ip->ino_rec.base.base.create_tid = 0;
	334	ip->ino_rec.base.base.delete_tid = 0;
	335	ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
	336	ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);
	337
	338	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	339	ip->ino_data.mode = vap->va_mode;
	340	ip->ino_data.ctime = trans->time;
	341	ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;
	342
	343	switch(ip->ino_rec.base.base.obj_type) {
	344	case HAMMER_OBJTYPE_CDEV:
	345	case HAMMER_OBJTYPE_BDEV:
	346	ip->ino_data.rmajor = vap->va_rmajor;
	347	ip->ino_data.rminor = vap->va_rminor;
	348	break;
	349	default:
	350	break;
	351	}
	352
	353	/*
	354	* Calculate default uid/gid and overwrite with information from
	355	* the vap.
	356	*/
	357	xuid = hammer_to_unix_xid(&dip->ino_data.uid);
	358	ip->ino_data.gid = dip->ino_data.gid;
	359	xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
	360	&vap->va_mode);
	361	ip->ino_data.mode = vap->va_mode;
	362
	363	if (vap->va_vaflags & VA_UID_UUID_VALID)
	364	ip->ino_data.uid = vap->va_uid_uuid;
	365	else if (vap->va_uid != (uid_t)VNOVAL)
	366	hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
	367	if (vap->va_vaflags & VA_GID_UUID_VALID)
	368	ip->ino_data.gid = vap->va_gid_uuid;
	369	else if (vap->va_gid != (gid_t)VNOVAL)
	370	hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
	371
	372	hammer_ref(&ip->lock);
	373	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
	374	hammer_unref(&ip->lock);
	375	panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
	376	}
	377	*ipp = ip;
	378	return(0);
	379	}
	380
	381	/*
	382	* Called by hammer_sync_inode().
		*
		* Writes the inode's snapshot (sync_ino_rec / sync_ino_data) to the
		* media through the caller's cursor:
		*
		* 1. If the inode is ONDISK and not already DELONDISK, look up the
		*    existing inode record and delete it as of trans->tid.
		* 2. Unless the inode is flagged DELETED, build a temporary in-memory
		*    record from the snapshot and sync it out.
		* 3. On success clear RDIRTY/DDIRTY/ITIMES in ip->sync_flags.
		*
		* An EDEADLK result is handled by tearing the cursor down,
		* re-initializing it from ip->cache[0] and trying again.
		*
		* Returns 0 on success or the first error that could not be retried.
	383	*/
	384	static int
	385	hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
	386	{
	387	hammer_transaction_t trans = cursor->trans;
	388	hammer_record_t record;
	389	int error;
	390
	391	retry:
	392	error = 0;
	393
	394	/*
	395	* If the inode has a presence on-disk then locate it and mark
	396	* it deleted, setting DELONDISK.
	397	*
	398	* The record may or may not be physically deleted, depending on
	399	* the retention policy.
	400	*/
	401	if ((ip->flags & (HAMMER_INODE_ONDISK\|HAMMER_INODE_DELONDISK)) ==
	402	HAMMER_INODE_ONDISK) {
	403	hammer_normalize_cursor(cursor);
	404	cursor->key_beg.obj_id = ip->obj_id;
	405	cursor->key_beg.key = 0;
	406	cursor->key_beg.create_tid = 0;
	407	cursor->key_beg.delete_tid = 0;
	408	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	409	cursor->key_beg.obj_type = 0;
	410	cursor->asof = ip->obj_asof;
	411	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	412	cursor->flags \|= HAMMER_CURSOR_GET_RECORD \| HAMMER_CURSOR_ASOF;
	413	cursor->flags \|= HAMMER_CURSOR_BACKEND;
	414
		/*
		* NOTE(review): unlike the delete step below, any lookup error,
		* EDEADLK included, is reported and drops into the debugger
		* before the EDEADLK retry is attempted -- confirm intended.
		*/
	415	error = hammer_btree_lookup(cursor);
	416	if (error) {
	417	kprintf("error %d\n", error);
	418	Debugger("hammer_update_inode");
	419	}
	420
	421	if (error == 0) {
	422	error = hammer_ip_delete_record(cursor, trans->tid);
	423	if (error && error != EDEADLK) {
	424	kprintf("error %d\n", error);
	425	Debugger("hammer_update_inode2");
	426	}
	427	if (error == 0) {
	428	ip->flags \|= HAMMER_INODE_DELONDISK;
	429	}
	430	hammer_cache_node(cursor->node, &ip->cache[0]);
	431	}
		/*
		* Deadlock: rebuild the cursor and restart from the top.  If the
		* cursor cannot be re-initialized that error is what we return.
		*/
	432	if (error == EDEADLK) {
	433	hammer_done_cursor(cursor);
	434	error = hammer_init_cursor(trans, cursor,
	435	&ip->cache[0], ip);
	436	if (error == 0)
	437	goto retry;
	438	}
	439	}
	440
	441	/*
	442	* Ok, write out the initial record or a new record (after deleting
	443	* the old one), unless the DELETED flag is set. This routine will
	444	* clear DELONDISK if it writes out a record.
	445	*
	446	* Update our inode statistics if this is the first application of
	447	* the inode on-disk.
	448	*/
	449	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
	450	/*
	451	* Generate a record and write it to the media.  The record's
	452	* data pointer refers directly to ip->sync_ino_data; it is not copied.
	453	*/
	453	record = hammer_alloc_mem_record(ip);
	454	record->flush_state = HAMMER_FST_FLUSH;
	455	record->rec.inode = ip->sync_ino_rec;
	456	record->rec.inode.base.base.create_tid = trans->tid;
	457	record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
	458	record->data = (void *)&ip->sync_ino_data;
	459	record->flags \|= HAMMER_RECF_INTERLOCK_BE;
		/* Retry the sync with a fresh cursor for as long as it deadlocks. */
	460	for (;;) {
	461	error = hammer_ip_sync_record_cursor(cursor, record);
	462	if (error != EDEADLK)
	463	break;
	464	hammer_done_cursor(cursor);
	465	error = hammer_init_cursor(trans, cursor,
	466	&ip->cache[0], ip);
	467	if (error)
	468	break;
	469	}
	470	if (error) {
	471	kprintf("error %d\n", error);
	472	Debugger("hammer_update_inode3");
	473	}
	474
	475	/*
	476	* The record isn't managed by the inode's record tree,
	477	* destroy it whether we succeed or fail.
	478	*/
	479	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	480	record->flags \|= HAMMER_RECF_DELETED_FE;
	481	record->flush_state = HAMMER_FST_IDLE;
	482	hammer_rel_mem_record(record);
	483
	484	/*
	485	* Finish up: the snapshot is now on the media, so its dirty bits
	486	* and the DELONDISK marker can be cleared.
	487	*/
	487	if (error == 0) {
	488	ip->sync_flags &= ~(HAMMER_INODE_RDIRTY \|
	489	HAMMER_INODE_DDIRTY \|
	490	HAMMER_INODE_ITIMES);
	491	ip->flags &= ~HAMMER_INODE_DELONDISK;
	492
	493	/*
	494	* Root volume count of inodes: bumped only the first time
	495	* this inode reaches the media (ONDISK not yet set).
	496	*/
	496	if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
	497	hammer_modify_volume(trans, trans->rootvol,
	498	NULL, 0);
	499	++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
	500	hammer_modify_volume_done(trans->rootvol);
	501	ip->flags \|= HAMMER_INODE_ONDISK;
	502	}
	503	}
	504	}
	505
	506	/*
	507	* If the inode has been destroyed, clean out any left-over flags
	508	* that may have been set by the frontend.
	509	*/
	510	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
	511	ip->sync_flags &= ~(HAMMER_INODE_RDIRTY \|
	512	HAMMER_INODE_DDIRTY \|
	513	HAMMER_INODE_ITIMES);
	514	}
	515	return(error);
	516	}
	517
	518	/*
	519	* Update only the itimes fields. This is done no-historically. The
	520	* record is updated in-place on the disk.
	521	*/
	522	static int
	523	hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
	524	{
	525	hammer_transaction_t trans = cursor->trans;
	526	struct hammer_inode_record *rec;
	527	int error;
	528
	529	retry:
	530	error = 0;
	531	if ((ip->flags & (HAMMER_INODE_ONDISK\|HAMMER_INODE_DELONDISK)) ==
	532	HAMMER_INODE_ONDISK) {
	533	hammer_normalize_cursor(cursor);
	534	cursor->key_beg.obj_id = ip->obj_id;
	535	cursor->key_beg.key = 0;
	536	cursor->key_beg.create_tid = 0;
	537	cursor->key_beg.delete_tid = 0;
	538	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	539	cursor->key_beg.obj_type = 0;
	540	cursor->asof = ip->obj_asof;
	541	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	542	cursor->flags \|= HAMMER_CURSOR_GET_RECORD \| HAMMER_CURSOR_ASOF;
	543	cursor->flags \|= HAMMER_CURSOR_BACKEND;
	544
	545	error = hammer_btree_lookup(cursor);
	546	if (error) {
	547	kprintf("error %d\n", error);
	548	Debugger("hammer_update_itimes1");
	549	}
	550	if (error == 0) {
	551	/*
	552	* Do not generate UNDO records for atime/mtime
	553	* updates.
	554	*/
	555	rec = &cursor->record->inode;
	556	hammer_modify_buffer(trans, cursor->record_buffer,
	557	NULL, 0);
	558	rec->ino_atime = ip->sync_ino_rec.ino_atime;
	559	rec->ino_mtime = ip->sync_ino_rec.ino_mtime;
	560	hammer_modify_buffer_done(cursor->record_buffer);
	561	ip->sync_flags &= ~HAMMER_INODE_ITIMES;
	562	/* XXX recalculate crc */
	563	hammer_cache_node(cursor->node, &ip->cache[0]);
	564	}
	565	if (error == EDEADLK) {
	566	hammer_done_cursor(cursor);
	567	error = hammer_init_cursor(trans, cursor,
	568	&ip->cache[0], ip);
	569	if (error == 0)
	570	goto retry;
	571	}
	572	}
	573	return(error);
	574	}
	575
	576	/*
	577	* Release a reference on an inode, flush as requested.
	578	*
	579	* On the last reference we queue the inode to the flusher for its final
	580	* disposition.
	581	*/
	582	void
	583	hammer_rel_inode(struct hammer_inode *ip, int flush)
	584	{
	585	hammer_mount_t hmp = ip->hmp;
	586
	587	/*
	588	* Handle disposition when dropping the last ref.
	589	*/
	590	for (;;) {
	591	if (ip->lock.refs == 1) {
	592	/*
	593	* Determine whether on-disk action is needed for
	594	* the inode's final disposition.
	595	*/
	596	hammer_inode_unloadable_check(ip);
	597	if (ip->flags & HAMMER_INODE_MODMASK) {
	598	hammer_flush_inode(ip, 0);
	599	} else if (ip->lock.refs == 1) {
	600	hammer_unload_inode(ip);
	601	break;
	602	}
	603	} else {
	604	if (flush)
	605	hammer_flush_inode(ip, 0);
	606
	607	/*
	608	* The inode still has multiple refs, try to drop
	609	* one ref.
	610	*/
	611	KKASSERT(ip->lock.refs >= 1);
	612	if (ip->lock.refs > 1) {
	613	hammer_unref(&ip->lock);
	614	break;
	615	}
	616	}
	617	}
	618
	619	/*
	620	* XXX bad hack until I add code to track inodes in SETUP. We
	621	* can queue a lot of inodes to the syncer but if we don't wake
	622	* it up the undo sets will be too large or too many unflushed
	623	* records will build up and blow our malloc limit.
	624	*/
	625	if (++hmp->reclaim_count > 256) {
	626	hmp->reclaim_count = 0;
	627	hammer_flusher_async(hmp);
	628	}
	629	}
	630
	631	/*
	632	* Unload and destroy the specified inode. Must be called with one remaining
	633	* reference. The reference is disposed of.
	634	*
	635	* This can only be called in the context of the flusher.
	636	*/
	637	static int
	638	hammer_unload_inode(struct hammer_inode *ip)
	639	{
	640	KASSERT(ip->lock.refs == 1,
	641	("hammer_unload_inode: %d refs\n", ip->lock.refs));
	642	KKASSERT(ip->vp == NULL);
	643	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	644	KKASSERT(ip->cursor_ip_refs == 0);
	645	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
	646
	647	KKASSERT(RB_EMPTY(&ip->rec_tree));
	648	KKASSERT(TAILQ_EMPTY(&ip->target_list));
	649	KKASSERT(TAILQ_EMPTY(&ip->bio_list));
	650	KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
	651
	652	RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);
	653
	654	hammer_uncache_node(&ip->cache[0]);
	655	hammer_uncache_node(&ip->cache[1]);
	656	if (ip->objid_cache)
	657	hammer_clear_objid(ip);
	658	--hammer_count_inodes;
	659	kfree(ip, M_HAMMER);
	660
	661	return(0);
	662	}
	663
	664	/*
	665	* A transaction has modified an inode, requiring updates as specified by
	666	* the passed flags.
	667	*
	668	* HAMMER_INODE_RDIRTY: Inode record has been updated
	669	* HAMMER_INODE_DDIRTY: Inode data has been updated
	670	* HAMMER_INODE_XDIRTY: Dirty in-memory records
	671	* HAMMER_INODE_BUFS: Dirty buffer cache buffers
	672	* HAMMER_INODE_DELETED: Inode record/data must be deleted
	673	* HAMMER_INODE_ITIMES: mtime/atime has been updated
	674	*/
	675	void
	676	hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
	677	{
	678	KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 \|\|
	679	(flags & (HAMMER_INODE_RDIRTY\|HAMMER_INODE_DDIRTY\|
	680	HAMMER_INODE_XDIRTY\|HAMMER_INODE_BUFS\|
	681	HAMMER_INODE_DELETED\|HAMMER_INODE_ITIMES)) == 0);
	682
	683	ip->flags \|= flags;
	684	}
	685
	686	/*
	687	* Request that an inode be flushed. This whole mess cannot block and may
	688	* recurse. Once requested HAMMER will attempt to actively flush it until
	689	* the flush can be done.
	690	*
	691	* The inode may already be flushing, or may be in a setup state. We can
	692	* place the inode in a flushing state if it is currently idle and flag it
	693	* to reflush if it is currently flushing.
	694	*/
	695	void
	696	hammer_flush_inode(hammer_inode_t ip, int flags)
	697	{
	698	hammer_record_t depend;
	699	int r, good;
	700
	701	/*
	702	* Trivial 'nothing to flush' case. If the inode is ina SETUP
	703	* state we have to put it back into an IDLE state so we can
	704	* drop the extra ref.
	705	*/
	706	if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
	707	if (ip->flush_state == HAMMER_FST_SETUP) {
	708	ip->flush_state = HAMMER_FST_IDLE;
	709	hammer_rel_inode(ip, 0);
	710	}
	711	return;
	712	}
	713
	714	/*
	715	* Our flush action will depend on the current state.
	716	*/
	717	switch(ip->flush_state) {
	718	case HAMMER_FST_IDLE:
	719	/*
	720	* We have no dependancies and can flush immediately. Some
	721	* our children may not be flushable so we have to re-test
	722	* with that additional knowledge.
	723	*/
	724	hammer_flush_inode_core(ip, flags);
	725	break;
	726	case HAMMER_FST_SETUP:
	727	/*
	728	* Recurse upwards through dependancies via target_list
	729	* and start their flusher actions going if possible.
	730	*
	731	* 'good' is our connectivity. -1 means we have none and
	732	* can't flush, 0 means there weren't any dependancies, and
	733	* 1 means we have good connectivity.
	734	*/
	735	good = 0;
	736	TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
	737	r = hammer_setup_parent_inodes(depend);
	738	if (r < 0 && good == 0)
	739	good = -1;
	740	if (r > 0)
	741	good = 1;
	742	}
	743
	744	/*
	745	* We can continue if good >= 0. Determine how many records
	746	* under our inode can be flushed (and mark them).
	747	*/
	748	if (good >= 0) {
	749	hammer_flush_inode_core(ip, flags);
	750	} else {
	751	ip->flags \|= HAMMER_INODE_REFLUSH;
	752	if (flags & HAMMER_FLUSH_SIGNAL) {
	753	ip->flags \|= HAMMER_INODE_RESIGNAL;
	754	hammer_flusher_async(ip->hmp);
	755	}
	756	}
	757	break;
	758	default:
	759	/*
	760	* We are already flushing, flag the inode to reflush
	761	* if needed after it completes its current flush.
	762	*/
	763	if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
	764	ip->flags \|= HAMMER_INODE_REFLUSH;
	765	if (flags & HAMMER_FLUSH_SIGNAL) {
	766	ip->flags \|= HAMMER_INODE_RESIGNAL;
	767	hammer_flusher_async(ip->hmp);
	768	}
	769	break;
	770	}
	771	}
	772
	773	/*
	774	* We are asked to recurse upwards and convert the record from SETUP
	775	* to FLUSH if possible. record->ip is a parent of the caller's inode,
	776	* and record->target_ip is the caller's inode.
		*
		* The function first recurses over record->ip's own target_list so
		* that the parent's dependencies are resolved before the parent is
		* placed into the flushing state.
	777	*
	778	* Return 1 if the record gives us connectivity
	779	*
	780	* Return 0 if the record is not relevant
	781	*
	782	* Return -1 if we can't resolve the dependency and there is no connectivity.
		* In that case HAMMER_INODE_REFLUSH is set on record->ip.
	783	*/
	784	static int
	785	hammer_setup_parent_inodes(hammer_record_t record)
	786	{
	787	hammer_mount_t hmp = record->ip->hmp;
	788	hammer_record_t depend;
	789	hammer_inode_t ip;
	790	int r, good;
	791
	792	KKASSERT(record->flush_state != HAMMER_FST_IDLE);
	793	ip = record->ip;
	794
	795	/*
	796	* If the record is already flushing, is it in our flush group?
	797	*
	798	* If it is in our flush group but it is a delete-on-disk, it
	799	* does not improve our connectivity (return 0), and if the
	800	* target inode is not trying to destroy itself we can't allow
	801	* the operation yet anyway (the second return -1).
		*
		* NOTE(review): the "second return -1" appears to describe the
		* check that is currently compiled out under #if 0 further down.
	802	*/
	803	if (record->flush_state == HAMMER_FST_FLUSH) {
	804	if (record->flush_group != hmp->flusher_next) {
	805	ip->flags \|= HAMMER_INODE_REFLUSH;
	806	return(-1);
	807	}
	808	if (record->type == HAMMER_MEM_RECORD_ADD)
	809	return(1);
	810	return(0);
	811	}
	812
	813	/*
	814	* It must be a setup record. Try to resolve the setup dependencies
	815	* by recursing upwards so we can place ip on the flush list.
	816	*/
	817	KKASSERT(record->flush_state == HAMMER_FST_SETUP);
	818
		/*
		* good: 1 once any dependency reports connectivity, -1 if one
		* failed and none succeeded, 0 if there were no dependencies.
		*/
	819	good = 0;
	820	TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
	821	r = hammer_setup_parent_inodes(depend);
	822	if (r < 0 && good == 0)
	823	good = -1;
	824	if (r > 0)
	825	good = 1;
	826	}
	827
	828	/*
	829	* We can't flush ip because it has no connectivity (XXX also check
	830	* nlinks for pre-existing connectivity!). Flag it so any resolution
	831	* recurses back down.
	832	*/
	833	if (good < 0) {
	834	ip->flags \|= HAMMER_INODE_REFLUSH;
	835	return(good);
	836	}
	837
	838	/*
	839	* We are go, place the parent inode in a flushing state so we can
	840	* place its record in a flushing state. Note that the parent
	841	* may already be flushing. The record must be in the same flush
	842	* group as the parent.
	843	*/
	844	if (ip->flush_state != HAMMER_FST_FLUSH)
	845	hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
	846	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
	847	KKASSERT(record->flush_state == HAMMER_FST_SETUP);
	848
		/*
		* The block below is compiled out.  It ends in a dangling "else"
		* so that, if re-enabled, it chains onto the "if" that follows
		* the #endif.
		*/
	849	#if 0
	850	if (record->type == HAMMER_MEM_RECORD_DEL &&
	851	(record->target_ip->flags & (HAMMER_INODE_DELETED\|HAMMER_INODE_DELONDISK)) == 0) {
	852	/*
	853	* Regardless of flushing state we cannot sync this path if the
	854	* record represents a delete-on-disk but the target inode
	855	* is not ready to sync its own deletion.
	856	*
	857	* XXX need to count effective nlinks to determine whether
	858	* the flush is ok, otherwise removing a hardlink will
	859	* just leave the DEL record to rot.
	860	*/
	861	record->target_ip->flags \|= HAMMER_INODE_REFLUSH;
	862	return(-1);
	863	} else
	864	#endif
	865	if (ip->flush_group == ip->hmp->flusher_next) {
	866	/*
	867	* This is the record we wanted to synchronize.  Move it into
	868	* the parent's flush group and take a reference on it.
	868	*/
	869	record->flush_state = HAMMER_FST_FLUSH;
	870	record->flush_group = ip->flush_group;
	871	hammer_ref(&record->lock);
	872	if (record->type == HAMMER_MEM_RECORD_ADD)
	873	return(1);
	874
	875	/*
	876	* The record is a delete-on-disk. It does not contribute
	877	* to our visibility. We can still flush it.
	878	*/
	879	return(0);
	880	} else {
	881	/*
	882	* We couldn't resolve the dependencies, request that the
	883	* inode be flushed when the dependencies can be resolved.
	884	*/
	885	ip->flags \|= HAMMER_INODE_REFLUSH;
	886	return(-1);
	887	}
	888	}
	889
	890	/*
	891	* This is the core routine placing an inode into the FST_FLUSH state.
		*
		* ip    - inode to flush; must not already be in FST_FLUSH (asserted).
		* flags - HAMMER_FLUSH_RECURSION skips the scan of the inode's own
		*         records; HAMMER_FLUSH_SIGNAL wakes the flusher.
		*
		* If nothing can be flushed the inode is put into FST_SETUP with
		* REFLUSH set and the routine returns early.  Otherwise the inode's
		* modification state is snapshotted into the sync_* fields and the
		* inode is appended to the mount's flush_list.
	892	*/
	893	static void
	894	hammer_flush_inode_core(hammer_inode_t ip, int flags)
	895	{
	896	int go_count;
	897
	898	/*
	899	* Set flush state and prevent the flusher from cycling into
	900	* the next flush group. Do not place the ip on the list yet.
	901	* Inodes not in the idle state get an extra reference, so the
		* reference is taken here when the inode is leaving FST_IDLE.
		* flusher_lock is held (incremented) until we are done below.
	902	*/
	903	KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
	904	if (ip->flush_state == HAMMER_FST_IDLE)
	905	hammer_ref(&ip->lock);
	906	ip->flush_state = HAMMER_FST_FLUSH;
	907	ip->flush_group = ip->hmp->flusher_next;
	908	++ip->hmp->flusher_lock;
	909
	910	/*
	911	* Figure out how many in-memory records we can actually flush
	912	* (not including inode meta-data, buffers, etc).  A recursive
		* call is treated as having one flushable record without scanning.
	913	*/
	914	if (flags & HAMMER_FLUSH_RECURSION) {
	915	go_count = 1;
	916	} else {
	917	go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
	918	hammer_setup_child_callback, NULL);
	919	}
	920
	921	/*
	922	* This is a more involved test that includes go_count. If we
	923	* can't flush, flag the inode and return. If go_count is 0 we
	924	* were unable to flush any records in our rec_tree and
	925	* must ignore the XDIRTY flag.
	926	*/
	927	if (go_count == 0) {
	928	if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
	929	ip->flags \|= HAMMER_INODE_REFLUSH;
	930	ip->flush_state = HAMMER_FST_SETUP;
	931	if (flags & HAMMER_FLUSH_SIGNAL) {
	932	ip->flags \|= HAMMER_INODE_RESIGNAL;
	933	hammer_flusher_async(ip->hmp);
	934	}
		/* Release flusher_lock and wake any waiter on the last release. */
	935	if (--ip->hmp->flusher_lock == 0)
	936	wakeup(&ip->hmp->flusher_lock);
	937	return;
	938	}
	939	}
	940
	941	#if 0
	942	/*
	943	* XXX - don't sync the buffer cache on the frontend, the backend
	944	* will do it and we do not want to prematurely activate the backend.
	945	*
	946	* Sync the buffer cache if the caller wants to flush now, otherwise
	947	* don't (any write bios will wake up the flusher).
	948	*/
	949	if ((flags & HAMMER_FLUSH_RECURSION) == 0 &&
	950	(flags & HAMMER_FLUSH_SIGNAL)) {
	951	if (ip->vp != NULL)
	952	error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
	953	else
	954	error = 0;
	955	}
	956
	957	/*
	958	* Any further strategy calls will go into the inode's alternative
	959	* bioq.
	960	*/
	961	ip->flags \|= HAMMER_INODE_WRITE_ALT;
	962	#endif
	963
	964	/*
	965	* Snapshot the state of the inode for the backend flusher.
	966	*
	967	* The truncation must be retained in the frontend until after
	968	* we've actually performed the record deletion.
	969	*
	970	* NOTE: The DELETING flag is a mod flag, but it is also sticky,
	971	* and stays in ip->flags. Once set, it stays set until the
	972	* inode is destroyed.
		*
		* In the final statement below '~' binds tighter than '\|', so the
		* mask is (~MODMASK) \| TRUNCATED: every MODMASK bit is cleared
		* from ip->flags except HAMMER_INODE_TRUNCATED.
	973	*/
	974	ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
	975	ip->sync_trunc_off = ip->trunc_off;
	976	ip->sync_ino_rec = ip->ino_rec;
	977	ip->sync_ino_data = ip->ino_data;
	978	ip->flags &= ~HAMMER_INODE_MODMASK \| HAMMER_INODE_TRUNCATED;
	979
	980	/*
	981	* The flusher list inherits our inode and reference.
	982	*/
	983	TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
	984	if (--ip->hmp->flusher_lock == 0)
	985	wakeup(&ip->hmp->flusher_lock);
	986
	987	if (flags & HAMMER_FLUSH_SIGNAL)
	988	hammer_flusher_async(ip->hmp);
	989	}
	990
	991	/*
	992	* Callback for scan of ip->rec_tree. Try to include each record in our
	993	* flush. ip->flush_group has been set but the inode has not yet been
	994	* moved into a flushing state.
	995	*
	996	* If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
	997	* both inodes.
	998	*
	999	* We return 1 for any record placed or found in FST_FLUSH, which prevents
	1000	* the caller from shortcutting the flush.
	1001	*/
	1002	static int
	1003	hammer_setup_child_callback(hammer_record_t rec, void *data)
	1004	{
	1005	hammer_inode_t target_ip;
	1006	hammer_inode_t ip;
	1007	int r;
	1008
	1009	/*
	1010	* If the record has been deleted by the backend (it's being held
	1011	* by the frontend in a race), just ignore it.
	1012	*/
	1013	if (rec->flags & HAMMER_RECF_DELETED_BE)
	1014	return(0);
	1015
	1016	/*
	1017	* If the record is in an idle state it has no dependancies and
	1018	* can be flushed.
	1019	*/
	1020	ip = rec->ip;
	1021	r = 0;
	1022
	1023	switch(rec->flush_state) {
	1024	case HAMMER_FST_IDLE:
	1025	/*
	1026	* Record has no setup dependancy, we can flush it.
	1027	*/
	1028	KKASSERT(rec->target_ip == NULL);
	1029	rec->flush_state = HAMMER_FST_FLUSH;
	1030	rec->flush_group = ip->flush_group;
	1031	hammer_ref(&rec->lock);
	1032	r = 1;
	1033	break;
	1034	case HAMMER_FST_SETUP:
	1035	/*
	1036	* Record has a setup dependancy. Try to include the
	1037	* target ip in the flush.
	1038	*
	1039	* We have to be careful here, if we do not do the right
	1040	* thing we can lose track of dirty inodes and the system
	1041	* will lockup trying to allocate buffers.
	1042	*/
	1043	target_ip = rec->target_ip;
	1044	KKASSERT(target_ip != NULL);
	1045	KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
	1046	if (target_ip->flush_state == HAMMER_FST_FLUSH) {
	1047	/*
	1048	* If the target IP is already flushing in our group
	1049	* we are golden, otherwise make sure the target
	1050	* reflushes.
	1051	*/
	1052	if (target_ip->flush_group == ip->flush_group) {
	1053	rec->flush_state = HAMMER_FST_FLUSH;
	1054	rec->flush_group = ip->flush_group;
	1055	hammer_ref(&rec->lock);
	1056	r = 1;
	1057	} else {
	1058	target_ip->flags \|= HAMMER_INODE_REFLUSH;
	1059	}
	1060	} else if (rec->type == HAMMER_MEM_RECORD_ADD) {
	1061	/*
	1062	* If the target IP is not flushing we can force
	1063	* it to flush, even if it is unable to write out
	1064	* any of its own records we have at least one in
	1065	* hand that we CAN deal with.
	1066	*/
	1067	rec->flush_state = HAMMER_FST_FLUSH;
	1068	rec->flush_group = ip->flush_group;
	1069	hammer_ref(&rec->lock);
	1070	hammer_flush_inode_core(target_ip,
	1071	HAMMER_FLUSH_RECURSION);
	1072	r = 1;
	1073	} else {
	1074	/*
	1075	* XXX this needs help. We have a delete-on-disk
	1076	* which could disconnect the target. If the target
	1077	* has its own dependancies they really need to
	1078	* be flushed.
	1079	*
	1080	* XXX
	1081	*/
	1082	rec->flush_state = HAMMER_FST_FLUSH;
	1083	rec->flush_group = ip->flush_group;
	1084	hammer_ref(&rec->lock);
	1085	hammer_flush_inode_core(target_ip,
	1086	HAMMER_FLUSH_RECURSION);
	1087	r = 1;
	1088	}
	1089	break;
	1090	case HAMMER_FST_FLUSH:
	1091	/*
	1092	* Record already associated with a flush group. It had
	1093	* better be ours.
	1094	*/
	1095	KKASSERT(rec->flush_group == ip->flush_group);
	1096	r = 1;
	1097	break;
	1098	}
	1099	return(r);
	1100	}
	1101
	1102	/*
	1103	* Wait for a previously queued flush to complete
	1104	*/
	1105	void
	1106	hammer_wait_inode(hammer_inode_t ip)
	1107	{
	1108	while (ip->flush_state == HAMMER_FST_FLUSH) {
	1109	ip->flags \|= HAMMER_INODE_FLUSHW;
	1110	tsleep(&ip->flags, 0, "hmrwin", 0);
	1111	}
	1112	}
	1113
	1114	/*
	1115	* Called by the backend code when a flush has been completed.
	1116	* The inode has already been removed from the flush list.
	1117	*
	1118	* A pipelined flush can occur, in which case we must re-enter the
	1119	* inode on the list and re-copy its fields.
	1120	*/
	1121	void
	1122	hammer_flush_inode_done(hammer_inode_t ip)
	1123	{
	1124	struct bio *bio;
	1125	int dorel = 0;
	1126
	1127	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
	1128
	1129	/*
	1130	* Allow BIOs to queue to the inode's primary bioq again.
	1131	*/
	1132	ip->flags &= ~HAMMER_INODE_WRITE_ALT;
	1133
	1134	/*
	1135	* Merge left-over flags back into the frontend and fix the state.
	1136	*/
	1137	ip->flags \|= ip->sync_flags;
	1138
	1139	/*
	1140	* The backend may have adjusted nlinks, so if the adjusted nlinks
	1141	* does not match the fronttend set the frontend's RDIRTY flag again.
	1142	*/
	1143	if (ip->ino_rec.ino_nlinks != ip->sync_ino_rec.ino_nlinks)
	1144	ip->flags \|= HAMMER_INODE_RDIRTY;
	1145
	1146	/*
	1147	* Reflush any BIOs that wound up in the alt list. Our inode will
	1148	* also wind up at the end of the flusher's list.
	1149	*/
	1150	while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
	1151	TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
	1152	TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
	1153	}
	1154	/*
	1155	* Fix up the dirty buffer status.
	1156	*/
	1157	if (TAILQ_FIRST(&ip->bio_list) \|\|
	1158	(ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree))) {
	1159	ip->flags \|= HAMMER_INODE_BUFS;
	1160	}
	1161
	1162	/*
	1163	* Re-set the XDIRTY flag if some of the inode's in-memory records
	1164	* could not be flushed.
	1165	*/
	1166	if (RB_ROOT(&ip->rec_tree))
	1167	ip->flags \|= HAMMER_INODE_XDIRTY;
	1168
	1169	/*
	1170	* Do not lose track of inodes which no longer have vnode
	1171	* assocations, otherwise they may never get flushed again.
	1172	*/
	1173	if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
	1174	ip->flags \|= HAMMER_INODE_REFLUSH;
	1175
	1176	/*
	1177	* Adjust flush_state. The target state (idle or setup) shouldn't
	1178	* be terribly important since we will reflush if we really need
	1179	* to do anything. XXX
	1180	*/
	1181	if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
	1182	ip->flush_state = HAMMER_FST_IDLE;
	1183	dorel = 1;
	1184	} else {
	1185	ip->flush_state = HAMMER_FST_SETUP;
	1186	}
	1187
	1188	/*
	1189	* If the frontend made more changes and requested another flush,
	1190	* then try to get it running.
	1191	*/
	1192	if (ip->flags & HAMMER_INODE_REFLUSH) {
	1193	ip->flags &= ~HAMMER_INODE_REFLUSH;
	1194	if (ip->flags & HAMMER_INODE_RESIGNAL) {
	1195	ip->flags &= ~HAMMER_INODE_RESIGNAL;
	1196	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	1197	} else {
	1198	hammer_flush_inode(ip, 0);
	1199	}
	1200	}
	1201
	1202	/*
	1203	* Finally, if the frontend is waiting for a flush to complete,
	1204	* wake it up.
	1205	*/
	1206	if (ip->flush_state != HAMMER_FST_FLUSH) {
	1207	if (ip->flags & HAMMER_INODE_FLUSHW) {
	1208	ip->flags &= ~HAMMER_INODE_FLUSHW;
	1209	wakeup(&ip->flags);
	1210	}
	1211	}
	1212	if (dorel)
	1213	hammer_rel_inode(ip, 0);
	1214	}
	1215
	1216	/*
	1217	* Called from hammer_sync_inode() to synchronize in-memory records
	1218	* to the media.
	1219	*/
	1220	static int
	1221	hammer_sync_record_callback(hammer_record_t record, void *data)
	1222	{
	1223	hammer_cursor_t cursor = data;
	1224	hammer_transaction_t trans = cursor->trans;
	1225	int error;
	1226
	1227	/*
	1228	* Skip records that do not belong to the current flush.
	1229	*/
	1230	if (record->flush_state != HAMMER_FST_FLUSH)
	1231	return(0);
	1232	KKASSERT((record->flags & HAMMER_RECF_DELETED_BE) == 0);
	1233	#if 1
	1234	if (record->flush_group != record->ip->flush_group) {
	1235	kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
	1236	Debugger("blah2");
	1237	return(0);
	1238	}
	1239	#endif
	1240	KKASSERT(record->flush_group == record->ip->flush_group);
	1241
	1242	/*
	1243	* Interlock the record using the BE flag. Once BE is set the
	1244	* frontend cannot change the state of FE.
	1245	*
	1246	* NOTE: If FE is set prior to us setting BE we still sync the
	1247	* record out, but the flush completion code converts it to
	1248	* a delete-on-disk record instead of destroying it.
	1249	*/
	1250	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	1251	record->flags \|= HAMMER_RECF_INTERLOCK_BE;
	1252
	1253	/*
	1254	* If DELETED_FE is set we may have already sent dependant pieces
	1255	* to the disk and we must flush the record as if it hadn't been
	1256	* deleted. This creates a bit of a mess because we have to
	1257	* have ip_sync_record convert the record to MEM_RECORD_DEL before
	1258	* it inserts the B-Tree record. Otherwise the media sync might
	1259	* be visible to the frontend.
	1260	*/
	1261	if (record->flags & HAMMER_RECF_DELETED_FE) {
	1262	KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
	1263	record->flags \|= HAMMER_RECF_CONVERT_DELETE;
	1264	}
	1265
	1266	/*
	1267	* Assign the create_tid for new records. Deletions already
	1268	* have the record's entire key properly set up.
	1269	*/
	1270	if (record->type != HAMMER_MEM_RECORD_DEL)
	1271	record->rec.inode.base.base.create_tid = trans->tid;
	1272	for (;;) {
	1273	error = hammer_ip_sync_record_cursor(cursor, record);
	1274	if (error != EDEADLK)
	1275	break;
	1276	hammer_done_cursor(cursor);
	1277	error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
	1278	record->ip);
	1279	if (error)
	1280	break;
	1281	}
	1282	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
	1283
	1284	if (error) {
	1285	error = -error;
	1286	if (error != -ENOSPC) {
	1287	kprintf("hammer_sync_record_callback: sync failed rec "
	1288	"%p, error %d\n", record, error);
	1289	Debugger("sync failed rec");
	1290	}
	1291	}
	1292	hammer_flush_record_done(record, error);
	1293	return(error);
	1294	}
	1295
	1296	/*
	1297	* XXX error handling
	1298	*/
	1299	int
	1300	hammer_sync_inode(hammer_inode_t ip)
	1301	{
	1302	struct hammer_transaction trans;
	1303	struct hammer_cursor cursor;
	1304	struct bio *bio;
	1305	hammer_record_t depend;
	1306	hammer_record_t next;
	1307	int error, tmp_error;
	1308	u_int64_t nlinks;
	1309
	1310	if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
	1311	return(0);
	1312
	1313	hammer_start_transaction_fls(&trans, ip->hmp);
	1314	error = hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
	1315	if (error)
	1316	goto done;
	1317
	1318	/*
	1319	* Any directory records referencing this inode which are not in
	1320	* our current flush group must adjust our nlink count for the
	1321	* purposes of synchronization to disk.
	1322	*
	1323	* Records which are in our flush group can be unlinked from our
	1324	* inode now, allowing the inode to be physically deleted.
	1325	*/
	1326	nlinks = ip->ino_rec.ino_nlinks;
	1327	next = TAILQ_FIRST(&ip->target_list);
	1328	while ((depend = next) != NULL) {
	1329	next = TAILQ_NEXT(depend, target_entry);
	1330	if (depend->flush_state == HAMMER_FST_FLUSH &&
	1331	depend->flush_group == ip->hmp->flusher_act) {
	1332	TAILQ_REMOVE(&ip->target_list, depend, target_entry);
	1333	depend->target_ip = NULL;
	1334	/* no need to signal target_ip, it is us */
	1335	} else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
	1336	switch(depend->type) {
	1337	case HAMMER_MEM_RECORD_ADD:
	1338	--nlinks;
	1339	break;
	1340	case HAMMER_MEM_RECORD_DEL:
	1341	++nlinks;
	1342	break;
	1343	}
	1344	}
	1345	}
	1346
	1347	/*
	1348	* Set dirty if we had to modify the link count.
	1349	*/
	1350	if (ip->sync_ino_rec.ino_nlinks != nlinks) {
	1351	KKASSERT((int64_t)nlinks >= 0);
	1352	ip->sync_ino_rec.ino_nlinks = nlinks;
	1353	ip->sync_flags \|= HAMMER_INODE_RDIRTY;
	1354	}
	1355
	1356	/*
	1357	* Queue up any pending dirty buffers then set a flag to cause
	1358	* any further BIOs to go to the alternative queue.
	1359	*/
	1360	if (ip->vp)
	1361	error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
	1362	ip->flags \|= HAMMER_INODE_WRITE_ALT;
	1363
	1364	/*
	1365	* The buffer cache may contain dirty buffers beyond the inode
	1366	* state we copied from the frontend to the backend. Because
	1367	* we are syncing our buffer cache on the backend, resync
	1368	* the truncation point and the file size so we don't wipe out
	1369	* any data.
	1370	*
	1371	* Syncing the buffer cache on the frontend has serious problems
	1372	* because it prevents us from passively queueing dirty inodes
	1373	* to the backend (the BIO's could stall indefinitely).
	1374	*/
	1375	if (ip->flags & HAMMER_INODE_TRUNCATED) {
	1376	ip->sync_trunc_off = ip->trunc_off;
	1377	ip->sync_flags \|= HAMMER_INODE_TRUNCATED;
	1378	}
	1379	if (ip->sync_ino_rec.ino_size != ip->ino_rec.ino_size) {
	1380	ip->sync_ino_rec.ino_size = ip->ino_rec.ino_size;
	1381	ip->sync_flags \|= HAMMER_INODE_RDIRTY;
	1382	}
	1383
	1384	/*
	1385	* If there is a trunction queued destroy any data past the (aligned)
	1386	* truncation point. Userland will have dealt with the buffer
	1387	* containing the truncation point for us.
	1388	*
	1389	* We don't flush pending frontend data buffers until after we've
	1390	* dealth with the truncation.
	1391	*
	1392	* Don't bother if the inode is or has been deleted.
	1393	*/
	1394	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
	1395	/*
	1396	* Interlock trunc_off. The VOP front-end may continue to
	1397	* make adjustments to it while we are blocked.
	1398	*/
	1399	off_t trunc_off;
	1400	off_t aligned_trunc_off;
	1401
	1402	trunc_off = ip->sync_trunc_off;
	1403	aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
	1404	~HAMMER_BUFMASK64;
	1405
	1406	/*
	1407	* Delete any whole blocks on-media. The front-end has
	1408	* already cleaned out any partial block and made it
	1409	* pending. The front-end may have updated trunc_off
	1410	* while we were blocked so do not just unconditionally
	1411	* set it to the maximum offset.
	1412	*/
	1413	error = hammer_ip_delete_range(&cursor, ip,
	1414	aligned_trunc_off,
	1415	0x7FFFFFFFFFFFFFFFLL);
	1416	if (error)
	1417	Debugger("hammer_ip_delete_range errored");
	1418	ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
	1419	if (ip->trunc_off >= trunc_off) {
	1420	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	1421	ip->flags &= ~HAMMER_INODE_TRUNCATED;
	1422	}
	1423	} else {
	1424	error = 0;
	1425	}
	1426
	1427	/*
	1428	* Now sync related records. These will typically be directory
	1429	* entries or delete-on-disk records.
	1430	*
	1431	* Not all records will be flushed, but clear XDIRTY anyway. We
	1432	* will set it again in the frontend hammer_flush_inode_done()
	1433	* if records remain.
	1434	*/
	1435	if (error == 0) {
	1436	tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
	1437	hammer_sync_record_callback, &cursor);
	1438	if (tmp_error < 0)
	1439	tmp_error = -error;
	1440	if (tmp_error)
	1441	error = tmp_error;
	1442	if (error == 0)
	1443	ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
	1444	}
	1445
	1446	/*
	1447	* If we are deleting the inode the frontend had better not have
	1448	* any active references on elements making up the inode.
	1449	*/
	1450	if (error == 0 && ip->sync_ino_rec.ino_nlinks == 0 &&
	1451	RB_EMPTY(&ip->rec_tree) &&
	1452	(ip->sync_flags & HAMMER_INODE_DELETING) &&
	1453	(ip->flags & HAMMER_INODE_DELETED) == 0) {
	1454	int count1 = 0;
	1455
	1456	kprintf("Y");
	1457	ip->flags \|= HAMMER_INODE_DELETED;
	1458	error = hammer_ip_delete_range_all(&cursor, ip, &count1);
	1459	if (error == 0) {
	1460	ip->sync_flags &= ~HAMMER_INODE_DELETING;
	1461	ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
	1462	KKASSERT(RB_EMPTY(&ip->rec_tree));
	1463
	1464	/*
	1465	* Set delete_tid in both the frontend and backend
	1466	* copy of the inode record. The DELETED flag handles
	1467	* this, do not set RDIRTY.
	1468	*/
	1469	ip->ino_rec.base.base.delete_tid = trans.tid;
	1470	ip->sync_ino_rec.base.base.delete_tid = trans.tid;
	1471
	1472	/*
	1473	* Adjust the inode count in the volume header
	1474	*/
	1475	hammer_modify_volume(&trans, trans.rootvol, NULL, 0);
	1476	--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
	1477	hammer_modify_volume_done(trans.rootvol);
	1478	} else {
	1479	ip->flags &= ~HAMMER_INODE_DELETED;
	1480	Debugger("hammer_ip_delete_range_all errored");
	1481	}
	1482	}
	1483
	1484	/*
	1485	* Flush any queued BIOs. These will just biodone() the IO's if
	1486	* the inode has been deleted.
	1487	*/
	1488	while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
	1489	TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
	1490	tmp_error = hammer_dowrite(&cursor, ip, bio);
	1491	if (tmp_error)
	1492	error = tmp_error;
	1493	}
	1494	ip->sync_flags &= ~HAMMER_INODE_BUFS;
	1495
	1496	if (error)
	1497	Debugger("RB_SCAN errored");
	1498
	1499	/*
	1500	* Now update the inode's on-disk inode-data and/or on-disk record.
	1501	* DELETED and ONDISK are managed only in ip->flags.
	1502	*/
	1503	switch(ip->flags & (HAMMER_INODE_DELETED \| HAMMER_INODE_ONDISK)) {
	1504	case HAMMER_INODE_DELETED\|HAMMER_INODE_ONDISK:
	1505	/*
	1506	* If deleted and on-disk, don't set any additional flags.
	1507	* the delete flag takes care of things.
	1508	*
	1509	* Clear flags which may have been set by the frontend.
	1510	*/
	1511	ip->sync_flags &= ~(HAMMER_INODE_RDIRTY\|HAMMER_INODE_DDIRTY\|
	1512	HAMMER_INODE_XDIRTY\|HAMMER_INODE_ITIMES\|
	1513	HAMMER_INODE_DELETING);
	1514	break;
	1515	case HAMMER_INODE_DELETED:
	1516	/*
	1517	* Take care of the case where a deleted inode was never
	1518	* flushed to the disk in the first place.
	1519	*
	1520	* Clear flags which may have been set by the frontend.
	1521	*/
	1522	ip->sync_flags &= ~(HAMMER_INODE_RDIRTY\|HAMMER_INODE_DDIRTY\|
	1523	HAMMER_INODE_XDIRTY\|HAMMER_INODE_ITIMES\|
	1524	HAMMER_INODE_DELETING);
	1525	while (RB_ROOT(&ip->rec_tree)) {
	1526	hammer_record_t record = RB_ROOT(&ip->rec_tree);
	1527	hammer_ref(&record->lock);
	1528	KKASSERT(record->lock.refs == 1);
	1529	record->flags \|= HAMMER_RECF_DELETED_FE;
	1530	record->flags \|= HAMMER_RECF_DELETED_BE;
	1531	hammer_rel_mem_record(record);
	1532	}
	1533	break;
	1534	case HAMMER_INODE_ONDISK:
	1535	/*
	1536	* If already on-disk, do not set any additional flags.
	1537	*/
	1538	break;
	1539	default:
	1540	/*
	1541	* If not on-disk and not deleted, set both dirty flags
	1542	* to force an initial record to be written. Also set
	1543	* the create_tid for the inode.
	1544	*
	1545	* Set create_tid in both the frontend and backend
	1546	* copy of the inode record.
	1547	*/
	1548	ip->ino_rec.base.base.create_tid = trans.tid;
	1549	ip->sync_ino_rec.base.base.create_tid = trans.tid;
	1550	ip->sync_flags \|= HAMMER_INODE_RDIRTY \| HAMMER_INODE_DDIRTY;
	1551	break;
	1552	}
	1553
	1554	/*
	1555	* If RDIRTY or DDIRTY is set, write out a new record. If the inode
	1556	* is already on-disk the old record is marked as deleted.
	1557	*
	1558	* If DELETED is set hammer_update_inode() will delete the existing
	1559	* record without writing out a new one.
	1560	*
	1561	* If ONLY the ITIMES flag is set we can update the record in-place.
	1562	*/
	1563	if (ip->flags & HAMMER_INODE_DELETED) {
	1564	error = hammer_update_inode(&cursor, ip);
	1565	} else
	1566	if ((ip->sync_flags & (HAMMER_INODE_RDIRTY \| HAMMER_INODE_DDIRTY \|
	1567	HAMMER_INODE_ITIMES)) == HAMMER_INODE_ITIMES) {
	1568	error = hammer_update_itimes(&cursor, ip);
	1569	} else
	1570	if (ip->sync_flags & (HAMMER_INODE_RDIRTY \| HAMMER_INODE_DDIRTY \|
	1571	HAMMER_INODE_ITIMES)) {
	1572	error = hammer_update_inode(&cursor, ip);
	1573	}
	1574	if (error)
	1575	Debugger("hammer_update_itimes/inode errored");
	1576	done:
	1577	/*
	1578	* Save the TID we used to sync the inode with to make sure we
	1579	* do not improperly reuse it.
	1580	*/
	1581	hammer_done_cursor(&cursor);
	1582	hammer_done_transaction(&trans);
	1583	return(error);
	1584	}
	1585
	1586	/*
	1587	* This routine is called when the OS is no longer actively referencing
	1588	* the inode (but might still be keeping it cached), or when releasing
	1589	* the last reference to an inode.
	1590	*
	1591	* At this point if the inode's nlinks count is zero we want to destroy
	1592	* it, which may mean destroying it on-media too.
	1593	*/
	1594	static void
	1595	hammer_inode_unloadable_check(hammer_inode_t ip)
	1596	{
	1597	/*
	1598	* If the inode is on-media and the link count is 0 we MUST delete
	1599	* it on-media. DELETING is a mod flag, DELETED is a state flag.
	1600	*/
	1601	if (ip->ino_rec.ino_nlinks == 0 &&
	1602	(ip->flags & (HAMMER_INODE_DELETING\|HAMMER_INODE_DELETED)) == 0) {
	1603	if (ip->vp) {
	1604	vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
	1605	vnode_pager_setsize(ip->vp, 0);
	1606	}
	1607	ip->flags \|= HAMMER_INODE_DELETING;
	1608	ip->flags \|= HAMMER_INODE_TRUNCATED;
	1609	ip->trunc_off = 0;
	1610	}
	1611	}
	1612
	1613	void
	1614	hammer_test_inode(hammer_inode_t ip)
	1615	{
	1616	if (ip->flags & HAMMER_INODE_REFLUSH) {
	1617	ip->flags &= ~HAMMER_INODE_REFLUSH;
	1618	hammer_ref(&ip->lock);
	1619	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	1620	hammer_rel_inode(ip, 0);
	1621	}
	1622	}
	1623