/*
 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.51 2008/05/04 19:57:42 dillon Exp $
 */

#include "hammer.h"
#include <vm/vm_extern.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static int hammer_unload_inode(struct hammer_inode *ip);
static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
static int hammer_setup_child_callback(hammer_record_t rec, void *data);
static int hammer_setup_parent_inodes(hammer_record_t record);

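/*
 * In-memory HAMMER inodes move through three flush states:
 *
 *      HAMMER_FST_IDLE  - No flush dependencies; not queued to the
 *                         flusher.
 *      HAMMER_FST_SETUP - The inode has dependencies: records on its
 *                         target_list (directory entries owned by parent
 *                         directories which reference it) and/or
 *                         unflushed in-memory records in its rec_tree.
 *      HAMMER_FST_FLUSH - The inode's state has been snapshotted into
 *                         the sync_* fields and the inode is queued on
 *                         hmp->flush_list for the backend flusher.
 *
 * The frontend moves inodes from IDLE or SETUP into FLUSH via
 * hammer_flush_inode().  The backend moves them back to SETUP or IDLE
 * in hammer_flush_inode_done() when a flush completes.
 */
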
/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 *
 * This is called from the frontend.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        /*
         * Degenerate case
         */
        if (ip == NULL) {
                vrecycle(ap->a_vp);
                return(0);
        }

        /*
         * If the inode no longer has visibility in the filesystem and is
         * fairly clean, try to recycle it immediately.  This can deadlock
         * in vfsync() if we aren't careful.
         */
        hammer_inode_unloadable_check(ip, 0);
        if (ip->flags & HAMMER_INODE_MODMASK)
                hammer_flush_inode(ip, 0);
        else if (ip->ino_rec.ino_nlinks == 0)
                vrecycle(ap->a_vp);
        return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode.
 *
 * Once the association is lost we are on our own with regards to
 * flushing the inode.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
        struct hammer_inode *ip;
        struct vnode *vp;

        vp = ap->a_vp;

        if ((ip = vp->v_data) != NULL) {
                vp->v_data = NULL;
                ip->vp = NULL;
                hammer_rel_inode(ip, 1);
        }
        return(0);
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
 */
int
hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
{
        struct vnode *vp;
        int error = 0;

        for (;;) {
                if ((vp = ip->vp) == NULL) {
                        error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
                        if (error)
                                break;
                        vp = *vpp;      /* use the new vnode from here on */
                        hammer_lock_ex(&ip->lock);
                        if (ip->vp != NULL) {
                                hammer_unlock(&ip->lock);
                                vp->v_type = VBAD;
                                vx_put(vp);
                                continue;
                        }
                        hammer_ref(&ip->lock);
                        ip->vp = vp;
                        vp->v_type = hammer_get_vnode_type(
                                        ip->ino_rec.base.base.obj_type);

                        switch(ip->ino_rec.base.base.obj_type) {
                        case HAMMER_OBJTYPE_CDEV:
                        case HAMMER_OBJTYPE_BDEV:
                                vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
                                addaliasu(vp, ip->ino_data.rmajor,
                                          ip->ino_data.rminor);
                                break;
                        case HAMMER_OBJTYPE_FIFO:
                                vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
                                break;
                        default:
                                break;
                        }

                        /*
                         * Only mark as the root vnode if the ip is not
                         * historical, otherwise the VFS cache will get
                         * confused.  The other half of the special handling
                         * is in hammer_vop_nlookupdotdot().
                         */
                        if (ip->obj_id == HAMMER_OBJID_ROOT &&
                            ip->obj_asof == ip->hmp->asof) {
                                vp->v_flag |= VROOT;
                        }

                        vp->v_data = (void *)ip;
                        /* vnode locked by getnewvnode() */
                        /* make related vnode dirty if inode dirty? */
                        hammer_unlock(&ip->lock);
                        if (vp->v_type == VREG)
                                vinitvmio(vp, ip->ino_rec.ino_size);
                        break;
                }

                /*
                 * Loop if the vget fails (aka races), or if the vp
                 * no longer matches ip->vp.
                 */
                if (vget(vp, LK_EXCLUSIVE) == 0) {
                        if (vp == ip->vp)
                                break;
                        vput(vp);
                }
        }
        *vpp = vp;
        return(error);
}

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  This function
 * does not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied to newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
                 u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        struct hammer_inode_info iinfo;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;

        /*
         * Determine if we already have an inode cached.  If we do then
         * we are golden.
         */
        iinfo.obj_id = obj_id;
        iinfo.obj_asof = asof;
loop:
        ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
        if (ip) {
                hammer_ref(&ip->lock);
                *errorp = 0;
                return(ip);
        }

        ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ip->obj_id = obj_id;
        ip->obj_asof = iinfo.obj_asof;
        ip->hmp = hmp;
        ip->flags = flags & HAMMER_INODE_RO;
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->bio_list);
        TAILQ_INIT(&ip->bio_alt_list);
        TAILQ_INIT(&ip->target_list);

        /*
         * Locate the on-disk inode.
         */
retry:
        hammer_init_cursor(trans, &cursor, cache, NULL);
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = 0;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor.key_beg.obj_type = 0;
        cursor.flags = HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_GET_DATA |
                       HAMMER_CURSOR_ASOF;
        cursor.asof = iinfo.obj_asof;

        *errorp = hammer_btree_lookup(&cursor);
        if (*errorp == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * On success the B-Tree lookup will hold the appropriate
         * buffer cache buffers and provide a pointer to the requested
         * information.  Copy the information to the in-memory inode
         * and cache the B-Tree node to improve future operations.
         */
        if (*errorp == 0) {
                ip->ino_rec = cursor.record->inode;
                ip->ino_data = cursor.data->inode;
                hammer_cache_node(cursor.node, &ip->cache[0]);
                if (cache)
                        hammer_cache_node(cursor.node, cache);
        }

        /*
         * On success load the inode's record and data and insert the
         * inode into the in-memory inode RB tree.  It is possible to race
         * another lookup inserting the same inode, so deal with that
         * condition too.
         *
         * The cursor's locked node interlocks against others creating and
         * destroying ip while we were blocked.
         */
        if (*errorp == 0) {
                hammer_ref(&ip->lock);
                if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                        hammer_uncache_node(&ip->cache[0]);
                        hammer_uncache_node(&ip->cache[1]);
                        KKASSERT(ip->lock.refs == 1);
                        --hammer_count_inodes;
                        kfree(ip, M_HAMMER);
                        hammer_done_cursor(&cursor);
                        goto loop;
                }
                ip->flags |= HAMMER_INODE_ONDISK;
        } else {
                kprintf("hammer_get_inode: failed ip %p obj_id %016llx "
                        "cursor %p error %d\n",
                        ip, ip->obj_id, &cursor, *errorp);
                /*Debugger("x");*/
                --hammer_count_inodes;
                kfree(ip, M_HAMMER);
                ip = NULL;
        }
        hammer_done_cursor(&cursor);
        return (ip);
}
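
/*
 * Typical frontend usage, in sketch form (error handling abbreviated;
 * the cache slot and flags shown are illustrative): a name-resolution
 * VOP looks up the obj_id stored in a directory entry, attaches a
 * vnode, and then drops its temporary inode reference:
 *
 *      ip = hammer_get_inode(&trans, &dip->cache[1], obj_id,
 *                            asof, flags, &error);
 *      if (ip) {
 *              error = hammer_get_vnode(ip, &vp);
 *              hammer_rel_inode(ip, 0);
 *      }
 */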

/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced.
 *
 * The inode is created in-memory.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
                    struct ucred *cred, hammer_inode_t dip,
                    struct hammer_inode **ipp)
{
        hammer_mount_t hmp;
        hammer_inode_t ip;
        uid_t xuid;

        hmp = trans->hmp;
        ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ip->obj_id = hammer_alloc_objid(trans, dip);
        KKASSERT(ip->obj_id != 0);
        ip->obj_asof = hmp->asof;
        ip->hmp = hmp;
        ip->flush_state = HAMMER_FST_IDLE;
        ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
                    HAMMER_INODE_ITIMES;

        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->bio_list);
        TAILQ_INIT(&ip->bio_alt_list);
        TAILQ_INIT(&ip->target_list);

        ip->ino_rec.ino_atime = trans->time;
        ip->ino_rec.ino_mtime = trans->time;
        ip->ino_rec.ino_size = 0;
        ip->ino_rec.ino_nlinks = 0;
        /* XXX */
        ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        ip->ino_rec.base.base.obj_id = ip->obj_id;
        ip->ino_rec.base.base.key = 0;
        ip->ino_rec.base.base.create_tid = 0;
        ip->ino_rec.base.base.delete_tid = 0;
        ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
        ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);

        ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
        ip->ino_data.mode = vap->va_mode;
        ip->ino_data.ctime = trans->time;
        ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;

        switch(ip->ino_rec.base.base.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                ip->ino_data.rmajor = vap->va_rmajor;
                ip->ino_data.rminor = vap->va_rminor;
                break;
        default:
                break;
        }

        /*
         * Calculate default uid/gid and overwrite with information from
         * the vap.
         */
        xuid = hammer_to_unix_xid(&dip->ino_data.uid);
        ip->ino_data.gid = dip->ino_data.gid;
        xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
                                     &vap->va_mode);
        ip->ino_data.mode = vap->va_mode;

        if (vap->va_vaflags & VA_UID_UUID_VALID)
                ip->ino_data.uid = vap->va_uid_uuid;
        else if (vap->va_uid != (uid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
        if (vap->va_vaflags & VA_GID_UUID_VALID)
                ip->ino_data.gid = vap->va_gid_uuid;
        else if (vap->va_gid != (gid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);

        hammer_ref(&ip->lock);
        if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                hammer_unref(&ip->lock);
                panic("hammer_create_inode: duplicate obj_id %llx",
                      ip->obj_id);
        }
        *ipp = ip;
        return(0);
}
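
/*
 * Typical creation path, in sketch form (the directory-entry call is
 * illustrative of the hammer_vnops.c usage): a create-style VOP builds
 * the in-memory inode and then associates a directory entry with it,
 * which ties the two inodes together as a flush dependency until the
 * flusher syncs them:
 *
 *      error = hammer_create_inode(&trans, vap, cred, dip, &nip);
 *      if (error == 0)
 *              error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
 */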

/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_t record;
        int error;

retry:
        error = 0;

        /*
         * If the inode has a presence on-disk then locate it and mark
         * it deleted, setting DELONDISK.
         *
         * The record may or may not be physically deleted, depending on
         * the retention policy.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
            HAMMER_INODE_ONDISK) {
                hammer_normalize_cursor(cursor);
                cursor->key_beg.obj_id = ip->obj_id;
                cursor->key_beg.key = 0;
                cursor->key_beg.create_tid = 0;
                cursor->key_beg.delete_tid = 0;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
                cursor->key_beg.obj_type = 0;
                cursor->asof = ip->obj_asof;
                cursor->flags &= ~HAMMER_CURSOR_INITMASK;
                cursor->flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
                cursor->flags |= HAMMER_CURSOR_BACKEND;

                error = hammer_btree_lookup(cursor);
                if (hammer_debug_inode)
                        kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
                if (error) {
                        kprintf("error %d\n", error);
                        Debugger("hammer_update_inode");
                }

                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, trans->tid);
                        if (hammer_debug_inode)
                                kprintf(" error %d\n", error);
                        if (error && error != EDEADLK) {
                                kprintf("error %d\n", error);
                                Debugger("hammer_update_inode2");
                        }
                        if (error == 0) {
                                ip->flags |= HAMMER_INODE_DELONDISK;
                        }
                        if (cursor->node)
                                hammer_cache_node(cursor->node, &ip->cache[0]);
                }
                if (error == EDEADLK) {
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("IPDED %p %d\n", ip, error);
                        if (error == 0)
                                goto retry;
                }
        }

        /*
         * Ok, write out the initial record or a new record (after deleting
         * the old one), unless the DELETED flag is set.  This routine will
         * clear DELONDISK if it writes out a record.
         *
         * Update our inode statistics if this is the first application of
         * the inode on-disk.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
                /*
                 * Generate a record and write it to the media
                 */
                record = hammer_alloc_mem_record(ip);
                record->type = HAMMER_MEM_RECORD_GENERAL;
                record->flush_state = HAMMER_FST_FLUSH;
                record->rec.inode = ip->sync_ino_rec;
                record->rec.inode.base.base.create_tid = trans->tid;
                record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
                record->data = (void *)&ip->sync_ino_data;
                record->flags |= HAMMER_RECF_INTERLOCK_BE;
                for (;;) {
                        error = hammer_ip_sync_record_cursor(cursor, record);
                        if (hammer_debug_inode)
                                kprintf("GENREC %p rec %08x %d\n",
                                        ip, record->flags, error);
                        if (error != EDEADLK)
                                break;
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("GENREC reinit %d\n", error);
                        if (error)
                                break;
                }
                if (error) {
                        kprintf("error %d\n", error);
                        Debugger("hammer_update_inode3");
                }

                /*
                 * The record isn't managed by the inode's record tree,
                 * destroy it whether we succeed or fail.
                 */
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
                record->flags |= HAMMER_RECF_DELETED_FE;
                record->flush_state = HAMMER_FST_IDLE;
                hammer_rel_mem_record(record);

                /*
                 * Finish up.
                 */
                if (error == 0) {
                        if (hammer_debug_inode)
                                kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
                        ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
                                            HAMMER_INODE_DDIRTY |
                                            HAMMER_INODE_ITIMES);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;

                        /*
                         * Root volume count of inodes
                         */
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume_field(trans,
                                                           trans->rootvol,
                                                           vol0_stat_inodes);
                                ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans->rootvol);
                                ip->flags |= HAMMER_INODE_ONDISK;
                                if (hammer_debug_inode)
                                        kprintf("NOWONDISK %p\n", ip);
                        }
                }
        }

        /*
         * If the inode has been destroyed, clean out any left-over flags
         * that may have been set by the frontend.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
                ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
                                    HAMMER_INODE_DDIRTY |
                                    HAMMER_INODE_ITIMES);
        }
        return(error);
}

/*
 * Update only the itimes fields.  This is done non-historically.  The
 * record is updated in-place on the disk.
 */
static int
hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        struct hammer_inode_record *rec;
        int error;

retry:
        error = 0;
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
            HAMMER_INODE_ONDISK) {
                hammer_normalize_cursor(cursor);
                cursor->key_beg.obj_id = ip->obj_id;
                cursor->key_beg.key = 0;
                cursor->key_beg.create_tid = 0;
                cursor->key_beg.delete_tid = 0;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
                cursor->key_beg.obj_type = 0;
                cursor->asof = ip->obj_asof;
                cursor->flags &= ~HAMMER_CURSOR_INITMASK;
                cursor->flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
                cursor->flags |= HAMMER_CURSOR_BACKEND;

                error = hammer_btree_lookup(cursor);
                if (error) {
                        kprintf("error %d\n", error);
                        Debugger("hammer_update_itimes1");
                }
                if (error == 0) {
                        /*
                         * Do not generate UNDO records for atime/mtime
                         * updates.
                         */
                        rec = &cursor->record->inode;
                        hammer_modify_buffer(trans, cursor->record_buffer,
                                             NULL, 0);
                        rec->ino_atime = ip->sync_ino_rec.ino_atime;
                        rec->ino_mtime = ip->sync_ino_rec.ino_mtime;
                        hammer_modify_buffer_done(cursor->record_buffer);
                        ip->sync_flags &= ~HAMMER_INODE_ITIMES;
                        /* XXX recalculate crc */
                        hammer_cache_node(cursor->node, &ip->cache[0]);
                }
                if (error == EDEADLK) {
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (error == 0)
                                goto retry;
                }
        }
        return(error);
}

/*
 * Release a reference on an inode, flushing as requested.
 *
 * On the last reference we queue the inode to the flusher for its final
 * disposition.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
        hammer_mount_t hmp = ip->hmp;

        /*
         * Handle disposition when dropping the last ref.
         */
        for (;;) {
                if (ip->lock.refs == 1) {
                        /*
                         * Determine whether on-disk action is needed for
                         * the inode's final disposition.
                         */
                        KKASSERT(ip->vp == NULL);
                        hammer_inode_unloadable_check(ip, 0);
                        if (ip->flags & HAMMER_INODE_MODMASK) {
                                hammer_flush_inode(ip, 0);
                        } else if (ip->lock.refs == 1) {
                                hammer_unload_inode(ip);
                                break;
                        }
                } else {
                        if (flush)
                                hammer_flush_inode(ip, 0);

                        /*
                         * The inode still has multiple refs, try to drop
                         * one ref.
                         */
                        KKASSERT(ip->lock.refs >= 1);
                        if (ip->lock.refs > 1) {
                                hammer_unref(&ip->lock);
                                break;
                        }
                }
        }

        /*
         * XXX bad hack until I add code to track inodes in SETUP.  We
         * can queue a lot of inodes to the syncer, but if we don't wake
         * it up the undo sets will be too large or too many unflushed
         * records will build up and blow our malloc limit.
         */
        if (++hmp->reclaim_count > 256) {
                hmp->reclaim_count = 0;
                hammer_flusher_async(hmp);
        }
}
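
/*
 * Note on the reference discipline used above: hammer_get_inode() and
 * hammer_create_inode() return referenced inodes, and hammer_ref() adds
 * further references.  Every reference must eventually be balanced by a
 * call to hammer_rel_inode(); the vnode association's reference is
 * dropped via hammer_vop_reclaim().
 */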

/*
 * Unload and destroy the specified inode.  Must be called with one remaining
 * reference.  The reference is disposed of.
 *
 * This can only be called in the context of the flusher.
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
        KASSERT(ip->lock.refs == 1,
                ("hammer_unload_inode: %d refs\n", ip->lock.refs));
        KKASSERT(ip->vp == NULL);
        KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
        KKASSERT(ip->cursor_ip_refs == 0);
        KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

        KKASSERT(RB_EMPTY(&ip->rec_tree));
        KKASSERT(TAILQ_EMPTY(&ip->target_list));
        KKASSERT(TAILQ_EMPTY(&ip->bio_list));
        KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));

        RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);

        hammer_uncache_node(&ip->cache[0]);
        hammer_uncache_node(&ip->cache[1]);
        if (ip->objid_cache)
                hammer_clear_objid(ip);
        --hammer_count_inodes;
        kfree(ip, M_HAMMER);

        return(0);
}

/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_RDIRTY:  Inode record has been updated
 * HAMMER_INODE_DDIRTY:  Inode data has been updated
 * HAMMER_INODE_XDIRTY:  Dirty in-memory records
 * HAMMER_INODE_BUFS:    Dirty buffer cache buffers
 * HAMMER_INODE_DELETED: Inode record/data must be deleted
 * HAMMER_INODE_ITIMES:  mtime/atime has been updated
 */
void
hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
{
        KKASSERT((ip->flags & HAMMER_INODE_RO) == 0 ||
                 (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
                           HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|
                           HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);

        ip->flags |= flags;
}
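
/*
 * For example (a sketch of the common VOP pattern): an operation that
 * touches the timestamps updates the in-memory record and then flags
 * what changed so the flusher knows what to synchronize:
 *
 *      ip->ino_rec.ino_mtime = trans.time;
 *      hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
 */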

/*
 * Request that an inode be flushed.  This whole mess cannot block and may
 * recurse.  Once requested HAMMER will attempt to actively flush the inode
 * until the flush can be done.
 *
 * The inode may already be flushing, or may be in a setup state.  We can
 * place the inode in a flushing state if it is currently idle and flag it
 * to reflush if it is currently flushing.
 */
void
hammer_flush_inode(hammer_inode_t ip, int flags)
{
        hammer_record_t depend;
        int r, good;

        /*
         * Trivial 'nothing to flush' case.  If the inode is in a SETUP
         * state we have to put it back into an IDLE state so we can
         * drop the extra ref.
         */
        if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
                if (ip->flush_state == HAMMER_FST_SETUP) {
                        ip->flush_state = HAMMER_FST_IDLE;
                        hammer_rel_inode(ip, 0);
                }
                return;
        }

        /*
         * Our flush action will depend on the current state.
         */
        switch(ip->flush_state) {
        case HAMMER_FST_IDLE:
                /*
                 * We have no dependencies and can flush immediately.  Some
                 * of our children may not be flushable, so we have to
                 * re-test with that additional knowledge.
                 */
                hammer_flush_inode_core(ip, flags);
                break;
        case HAMMER_FST_SETUP:
                /*
                 * Recurse upwards through dependencies via target_list
                 * and start their flusher actions going if possible.
                 *
                 * 'good' is our connectivity.  -1 means we have none and
                 * can't flush, 0 means there weren't any dependencies, and
                 * 1 means we have good connectivity.
                 */
                good = 0;
                TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
                        r = hammer_setup_parent_inodes(depend);
                        if (r < 0 && good == 0)
                                good = -1;
                        if (r > 0)
                                good = 1;
                }

                /*
                 * We can continue if good >= 0.  Determine how many records
                 * under our inode can be flushed (and mark them).
                 */
                if (good >= 0) {
                        hammer_flush_inode_core(ip, flags);
                } else {
                        ip->flags |= HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp);
                        }
                }
                break;
        default:
                /*
                 * We are already flushing, flag the inode to reflush
                 * if needed after it completes its current flush.
                 */
                if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
                        ip->flags |= HAMMER_INODE_REFLUSH;
                if (flags & HAMMER_FLUSH_SIGNAL) {
                        ip->flags |= HAMMER_INODE_RESIGNAL;
                        hammer_flusher_async(ip->hmp);
                }
                break;
        }
}
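
/*
 * A sketch of the dependency topology the routines above and below
 * operate on: a record (typically a directory entry) is owned by the
 * directory inode (record->ip) and targets the inode it names
 * (record->target_ip).  The target finds such records via its
 * target_list:
 *
 *      record->ip (parent directory, record on ip->rec_tree)
 *              |
 *            record (ADD, DEL, or GENERAL)
 *              |
 *      record->target_ip (this inode, via its target_list)
 *
 * Flushing a file thus requires pulling the directory entries which
 * connect it into the same flush group, which is what
 * hammer_setup_parent_inodes() arranges.
 */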

/*
 * We are asked to recurse upwards and convert the record from SETUP
 * to FLUSH if possible.  record->ip is a parent of the caller's inode,
 * and record->target_ip is the caller's inode.
 *
 * Return 1 if the record gives us connectivity.
 *
 * Return 0 if the record is not relevant.
 *
 * Return -1 if we can't resolve the dependency and there is no connectivity.
 */
static int
hammer_setup_parent_inodes(hammer_record_t record)
{
        hammer_mount_t hmp = record->ip->hmp;
        hammer_record_t depend;
        hammer_inode_t ip;
        int r, good;

        KKASSERT(record->flush_state != HAMMER_FST_IDLE);
        ip = record->ip;

        /*
         * If the record is already flushing, is it in our flush group?
         *
         * If it is in our flush group but it is a general record or a
         * delete-on-disk, it does not improve our connectivity (return 0),
         * and if the target inode is not trying to destroy itself we can't
         * allow the operation yet anyway (the second return -1).
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
                if (record->flush_group != hmp->flusher_next) {
                        ip->flags |= HAMMER_INODE_REFLUSH;
                        return(-1);
                }
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
                /* GENERAL or DEL */
                return(0);
        }

        /*
         * It must be a setup record.  Try to resolve the setup dependencies
         * by recursing upwards so we can place ip on the flush list.
         */
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

        good = 0;
        TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
                r = hammer_setup_parent_inodes(depend);
                if (r < 0 && good == 0)
                        good = -1;
                if (r > 0)
                        good = 1;
        }

        /*
         * We can't flush ip because it has no connectivity (XXX also check
         * nlinks for pre-existing connectivity!).  Flag it so any resolution
         * recurses back down.
         */
        if (good < 0) {
                ip->flags |= HAMMER_INODE_REFLUSH;
                return(good);
        }

        /*
         * We are good to go, place the parent inode in a flushing state so
         * we can place its record in a flushing state.  Note that the
         * parent may already be flushing.  The record must be in the same
         * flush group as the parent.
         */
        if (ip->flush_state != HAMMER_FST_FLUSH)
                hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
        KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

#if 0
        if (record->type == HAMMER_MEM_RECORD_DEL &&
            (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
                /*
                 * Regardless of flushing state we cannot sync this path if
                 * the record represents a delete-on-disk but the target
                 * inode is not ready to sync its own deletion.
                 *
                 * XXX need to count effective nlinks to determine whether
                 * the flush is ok, otherwise removing a hardlink will
                 * just leave the DEL record to rot.
                 */
                record->target_ip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        } else
#endif
        if (ip->flush_group == ip->hmp->flusher_next) {
                /*
                 * This is the record we wanted to synchronize.
                 */
                record->flush_state = HAMMER_FST_FLUSH;
                record->flush_group = ip->flush_group;
                hammer_ref(&record->lock);
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);

                /*
                 * A general or delete-on-disk record does not contribute
                 * to our visibility.  We can still flush it, however.
                 */
                return(0);
        } else {
                /*
                 * We couldn't resolve the dependencies, request that the
                 * inode be flushed when the dependencies can be resolved.
                 */
                ip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        }
}
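
/*
 * Connectivity example (a sketch): when a newly created file is flushed,
 * its target_list holds the ADD record for the directory entry naming
 * it.  The recursion above converts that record, and if necessary the
 * parent directory inode itself, to FST_FLUSH, so the directory entry
 * which makes the file visible is synced in the same flush group as the
 * file.
 */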

/*
 * This is the core routine placing an inode into the FST_FLUSH state.
 */
static void
hammer_flush_inode_core(hammer_inode_t ip, int flags)
{
        int go_count;

        /*
         * Set flush state and prevent the flusher from cycling into
         * the next flush group.  Do not place the ip on the list yet.
         * Inodes not in the idle state get an extra reference.
         */
        KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
        if (ip->flush_state == HAMMER_FST_IDLE)
                hammer_ref(&ip->lock);
        ip->flush_state = HAMMER_FST_FLUSH;
        ip->flush_group = ip->hmp->flusher_next;
        ++ip->hmp->flusher_lock;

        /*
         * We need to be able to vfsync/truncate from the backend.
         */
        KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
        if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
                ip->flags |= HAMMER_INODE_VHELD;
                vref(ip->vp);
        }

        /*
         * Figure out how many in-memory records we can actually flush
         * (not including inode meta-data, buffers, etc).
         */
        if (flags & HAMMER_FLUSH_RECURSION) {
                go_count = 1;
        } else {
                go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                   hammer_setup_child_callback, NULL);
        }

        /*
         * This is a more involved test that includes go_count.  If we
         * can't flush, flag the inode and return.  If go_count is 0 we
         * were unable to flush any records in our rec_tree and must
         * ignore the XDIRTY flag.
         */
        if (go_count == 0) {
                if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
                        ip->flags |= HAMMER_INODE_REFLUSH;
                        ip->flush_state = HAMMER_FST_SETUP;
                        if (ip->flags & HAMMER_INODE_VHELD) {
                                ip->flags &= ~HAMMER_INODE_VHELD;
                                vrele(ip->vp);
                        }
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp);
                        }
                        if (--ip->hmp->flusher_lock == 0)
                                wakeup(&ip->hmp->flusher_lock);
                        return;
                }
        }

        /*
         * Snapshot the state of the inode for the backend flusher.
         *
         * The truncation must be retained in the frontend until after
         * we've actually performed the record deletion.
         *
         * NOTE: The DELETING flag is a mod flag, but it is also sticky,
         * and stays in ip->flags.  Once set, it stays set until the
         * inode is destroyed.
         */
        ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
        ip->sync_trunc_off = ip->trunc_off;
        ip->sync_ino_rec = ip->ino_rec;
        ip->sync_ino_data = ip->ino_data;
        ip->flags &= (~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED);

        /*
         * The flusher list inherits our inode and reference.
         */
        TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
        if (--ip->hmp->flusher_lock == 0)
                wakeup(&ip->hmp->flusher_lock);

        if (flags & HAMMER_FLUSH_SIGNAL)
                hammer_flusher_async(ip->hmp);
}

/*
 * Callback for scan of ip->rec_tree.  Try to include each record in our
 * flush.  ip->flush_group has been set but the inode has not yet been
 * moved into a flushing state.
 *
 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
 * both inodes.
 *
 * We return 1 for any record placed or found in FST_FLUSH, which prevents
 * the caller from shortcutting the flush.
 */
static int
hammer_setup_child_callback(hammer_record_t rec, void *data)
{
        hammer_inode_t target_ip;
        hammer_inode_t ip;
        int r;

        /*
         * If the record has been deleted by the backend (it's being held
         * by the frontend in a race), just ignore it.
         */
        if (rec->flags & HAMMER_RECF_DELETED_BE)
                return(0);

        /*
         * If the record is in an idle state it has no dependencies and
         * can be flushed.
         */
        ip = rec->ip;
        r = 0;

        switch(rec->flush_state) {
        case HAMMER_FST_IDLE:
                /*
                 * The record has no setup dependency, we can flush it.
                 */
                KKASSERT(rec->target_ip == NULL);
                rec->flush_state = HAMMER_FST_FLUSH;
                rec->flush_group = ip->flush_group;
                hammer_ref(&rec->lock);
                r = 1;
                break;
        case HAMMER_FST_SETUP:
                /*
                 * The record has a setup dependency.  Try to include the
                 * target ip in the flush.
                 *
                 * We have to be careful here, if we do not do the right
                 * thing we can lose track of dirty inodes and the system
                 * will lock up trying to allocate buffers.
                 */
                target_ip = rec->target_ip;
                KKASSERT(target_ip != NULL);
                KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
                if (target_ip->flush_state == HAMMER_FST_FLUSH) {
                        /*
                         * If the target IP is already flushing in our group
                         * we are golden, otherwise make sure the target
                         * reflushes.
                         */
                        if (target_ip->flush_group == ip->flush_group) {
                                rec->flush_state = HAMMER_FST_FLUSH;
                                rec->flush_group = ip->flush_group;
                                hammer_ref(&rec->lock);
                                r = 1;
                        } else {
                                target_ip->flags |= HAMMER_INODE_REFLUSH;
                        }
                } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
                        /*
                         * If the target IP is not flushing we can force
                         * it to flush, even if it is unable to write out
                         * any of its own records we have at least one in
                         * hand that we CAN deal with.
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
                        rec->flush_group = ip->flush_group;
                        hammer_ref(&rec->lock);
                        hammer_flush_inode_core(target_ip,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                } else {
                        /*
                         * General or delete-on-disk record.
                         *
                         * XXX this needs help.  If a delete-on-disk we could
                         * disconnect the target.  If the target has its own
                         * dependencies they really need to be flushed.
                         *
                         * XXX
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
                        rec->flush_group = ip->flush_group;
                        hammer_ref(&rec->lock);
                        hammer_flush_inode_core(target_ip,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                }
                break;
        case HAMMER_FST_FLUSH:
                /*
                 * The record is already associated with a flush group.  It
                 * had better be ours.
                 */
                KKASSERT(rec->flush_group == ip->flush_group);
                r = 1;
                break;
        }
        return(r);
}

/*
 * Wait for a previously queued flush to complete.
 */
void
hammer_wait_inode(hammer_inode_t ip)
{
        while (ip->flush_state != HAMMER_FST_IDLE) {
                ip->flags |= HAMMER_INODE_FLUSHW;
                tsleep(&ip->flags, 0, "hmrwin", 0);
        }
}
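
/*
 * The matching wakeup is in hammer_flush_inode_done() below, which
 * clears HAMMER_INODE_FLUSHW and issues wakeup(&ip->flags) once the
 * inode has left the FLUSH state.
 */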

/*
 * Called by the backend code when a flush has been completed.
 * The inode has already been removed from the flush list.
 *
 * A pipelined flush can occur, in which case we must re-enter the
 * inode on the list and re-copy its fields.
 */
void
hammer_flush_inode_done(hammer_inode_t ip)
{
        struct bio *bio;
        int dorel = 0;

        KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);

        /*
         * Allow BIOs to queue to the inode's primary bioq again.
         */
        ip->flags &= ~HAMMER_INODE_WRITE_ALT;

        /*
         * Merge left-over flags back into the frontend and fix the state.
         */
        ip->flags |= ip->sync_flags;

        /*
         * The backend may have adjusted nlinks, so if the adjusted nlinks
         * does not match the frontend set the frontend's RDIRTY flag again.
         */
        if (ip->ino_rec.ino_nlinks != ip->sync_ino_rec.ino_nlinks)
                ip->flags |= HAMMER_INODE_RDIRTY;

        /*
         * Reflush any BIOs that wound up in the alt list.  Our inode will
         * also wind up at the end of the flusher's list.
         */
        while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
                TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
                TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
        }

        /*
         * Fix up the dirty buffer status.
         */
        if (TAILQ_FIRST(&ip->bio_list) ||
            (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree))) {
                ip->flags |= HAMMER_INODE_BUFS;
        }

        /*
         * Re-set the XDIRTY flag if some of the inode's in-memory records
         * could not be flushed.
         */
        if (RB_ROOT(&ip->rec_tree))
                ip->flags |= HAMMER_INODE_XDIRTY;

        /*
         * Do not lose track of inodes which no longer have vnode
         * associations, otherwise they may never get flushed again.
         */
        if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
                ip->flags |= HAMMER_INODE_REFLUSH;

        /*
         * Adjust flush_state.  The target state (idle or setup) shouldn't
         * be terribly important since we will reflush if we really need
         * to do anything.  XXX
         */
        if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
                ip->flush_state = HAMMER_FST_IDLE;
                dorel = 1;
        } else {
                ip->flush_state = HAMMER_FST_SETUP;
        }

        /*
         * Clean up the vnode ref.
         */
        if (ip->flags & HAMMER_INODE_VHELD) {
                ip->flags &= ~HAMMER_INODE_VHELD;
                vrele(ip->vp);
        }

        /*
         * If the frontend made more changes and requested another flush,
         * then try to get it running.
         */
        if (ip->flags & HAMMER_INODE_REFLUSH) {
                ip->flags &= ~HAMMER_INODE_REFLUSH;
                if (ip->flags & HAMMER_INODE_RESIGNAL) {
                        ip->flags &= ~HAMMER_INODE_RESIGNAL;
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                } else {
                        hammer_flush_inode(ip, 0);
                }
        }

        /*
         * Finally, if the frontend is waiting for a flush to complete,
         * wake it up.
         */
        if (ip->flush_state != HAMMER_FST_FLUSH) {
                if (ip->flags & HAMMER_INODE_FLUSHW) {
                        ip->flags &= ~HAMMER_INODE_FLUSHW;
                        wakeup(&ip->flags);
                }
        }
        if (dorel)
                hammer_rel_inode(ip, 0);
}

/*
 * Called from hammer_sync_inode() to synchronize in-memory records
 * to the media.
 */
static int
hammer_sync_record_callback(hammer_record_t record, void *data)
{
        hammer_cursor_t cursor = data;
        hammer_transaction_t trans = cursor->trans;
        int error;

        /*
         * Skip records that do not belong to the current flush.
         */
        if (record->flush_state != HAMMER_FST_FLUSH)
                return(0);
        KKASSERT((record->flags & HAMMER_RECF_DELETED_BE) == 0);
#if 1
        if (record->flush_group != record->ip->flush_group) {
                kprintf("sync_record %p ip %p bad flush group %d %d\n",
                        record, record->ip, record->flush_group,
                        record->ip->flush_group);
                Debugger("blah2");
                return(0);
        }
#endif
        KKASSERT(record->flush_group == record->ip->flush_group);

        /*
         * Interlock the record using the BE flag.  Once BE is set the
         * frontend cannot change the state of FE.
         *
         * NOTE: If FE is set prior to us setting BE we still sync the
         * record out, but the flush completion code converts it to
         * a delete-on-disk record instead of destroying it.
         */
        KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
        record->flags |= HAMMER_RECF_INTERLOCK_BE;

        /*
         * If DELETED_FE is set we may have already sent dependent pieces
         * to the disk and we must flush the record as if it hadn't been
         * deleted.  This creates a bit of a mess because we have to
         * have ip_sync_record convert the record to MEM_RECORD_DEL before
         * it inserts the B-Tree record.  Otherwise the media sync might
         * be visible to the frontend.
         */
        if (record->flags & HAMMER_RECF_DELETED_FE) {
                if (record->type == HAMMER_MEM_RECORD_ADD) {
                        record->flags |= HAMMER_RECF_CONVERT_DELETE;
                } else {
                        KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
                        return(0);
                }
        }

        /*
         * Assign the create_tid for new records.  Deletions already
         * have the record's entire key properly set up.
         */
        if (record->type != HAMMER_MEM_RECORD_DEL)
                record->rec.inode.base.base.create_tid = trans->tid;
        for (;;) {
                error = hammer_ip_sync_record_cursor(cursor, record);
                if (error != EDEADLK)
                        break;
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor,
                                           &record->ip->cache[0],
                                           record->ip);
                if (error)
                        break;
        }
        record->flags &= ~HAMMER_RECF_CONVERT_DELETE;

        if (error) {
                error = -error;
                if (error != -ENOSPC) {
                        kprintf("hammer_sync_record_callback: sync failed rec "
                                "%p, error %d\n", record, error);
                        Debugger("sync failed rec");
                }
        }
        hammer_flush_record_done(record, error);
        return(error);
}

/*
 * XXX error handling
 */
int
hammer_sync_inode(hammer_inode_t ip)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct bio *bio;
        hammer_record_t depend;
        hammer_record_t next;
        int error, tmp_error;
        u_int64_t nlinks;

        if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
                return(0);

        hammer_start_transaction_fls(&trans, ip->hmp);
        error = hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
        if (error)
                goto done;

        /*
         * Any directory records referencing this inode which are not in
         * our current flush group must adjust our nlink count for the
         * purposes of synchronization to disk.
         *
         * Records which are in our flush group can be unlinked from our
         * inode now, potentially allowing the inode to be physically
         * deleted.
         */
        nlinks = ip->ino_rec.ino_nlinks;
        next = TAILQ_FIRST(&ip->target_list);
        while ((depend = next) != NULL) {
                next = TAILQ_NEXT(depend, target_entry);
                if (depend->flush_state == HAMMER_FST_FLUSH &&
                    depend->flush_group == ip->hmp->flusher_act) {
                        /*
                         * If this is an ADD that was deleted by the frontend
                         * the frontend nlinks count will have already been
                         * decremented, but the backend is going to sync its
                         * directory entry and must account for it.  The
                         * record will be converted to a delete-on-disk when
                         * it gets synced.
                         *
                         * If the ADD was not deleted by the frontend we
                         * can remove the dependency from our target_list.
                         */
                        if (depend->flags & HAMMER_RECF_DELETED_FE) {
                                ++nlinks;
                        } else {
                                TAILQ_REMOVE(&ip->target_list, depend,
                                             target_entry);
                                depend->target_ip = NULL;
                        }
                } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
                        /*
                         * Not part of our flush group
                         */
                        KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
                        switch(depend->type) {
                        case HAMMER_MEM_RECORD_ADD:
                                --nlinks;
                                break;
                        case HAMMER_MEM_RECORD_DEL:
                                ++nlinks;
                                break;
                        default:
                                break;
                        }
                }
        }

        /*
         * Set dirty if we had to modify the link count.
         */
        if (ip->sync_ino_rec.ino_nlinks != nlinks) {
                KKASSERT((int64_t)nlinks >= 0);
                ip->sync_ino_rec.ino_nlinks = nlinks;
                ip->sync_flags |= HAMMER_INODE_RDIRTY;
        }

        /*
         * Queue up as many dirty buffers as we can then set a flag to
         * cause any further BIOs to go to the alternative queue.
         */
        if (ip->flags & HAMMER_INODE_VHELD)
                error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
        ip->flags |= HAMMER_INODE_WRITE_ALT;

        /*
         * The buffer cache may contain dirty buffers beyond the inode
         * state we copied from the frontend to the backend.  Because
         * we are syncing our buffer cache on the backend, resync
         * the truncation point and the file size so we don't wipe out
         * any data.
         *
         * Syncing the buffer cache on the frontend has serious problems
         * because it prevents us from passively queueing dirty inodes
         * to the backend (the BIOs could stall indefinitely).
         */
        if (ip->flags & HAMMER_INODE_TRUNCATED) {
                ip->sync_trunc_off = ip->trunc_off;
                ip->sync_flags |= HAMMER_INODE_TRUNCATED;
        }
        if (ip->sync_ino_rec.ino_size != ip->ino_rec.ino_size) {
                ip->sync_ino_rec.ino_size = ip->ino_rec.ino_size;
                ip->sync_flags |= HAMMER_INODE_RDIRTY;
        }

        /*
         * If there is a truncation queued, destroy any data past the
         * (aligned) truncation point.  Userland will have dealt with the
         * buffer containing the truncation point for us.
         *
         * We don't flush pending frontend data buffers until after we've
         * dealt with the truncation.
         *
         * Don't bother if the inode is or has been deleted.
         */
        if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
                /*
                 * Interlock trunc_off.  The VOP front-end may continue to
                 * make adjustments to it while we are blocked.
                 */
                off_t trunc_off;
                off_t aligned_trunc_off;

                trunc_off = ip->sync_trunc_off;
                aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
                                    ~HAMMER_BUFMASK64;

                /*
                 * Delete any whole blocks on-media.  The front-end has
                 * already cleaned out any partial block and made it
                 * pending.  The front-end may have updated trunc_off
                 * while we were blocked so do not just unconditionally
                 * set it to the maximum offset.
                 */
                error = hammer_ip_delete_range(&cursor, ip,
                                               aligned_trunc_off,
                                               0x7FFFFFFFFFFFFFFFLL);
                if (error)
                        Debugger("hammer_ip_delete_range errored");
                ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
                if (ip->trunc_off >= trunc_off) {
                        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
                        ip->flags &= ~HAMMER_INODE_TRUNCATED;
                }
        } else {
                error = 0;
        }

        /*
         * Now sync related records.  These will typically be directory
         * entries or delete-on-disk records.
         *
         * Not all records will be flushed, but clear XDIRTY anyway.  We
         * will set it again in the frontend hammer_flush_inode_done()
         * if records remain.
         */
        if (error == 0) {
                tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                    hammer_sync_record_callback, &cursor);
                /* a negative return from RB_SCAN is a negated errno */
                if (tmp_error < 0)
                        tmp_error = -tmp_error;
                if (tmp_error)
                        error = tmp_error;
                if (error == 0)
                        ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
        }

        /*
         * If we are deleting the inode the frontend had better not have
         * any active references on elements making up the inode.
         */
        if (error == 0 && ip->sync_ino_rec.ino_nlinks == 0 &&
            RB_EMPTY(&ip->rec_tree) &&
            (ip->sync_flags & HAMMER_INODE_DELETING) &&
            (ip->flags & HAMMER_INODE_DELETED) == 0) {
                int count1 = 0;

                kprintf("Y");
                ip->flags |= HAMMER_INODE_DELETED;
                error = hammer_ip_delete_range_all(&cursor, ip, &count1);
                if (error == 0) {
                        ip->sync_flags &= ~HAMMER_INODE_DELETING;
                        ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
                        KKASSERT(RB_EMPTY(&ip->rec_tree));

                        /*
                         * Set delete_tid in both the frontend and backend
                         * copy of the inode record.  The DELETED flag handles
                         * this, do not set RDIRTY.
                         */
                        ip->ino_rec.base.base.delete_tid = trans.tid;
                        ip->sync_ino_rec.base.base.delete_tid = trans.tid;

                        /*
                         * Adjust the inode count in the volume header.
                         */
                        hammer_modify_volume_field(&trans, trans.rootvol,
                                                   vol0_stat_inodes);
                        --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                        hammer_modify_volume_done(trans.rootvol);
                } else {
                        ip->flags &= ~HAMMER_INODE_DELETED;
                        Debugger("hammer_ip_delete_range_all errored");
                }
        }

        /*
         * Flush any queued BIOs.  These will just biodone() the IOs if
         * the inode has been deleted.
         */
        while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
                TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
                tmp_error = hammer_dowrite(&cursor, ip, bio);
                if (tmp_error)
                        error = tmp_error;
        }
        ip->sync_flags &= ~HAMMER_INODE_BUFS;

        if (error)
                Debugger("RB_SCAN errored");

        /*
         * Now update the inode's on-disk inode-data and/or on-disk record.
         * DELETED and ONDISK are managed only in ip->flags.
         */
        switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
        case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
                /*
                 * If deleted and on-disk, don't set any additional flags.
                 * The delete flag takes care of things.
                 *
                 * Clear flags which may have been set by the frontend.
                 */
                ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
                                    HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
                                    HAMMER_INODE_DELETING);
                break;
        case HAMMER_INODE_DELETED:
                /*
                 * Take care of the case where a deleted inode was never
                 * flushed to the disk in the first place.
                 *
                 * Clear flags which may have been set by the frontend.
                 */
                ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
                                    HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
                                    HAMMER_INODE_DELETING);
                while (RB_ROOT(&ip->rec_tree)) {
                        hammer_record_t record = RB_ROOT(&ip->rec_tree);
                        hammer_ref(&record->lock);
                        KKASSERT(record->lock.refs == 1);
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
                        hammer_rel_mem_record(record);
                }
                break;
        case HAMMER_INODE_ONDISK:
                /*
                 * If already on-disk, do not set any additional flags.
                 */
                break;
        default:
                /*
                 * If not on-disk and not deleted, set both dirty flags
                 * to force an initial record to be written.  Also set
                 * the create_tid for the inode.
                 *
                 * Set create_tid in both the frontend and backend
                 * copy of the inode record.
                 */
                ip->ino_rec.base.base.create_tid = trans.tid;
                ip->sync_ino_rec.base.base.create_tid = trans.tid;
                ip->sync_flags |= HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY;
                break;
        }

        /*
         * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
         * is already on-disk the old record is marked as deleted.
         *
         * If DELETED is set hammer_update_inode() will delete the existing
         * record without writing out a new one.
         *
         * If *ONLY* the ITIMES flag is set we can update the record in-place.
         */
        if (ip->flags & HAMMER_INODE_DELETED) {
                error = hammer_update_inode(&cursor, ip);
        } else
        if ((ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
                               HAMMER_INODE_ITIMES)) == HAMMER_INODE_ITIMES) {
                error = hammer_update_itimes(&cursor, ip);
        } else
        if (ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
                              HAMMER_INODE_ITIMES)) {
                error = hammer_update_inode(&cursor, ip);
        }
        if (error)
                Debugger("hammer_update_itimes/inode errored");
done:
        /*
         * Save the TID we used to sync the inode with to make sure we
         * do not improperly reuse it.
         */
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}
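
/*
 * Ordering summary for hammer_sync_inode(): (1) adjust nlinks for
 * records outside the current flush group, (2) queue dirty buffers and
 * divert further BIOs to the alt queue, (3) destroy media data beyond
 * any queued truncation point, (4) sync the in-memory records, (5)
 * physically delete the inode if its link count reached zero, (6) flush
 * the queued BIOs, and (7) write out, update in-place, or delete the
 * on-disk inode record.
 */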

/*
 * This routine is called when the OS is no longer actively referencing
 * the inode (but might still be keeping it cached), or when releasing
 * the last reference to an inode.
 *
 * At this point if the inode's nlinks count is zero we want to destroy
 * it, which may mean destroying it on-media too.
 */
void
hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
{
        struct vnode *vp;

        /*
         * Set the DELETING flag when the link count drops to 0 and the
         * OS no longer has any opens on the inode.
         *
         * The backend will clear DELETING (a mod flag) and set DELETED
         * (a state flag) when it is actually able to perform the
         * operation.
         */
        if (ip->ino_rec.ino_nlinks == 0 &&
            (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
                ip->flags |= HAMMER_INODE_DELETING;
                ip->flags |= HAMMER_INODE_TRUNCATED;
                ip->trunc_off = 0;
                vp = NULL;
                if (getvp) {
                        if (hammer_get_vnode(ip, &vp) != 0)
                                return;
                }
                if (ip->vp) {
                        vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
                        vnode_pager_setsize(ip->vp, 0);
                }
                if (getvp) {
                        vput(vp);
                }
        }
}

/*
 * Re-test an inode when a dependency has gone away to see if we
 * can chain-flush it.
 */
void
hammer_test_inode(hammer_inode_t ip)
{
        if (ip->flags & HAMMER_INODE_REFLUSH) {
                ip->flags &= ~HAMMER_INODE_REFLUSH;
                hammer_ref(&ip->lock);
                if (ip->flags & HAMMER_INODE_RESIGNAL) {
                        ip->flags &= ~HAMMER_INODE_RESIGNAL;
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                } else {
                        hammer_flush_inode(ip, 0);
                }
                hammer_rel_inode(ip, 0);
        }
}