HAMMER - Add MPSAFE support for hammer_vop_read().
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <vm/vm_extern.h>
39#include <sys/buf.h>
40#include <sys/buf2.h>
41
42static int hammer_unload_inode(struct hammer_inode *ip);
43static void hammer_free_inode(hammer_inode_t ip);
44static void hammer_flush_inode_core(hammer_inode_t ip,
45 hammer_flush_group_t flg, int flags);
46static int hammer_setup_child_callback(hammer_record_t rec, void *data);
47#if 0
48static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
49#endif
50static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
51 hammer_flush_group_t flg);
52static int hammer_setup_parent_inodes_helper(hammer_record_t record,
53 int depth, hammer_flush_group_t flg);
54static void hammer_inode_wakereclaims(hammer_inode_t ip, int dowake);
55
56#ifdef DEBUG_TRUNCATE
57extern struct hammer_inode *HammerTruncIp;
58#endif
59
60/*
61 * RB-Tree support for inode structures
62 */
63int
64hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65{
66 if (ip1->obj_localization < ip2->obj_localization)
67 return(-1);
68 if (ip1->obj_localization > ip2->obj_localization)
69 return(1);
70 if (ip1->obj_id < ip2->obj_id)
71 return(-1);
72 if (ip1->obj_id > ip2->obj_id)
73 return(1);
74 if (ip1->obj_asof < ip2->obj_asof)
75 return(-1);
76 if (ip1->obj_asof > ip2->obj_asof)
77 return(1);
78 return(0);
79}
80
81/*
82 * RB-Tree support for inode structures / special LOOKUP_INFO
83 */
84static int
85hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
86{
87 if (info->obj_localization < ip->obj_localization)
88 return(-1);
89 if (info->obj_localization > ip->obj_localization)
90 return(1);
91 if (info->obj_id < ip->obj_id)
92 return(-1);
93 if (info->obj_id > ip->obj_id)
94 return(1);
95 if (info->obj_asof < ip->obj_asof)
96 return(-1);
97 if (info->obj_asof > ip->obj_asof)
98 return(1);
99 return(0);
100}
101
102/*
103 * Used by hammer_scan_inode_snapshots() to locate all of an object's
104 * snapshots. Note that the asof field is not tested, which we can get
105 * away with because it is the lowest-priority field.
106 */
107static int
108hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
109{
110 hammer_inode_info_t info = data;
111
112 if (ip->obj_localization > info->obj_localization)
113 return(1);
114 if (ip->obj_localization < info->obj_localization)
115 return(-1);
116 if (ip->obj_id > info->obj_id)
117 return(1);
118 if (ip->obj_id < info->obj_id)
119 return(-1);
120 return(0);
121}
122
123/*
124 * Used by hammer_unload_pseudofs() to locate all inodes associated with
125 * a particular PFS.
126 */
127static int
128hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
129{
130 u_int32_t localization = *(u_int32_t *)data;
131 if (ip->obj_localization > localization)
132 return(1);
133 if (ip->obj_localization < localization)
134 return(-1);
135 return(0);
136}
137
138/*
139 * RB-Tree support for pseudofs structures
140 */
141static int
142hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
143{
144 if (p1->localization < p2->localization)
145 return(-1);
146 if (p1->localization > p2->localization)
147 return(1);
148 return(0);
149}
150
151
152RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
153RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
154 hammer_inode_info_cmp, hammer_inode_info_t);
155RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
156 hammer_pfs_rb_compare, u_int32_t, localization);
157
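/*
 * Minimal usage sketch (the helper name is hypothetical): resolving a
 * cached inode through the generated INFO lookup.  The iinfo fields
 * mirror the comparison order used above: localization, then obj_id,
 * then asof.
 */
static __inline hammer_inode_t
example_ino_lookup(hammer_mount_t hmp, int64_t obj_id,
		   hammer_tid_t asof, u_int32_t localization)
{
	struct hammer_inode_info iinfo;

	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
	return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root,
						 &iinfo));
}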
158/*
159 * The kernel is not actively referencing this vnode but is still holding
160 * it cached.
161 *
162 * This is called from the frontend.
163 */
164int
165hammer_vop_inactive(struct vop_inactive_args *ap)
166{
167 struct hammer_inode *ip = VTOI(ap->a_vp);
168
169 /*
170 * Degenerate case
171 */
172 if (ip == NULL) {
173 vrecycle(ap->a_vp);
174 return(0);
175 }
176
177 /*
178 * If the inode no longer has visibility in the filesystem try to
179 * recycle it immediately, even if the inode is dirty. Recycling
180 * it quickly allows the system to reclaim buffer cache and VM
181 * resources which can matter a lot in a heavily loaded system.
182 *
183 * This can deadlock in vfsync() if we aren't careful.
184 *
185 * Do not queue the inode to the flusher if we still have visibility,
186 * otherwise namespace calls such as chmod will unnecessarily generate
187 * multiple inode updates.
188 */
189 hammer_inode_unloadable_check(ip, 0);
190 if (ip->ino_data.nlinks == 0) {
191 if (ip->flags & HAMMER_INODE_MODMASK)
192 hammer_flush_inode(ip, 0);
193 vrecycle(ap->a_vp);
194 }
195 return(0);
196}
197
198/*
199 * Release the vnode association. This is typically (but not always)
200 * the last reference on the inode.
201 *
202 * Once the association is lost we are on our own with regards to
203 * flushing the inode.
204 */
205int
206hammer_vop_reclaim(struct vop_reclaim_args *ap)
207{
208 struct hammer_inode *ip;
209 hammer_mount_t hmp;
210 struct vnode *vp;
211
212 vp = ap->a_vp;
213
214 if ((ip = vp->v_data) != NULL) {
215 hmp = ip->hmp;
216 vp->v_data = NULL;
217 ip->vp = NULL;
218
219 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
220 ++hammer_count_reclaiming;
221 ++hmp->inode_reclaims;
222 ip->flags |= HAMMER_INODE_RECLAIM;
223 }
224 hammer_rel_inode(ip, 1);
225 }
226 return(0);
227}
228
229/*
230 * Return a locked vnode for the specified inode. The inode must be
231 * referenced but NOT LOCKED on entry and will remain referenced on
232 * return.
233 *
234 * Called from the frontend.
235 */
236int
237hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
238{
239 hammer_mount_t hmp;
240 struct vnode *vp;
241 int error = 0;
242 u_int8_t obj_type;
243
244 hmp = ip->hmp;
245
246 for (;;) {
247 if ((vp = ip->vp) == NULL) {
248 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
249 if (error)
250 break;
251 hammer_lock_ex(&ip->lock);
252 if (ip->vp != NULL) {
253 hammer_unlock(&ip->lock);
254 vp = *vpp;
255 vp->v_type = VBAD;
256 vx_put(vp);
257 continue;
258 }
259 hammer_ref(&ip->lock);
260 vp = *vpp;
261 ip->vp = vp;
262
263 obj_type = ip->ino_data.obj_type;
264 vp->v_type = hammer_get_vnode_type(obj_type);
265
266 hammer_inode_wakereclaims(ip, 0);
267
268 switch(ip->ino_data.obj_type) {
269 case HAMMER_OBJTYPE_CDEV:
270 case HAMMER_OBJTYPE_BDEV:
271 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
272 addaliasu(vp, ip->ino_data.rmajor,
273 ip->ino_data.rminor);
274 break;
275 case HAMMER_OBJTYPE_FIFO:
276 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
277 break;
278 case HAMMER_OBJTYPE_REGFILE:
279 /*
280 * MPSAFE read supported.
281 */
282 vp->v_flag |= VMP_READ;
283 break;
284 default:
285 break;
286 }
287
288 /*
289 * Only mark as the root vnode if the ip is not
290 * historical, otherwise the VFS cache will get
291 * confused. The other half of the special handling
292 * is in hammer_vop_nlookupdotdot().
293 *
294 * Pseudo-filesystem roots can be accessed via
295 * non-root filesystem paths and setting VROOT may
296 * confuse the namecache. Set VPFSROOT instead.
297 */
298 if (ip->obj_id == HAMMER_OBJID_ROOT &&
299 ip->obj_asof == hmp->asof) {
300 if (ip->obj_localization == 0)
301 vp->v_flag |= VROOT;
302 else
303 vp->v_flag |= VPFSROOT;
304 }
305
306 vp->v_data = (void *)ip;
307 /* vnode locked by getnewvnode() */
308 /* make related vnode dirty if inode dirty? */
309 hammer_unlock(&ip->lock);
310 if (vp->v_type == VREG)
311 vinitvmio(vp, ip->ino_data.size);
312 break;
313 }
314
315 /*
316 * loop if the vget fails (aka races), or if the vp
317 * no longer matches ip->vp.
318 */
319 if (vget(vp, LK_EXCLUSIVE) == 0) {
320 if (vp == ip->vp)
321 break;
322 vput(vp);
323 }
324 }
325 *vpp = vp;
326 return(error);
327}
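/*
 * Minimal calling sketch for hammer_get_vnode() (the helper name is
 * hypothetical): the caller holds an inode reference, obtains a
 * locked+referenced vnode, and releases the two independently.
 */
static __inline int
example_with_vnode(hammer_inode_t ip)
{
	struct vnode *vp;
	int error;

	error = hammer_get_vnode(ip, &vp);	/* ip referenced, not locked */
	if (error == 0) {
		/* ... operate on the locked vnode ... */
		vput(vp);			/* drop vnode lock and ref */
	}
	hammer_rel_inode(ip, 0);		/* drop the caller's inode ref */
	return(error);
}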
328
329/*
330 * Locate all copies of the inode for obj_id compatible with the specified
331 * asof, reference, and issue the related call-back. This routine is used
332 * for direct-io invalidation and does not create any new inodes.
333 */
334void
335hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
336 int (*callback)(hammer_inode_t ip, void *data),
337 void *data)
338{
339 hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
340 hammer_inode_info_cmp_all_history,
341 callback, iinfo);
342}
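/*
 * Callback sketch (names are hypothetical): visit every cached as-of
 * copy of an object, e.g. for direct-io invalidation.  Returning 0
 * continues the scan.  obj_asof is ignored by the all-history
 * compare, so it is simply zeroed here.
 */
static int
example_snapshot_callback(hammer_inode_t ip, void *data __unused)
{
	/* ... invalidate cached state for this snapshot copy of ip ... */
	return(0);
}

static void
example_invalidate_object(hammer_mount_t hmp, int64_t obj_id,
			  u_int32_t localization)
{
	struct hammer_inode_info iinfo;

	iinfo.obj_id = obj_id;
	iinfo.obj_asof = 0;		/* not tested by the compare */
	iinfo.obj_localization = localization;
	hammer_scan_inode_snapshots(hmp, &iinfo,
				    example_snapshot_callback, NULL);
}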
343
344/*
345 * Acquire a HAMMER inode. The returned inode is not locked. These functions
346 * do not attach or detach the related vnode (use hammer_get_vnode() for
347 * that).
348 *
349 * The flags argument is only applied for newly created inodes, and only
350 * certain flags are inherited.
351 *
352 * Called from the frontend.
353 */
354struct hammer_inode *
355hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
356 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
357 int flags, int *errorp)
358{
359 hammer_mount_t hmp = trans->hmp;
360 struct hammer_node_cache *cachep;
361 struct hammer_inode_info iinfo;
362 struct hammer_cursor cursor;
363 struct hammer_inode *ip;
364
365
366 /*
367 * Determine if we already have an inode cached. If we do then
368 * we are golden.
369 *
370 * If we find an inode with no vnode we have to mark the
371 * transaction such that hammer_inode_waitreclaims() is
372 * called later on to avoid building up an infinite number
373 * of inodes. Otherwise we can continue to add new inodes
374 * faster than they can be disposed of, even with the tsleep
375 * delay.
376 *
377 * If we find a dummy inode we return a failure so dounlink
378 * (which does another lookup) doesn't try to mess with the
379 * link count. hammer_vop_nresolve() uses hammer_get_dummy_inode()
380 * to ref dummy inodes.
381 */
382 iinfo.obj_id = obj_id;
383 iinfo.obj_asof = asof;
384 iinfo.obj_localization = localization;
385loop:
386 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
387 if (ip) {
388 if (ip->flags & HAMMER_INODE_DUMMY) {
389 *errorp = ENOENT;
390 return(NULL);
391 }
392 hammer_ref(&ip->lock);
393 *errorp = 0;
394 return(ip);
395 }
396
397 /*
398 * Allocate a new inode structure and deal with races later.
399 */
400 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
401 ++hammer_count_inodes;
402 ++hmp->count_inodes;
403 ip->obj_id = obj_id;
404 ip->obj_asof = iinfo.obj_asof;
405 ip->obj_localization = localization;
406 ip->hmp = hmp;
407 ip->flags = flags & HAMMER_INODE_RO;
408 ip->cache[0].ip = ip;
409 ip->cache[1].ip = ip;
410 ip->cache[2].ip = ip;
411 ip->cache[3].ip = ip;
412 if (hmp->ronly)
413 ip->flags |= HAMMER_INODE_RO;
414 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
415 0x7FFFFFFFFFFFFFFFLL;
416 RB_INIT(&ip->rec_tree);
417 TAILQ_INIT(&ip->target_list);
418 hammer_ref(&ip->lock);
419
420 /*
421 * Locate the on-disk inode. If this is a PFS root we always
422 * access the current version of the root inode and (if it is not
423 * a master) always access information under it with a snapshot
424 * TID.
425 *
426 * We cache recent inode lookups in this directory in dip->cache[2].
427 * If we can't find it we assume the inode we are looking for is
428 * close to the directory inode.
429 */
430retry:
431 cachep = NULL;
432 if (dip) {
433 if (dip->cache[2].node)
434 cachep = &dip->cache[2];
435 else
436 cachep = &dip->cache[0];
437 }
438 hammer_init_cursor(trans, &cursor, cachep, NULL);
439 cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
440 cursor.key_beg.obj_id = ip->obj_id;
441 cursor.key_beg.key = 0;
442 cursor.key_beg.create_tid = 0;
443 cursor.key_beg.delete_tid = 0;
444 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
445 cursor.key_beg.obj_type = 0;
446
447 cursor.asof = iinfo.obj_asof;
448 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
449 HAMMER_CURSOR_ASOF;
450
451 *errorp = hammer_btree_lookup(&cursor);
452 if (*errorp == EDEADLK) {
453 hammer_done_cursor(&cursor);
454 goto retry;
455 }
456
457 /*
458 * On success the B-Tree lookup will hold the appropriate
459 * buffer cache buffers and provide a pointer to the requested
460 * information. Copy the information to the in-memory inode
461 * and cache the B-Tree node to improve future operations.
462 */
463 if (*errorp == 0) {
464 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
465 ip->ino_data = cursor.data->inode;
466
467 /*
468 * cache[0] tries to cache the location of the object inode.
469 * The assumption is that it is near the directory inode.
470 *
471 * cache[1] tries to cache the location of the object data.
472 * We might have something in the governing directory from
473 * scan optimizations (see the strategy code in
474 * hammer_vnops.c).
475 *
476 * We update dip->cache[2], if possible, with the location
477 * of the object inode for future directory shortcuts.
478 */
479 hammer_cache_node(&ip->cache[0], cursor.node);
480 if (dip) {
481 if (dip->cache[3].node) {
482 hammer_cache_node(&ip->cache[1],
483 dip->cache[3].node);
484 }
485 hammer_cache_node(&dip->cache[2], cursor.node);
486 }
487
488 /*
489 * The file should not contain any data past the file size
490 * stored in the inode. Setting save_trunc_off to the
491 * file size instead of max reduces B-Tree lookup overheads
492 * on append by allowing the flusher to avoid checking for
493 * record overwrites.
494 */
495 ip->save_trunc_off = ip->ino_data.size;
496
497 /*
498 * Locate and assign the pseudofs management structure to
499 * the inode.
500 */
501 if (dip && dip->obj_localization == ip->obj_localization) {
502 ip->pfsm = dip->pfsm;
503 hammer_ref(&ip->pfsm->lock);
504 } else {
505 ip->pfsm = hammer_load_pseudofs(trans,
506 ip->obj_localization,
507 errorp);
508 *errorp = 0; /* ignore ENOENT */
509 }
510 }
511
512 /*
513 * The inode is placed on the red-black tree and will be synced to
514 * the media when flushed or by the filesystem sync. If this races
515 * another instantiation/lookup the insertion will fail.
516 */
517 if (*errorp == 0) {
518 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
519 hammer_free_inode(ip);
520 hammer_done_cursor(&cursor);
521 goto loop;
522 }
523 ip->flags |= HAMMER_INODE_ONDISK;
524 } else {
525 if (ip->flags & HAMMER_INODE_RSV_INODES) {
526 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
527 --hmp->rsv_inodes;
528 }
529
530 hammer_free_inode(ip);
531 ip = NULL;
532 }
533 hammer_done_cursor(&cursor);
534 trans->flags |= HAMMER_TRANSF_NEWINODE;
535 return (ip);
536}
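/*
 * Usage sketch (the wrapper name is hypothetical): resolve an inode
 * by id within a transaction.  The returned inode is referenced but
 * not locked and must eventually be dropped via hammer_rel_inode().
 */
static __inline hammer_inode_t
example_resolve_inode(hammer_transaction_t trans, hammer_inode_t dip,
		      int64_t obj_id, u_int32_t localization, int *errorp)
{
	hammer_inode_t ip;

	ip = hammer_get_inode(trans, dip, obj_id, trans->hmp->asof,
			      localization, 0, errorp);
	/* caller runs hammer_rel_inode(ip, 0) when done, if ip != NULL */
	return(ip);
}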
537
538/*
539 * Get a dummy inode to placemark a broken directory entry.
540 */
541struct hammer_inode *
542hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
543 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
544 int flags, int *errorp)
545{
546 hammer_mount_t hmp = trans->hmp;
547 struct hammer_inode_info iinfo;
548 struct hammer_inode *ip;
549
550 /*
551 * Determine if we already have an inode cached. If we do then
552 * we are golden.
553 *
554 * If we find an inode with no vnode we have to mark the
555 * transaction such that hammer_inode_waitreclaims() is
556 * called later on to avoid building up an infinite number
557 * of inodes. Otherwise we can continue to add new inodes
558 * faster than they can be disposed of, even with the tsleep
559 * delay.
560 *
561 * If we find a non-fake inode we return an error. Only fake
562 * inodes can be returned by this routine.
563 */
564 iinfo.obj_id = obj_id;
565 iinfo.obj_asof = asof;
566 iinfo.obj_localization = localization;
567loop:
568 *errorp = 0;
569 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
570 if (ip) {
571 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
572 *errorp = ENOENT;
573 return(NULL);
574 }
575 hammer_ref(&ip->lock);
576 return(ip);
577 }
578
579 /*
580 * Allocate a new inode structure and deal with races later.
581 */
582 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
583 ++hammer_count_inodes;
584 ++hmp->count_inodes;
585 ip->obj_id = obj_id;
586 ip->obj_asof = iinfo.obj_asof;
587 ip->obj_localization = localization;
588 ip->hmp = hmp;
589 ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
590 ip->cache[0].ip = ip;
591 ip->cache[1].ip = ip;
592 ip->cache[2].ip = ip;
593 ip->cache[3].ip = ip;
594 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
595 0x7FFFFFFFFFFFFFFFLL;
596 RB_INIT(&ip->rec_tree);
597 TAILQ_INIT(&ip->target_list);
598 hammer_ref(&ip->lock);
599
600 /*
601 * Populate the dummy inode. Leave everything zero'd out.
602 *
603 * (ip->ino_leaf and ip->ino_data)
604 *
605 * Make the dummy inode a FIFO object which most copy programs
606 * will properly ignore.
607 */
608 ip->save_trunc_off = ip->ino_data.size;
609 ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
610
611 /*
612 * Locate and assign the pseudofs management structure to
613 * the inode.
614 */
615 if (dip && dip->obj_localization == ip->obj_localization) {
616 ip->pfsm = dip->pfsm;
617 hammer_ref(&ip->pfsm->lock);
618 } else {
619 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
620 errorp);
621 *errorp = 0; /* ignore ENOENT */
622 }
623
624 /*
625 * The inode is placed on the red-black tree and will be synced to
626 * the media when flushed or by the filesystem sync. If this races
627 * another instantiation/lookup the insertion will fail.
628 *
629 * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake.
630 */
631 if (*errorp == 0) {
632 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
633 hammer_free_inode(ip);
634 goto loop;
635 }
636 } else {
637 if (ip->flags & HAMMER_INODE_RSV_INODES) {
638 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
639 --hmp->rsv_inodes;
640 }
641 hammer_free_inode(ip);
642 ip = NULL;
643 }
644 trans->flags |= HAMMER_TRANSF_NEWINODE;
645 return (ip);
646}
647
648/*
649 * Return a referenced inode only if it is in our inode cache.
650 *
651 * Dummy inodes do not count.
652 */
653struct hammer_inode *
654hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
655 hammer_tid_t asof, u_int32_t localization)
656{
657 hammer_mount_t hmp = trans->hmp;
658 struct hammer_inode_info iinfo;
659 struct hammer_inode *ip;
660
661 iinfo.obj_id = obj_id;
662 iinfo.obj_asof = asof;
663 iinfo.obj_localization = localization;
664
665 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
666 if (ip) {
667 if (ip->flags & HAMMER_INODE_DUMMY)
668 ip = NULL;
669 else
670 hammer_ref(&ip->lock);
671 }
672 return(ip);
673}
674
675/*
676 * Create a new filesystem object, returning the inode in *ipp. The
677 * returned inode will be referenced. The inode is created in-memory.
678 *
679 * If pfsm is non-NULL the caller wishes to create the root inode for
680 * a master PFS.
681 */
682int
683hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
684 struct ucred *cred,
685 hammer_inode_t dip, const char *name, int namelen,
686 hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
687{
688 hammer_mount_t hmp;
689 hammer_inode_t ip;
690 uid_t xuid;
691 int error;
692 int64_t namekey;
693 u_int32_t dummy;
694
695 hmp = trans->hmp;
696
697 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
698 ++hammer_count_inodes;
699 ++hmp->count_inodes;
700 trans->flags |= HAMMER_TRANSF_NEWINODE;
701
702 if (pfsm) {
703 KKASSERT(pfsm->localization != 0);
704 ip->obj_id = HAMMER_OBJID_ROOT;
705 ip->obj_localization = pfsm->localization;
706 } else {
707 KKASSERT(dip != NULL);
708 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
709 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
710 ip->obj_localization = dip->obj_localization;
711 }
712
713 KKASSERT(ip->obj_id != 0);
714 ip->obj_asof = hmp->asof;
715 ip->hmp = hmp;
716 ip->flush_state = HAMMER_FST_IDLE;
717 ip->flags = HAMMER_INODE_DDIRTY |
718 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
719 ip->cache[0].ip = ip;
720 ip->cache[1].ip = ip;
721 ip->cache[2].ip = ip;
722 ip->cache[3].ip = ip;
723
724 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
725 /* ip->save_trunc_off = 0; (already zero) */
726 RB_INIT(&ip->rec_tree);
727 TAILQ_INIT(&ip->target_list);
728
729 ip->ino_data.atime = trans->time;
730 ip->ino_data.mtime = trans->time;
731 ip->ino_data.size = 0;
732 ip->ino_data.nlinks = 0;
733
734 /*
735 * A nohistory designator on the parent directory is inherited by
736 * the child. We will do this even for pseudo-fs creation... the
737 * sysad can turn it off.
738 */
739 if (dip) {
740 ip->ino_data.uflags = dip->ino_data.uflags &
741 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
742 }
743
744 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
745 ip->ino_leaf.base.localization = ip->obj_localization +
746 HAMMER_LOCALIZE_INODE;
747 ip->ino_leaf.base.obj_id = ip->obj_id;
748 ip->ino_leaf.base.key = 0;
749 ip->ino_leaf.base.create_tid = 0;
750 ip->ino_leaf.base.delete_tid = 0;
751 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
752 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
753
754 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
755 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
756 ip->ino_data.mode = vap->va_mode;
757 ip->ino_data.ctime = trans->time;
758
759 /*
760 * If we are running version 2 or greater directory entries are
761 * inode-localized instead of data-localized.
762 */
763 if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
764 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
765 ip->ino_data.cap_flags |=
766 HAMMER_INODE_CAP_DIR_LOCAL_INO;
767 }
768 }
769
770 /*
771 * Setup the ".." pointer. This only needs to be done for directories
772 * but we do it for all objects as a recovery aid.
773 */
774 if (dip)
775 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
776#if 0
777 /*
778 * The parent_obj_localization field only applies to pseudo-fs roots.
779 * XXX this is no longer applicable, PFSs are no longer directly
780 * tied into the parent's directory structure.
781 */
782 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
783 ip->obj_id == HAMMER_OBJID_ROOT) {
784 ip->ino_data.ext.obj.parent_obj_localization =
785 dip->obj_localization;
786 }
787#endif
788
789 switch(ip->ino_leaf.base.obj_type) {
790 case HAMMER_OBJTYPE_CDEV:
791 case HAMMER_OBJTYPE_BDEV:
792 ip->ino_data.rmajor = vap->va_rmajor;
793 ip->ino_data.rminor = vap->va_rminor;
794 break;
795 default:
796 break;
797 }
798
799 /*
800 * Calculate default uid/gid and overwrite with information from
801 * the vap.
802 */
803 if (dip) {
804 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
805 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
806 xuid, cred, &vap->va_mode);
807 } else {
808 xuid = 0;
809 }
810 ip->ino_data.mode = vap->va_mode;
811
812 if (vap->va_vaflags & VA_UID_UUID_VALID)
813 ip->ino_data.uid = vap->va_uid_uuid;
814 else if (vap->va_uid != (uid_t)VNOVAL)
815 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
816 else
817 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
818
819 if (vap->va_vaflags & VA_GID_UUID_VALID)
820 ip->ino_data.gid = vap->va_gid_uuid;
821 else if (vap->va_gid != (gid_t)VNOVAL)
822 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
823 else if (dip)
824 ip->ino_data.gid = dip->ino_data.gid;
825
826 hammer_ref(&ip->lock);
827
828 if (pfsm) {
829 ip->pfsm = pfsm;
830 hammer_ref(&pfsm->lock);
831 error = 0;
832 } else if (dip->obj_localization == ip->obj_localization) {
833 ip->pfsm = dip->pfsm;
834 hammer_ref(&ip->pfsm->lock);
835 error = 0;
836 } else {
837 ip->pfsm = hammer_load_pseudofs(trans,
838 ip->obj_localization,
839 &error);
840 error = 0; /* ignore ENOENT */
841 }
842
843 if (error) {
844 hammer_free_inode(ip);
845 ip = NULL;
846 } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
847 panic("hammer_create_inode: duplicate obj_id %llx",
848 (long long)ip->obj_id);
849 /* not reached */
850 hammer_free_inode(ip);
851 }
852 *ipp = ip;
853 return(error);
854}
855
856/*
857 * Final cleanup / freeing of an inode structure
858 */
859static void
860hammer_free_inode(hammer_inode_t ip)
861{
862 struct hammer_mount *hmp;
863
864 hmp = ip->hmp;
865 KKASSERT(ip->lock.refs == 1);
866 hammer_uncache_node(&ip->cache[0]);
867 hammer_uncache_node(&ip->cache[1]);
868 hammer_uncache_node(&ip->cache[2]);
869 hammer_uncache_node(&ip->cache[3]);
870 hammer_inode_wakereclaims(ip, 1);
871 if (ip->objid_cache)
872 hammer_clear_objid(ip);
873 --hammer_count_inodes;
874 --hmp->count_inodes;
875 if (ip->pfsm) {
876 hammer_rel_pseudofs(hmp, ip->pfsm);
877 ip->pfsm = NULL;
878 }
879 kfree(ip, hmp->m_inodes);
880 ip = NULL;
881}
882
883/*
884 * Retrieve pseudo-fs data. NULL will never be returned.
885 *
886 * If an error occurs *errorp will be set and a default template is returned,
887 * otherwise *errorp is set to 0. Typically when an error occurs it will
888 * be ENOENT.
889 */
890hammer_pseudofs_inmem_t
891hammer_load_pseudofs(hammer_transaction_t trans,
892 u_int32_t localization, int *errorp)
893{
894 hammer_mount_t hmp = trans->hmp;
895 hammer_inode_t ip;
896 hammer_pseudofs_inmem_t pfsm;
897 struct hammer_cursor cursor;
898 int bytes;
899
900retry:
901 pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
902 if (pfsm) {
903 hammer_ref(&pfsm->lock);
904 *errorp = 0;
905 return(pfsm);
906 }
907
908 /*
909 * PFS records are stored in the root inode (not the PFS root inode,
910 * but the real root). Avoid an infinite recursion if loading
911 * the PFS for the real root.
912 */
913 if (localization) {
914 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
915 HAMMER_MAX_TID,
916 HAMMER_DEF_LOCALIZATION, 0, errorp);
917 } else {
918 ip = NULL;
919 }
920
921 pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
922 pfsm->localization = localization;
923 pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
924 pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
925
926 hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
927 cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
928 HAMMER_LOCALIZE_MISC;
929 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
930 cursor.key_beg.create_tid = 0;
931 cursor.key_beg.delete_tid = 0;
932 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
933 cursor.key_beg.obj_type = 0;
934 cursor.key_beg.key = localization;
935 cursor.asof = HAMMER_MAX_TID;
936 cursor.flags |= HAMMER_CURSOR_ASOF;
937
938 if (ip)
939 *errorp = hammer_ip_lookup(&cursor);
940 else
941 *errorp = hammer_btree_lookup(&cursor);
942 if (*errorp == 0) {
943 *errorp = hammer_ip_resolve_data(&cursor);
944 if (*errorp == 0) {
945 if (cursor.data->pfsd.mirror_flags &
946 HAMMER_PFSD_DELETED) {
947 *errorp = ENOENT;
948 } else {
949 bytes = cursor.leaf->data_len;
950 if (bytes > sizeof(pfsm->pfsd))
951 bytes = sizeof(pfsm->pfsd);
952 bcopy(cursor.data, &pfsm->pfsd, bytes);
953 }
954 }
955 }
956 hammer_done_cursor(&cursor);
957
958 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
959 hammer_ref(&pfsm->lock);
960 if (ip)
961 hammer_rel_inode(ip, 0);
962 if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
963 kfree(pfsm, hmp->m_misc);
964 goto retry;
965 }
966 return(pfsm);
967}
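/*
 * Pairing sketch (illustrative): hammer_load_pseudofs() never returns
 * NULL and always returns a referenced pfsm, which must be dropped
 * with hammer_rel_pseudofs().  *errorp is typically 0 or ENOENT (a
 * default template is returned on error).
 */
static __inline void
example_probe_pfs(hammer_transaction_t trans, u_int32_t localization)
{
	hammer_pseudofs_inmem_t pfsm;
	int error;

	pfsm = hammer_load_pseudofs(trans, localization, &error);
	/* ... consult pfsm->pfsd ... */
	hammer_rel_pseudofs(trans->hmp, pfsm);
}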
968
969/*
970 * Store pseudo-fs data. The backend will automatically delete any prior
971 * on-disk pseudo-fs data but we have to delete in-memory versions.
972 */
973int
974hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
975{
976 struct hammer_cursor cursor;
977 hammer_record_t record;
978 hammer_inode_t ip;
979 int error;
980
981 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
982 HAMMER_DEF_LOCALIZATION, 0, &error);
983retry:
984 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
985 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
986 cursor.key_beg.localization = ip->obj_localization +
987 HAMMER_LOCALIZE_MISC;
988 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
989 cursor.key_beg.create_tid = 0;
990 cursor.key_beg.delete_tid = 0;
991 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
992 cursor.key_beg.obj_type = 0;
993 cursor.key_beg.key = pfsm->localization;
994 cursor.asof = HAMMER_MAX_TID;
995 cursor.flags |= HAMMER_CURSOR_ASOF;
996
997 /*
998 * Replace any in-memory version of the record.
999 */
1000 error = hammer_ip_lookup(&cursor);
1001 if (error == 0 && hammer_cursor_inmem(&cursor)) {
1002 record = cursor.iprec;
1003 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1004 KKASSERT(cursor.deadlk_rec == NULL);
1005 hammer_ref(&record->lock);
1006 cursor.deadlk_rec = record;
1007 error = EDEADLK;
1008 } else {
1009 record->flags |= HAMMER_RECF_DELETED_FE;
1010 error = 0;
1011 }
1012 }
1013
1014 /*
1015 * Allocate replacement general record. The backend flush will
1016 * delete any on-disk version of the record.
1017 */
1018 if (error == 0 || error == ENOENT) {
1019 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1020 record->type = HAMMER_MEM_RECORD_GENERAL;
1021
1022 record->leaf.base.localization = ip->obj_localization +
1023 HAMMER_LOCALIZE_MISC;
1024 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1025 record->leaf.base.key = pfsm->localization;
1026 record->leaf.data_len = sizeof(pfsm->pfsd);
1027 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1028 error = hammer_ip_add_record(trans, record);
1029 }
1030 hammer_done_cursor(&cursor);
1031 if (error == EDEADLK)
1032 goto retry;
1033 hammer_rel_inode(ip, 0);
1034 return(error);
1035}
1036
1037/*
1038 * Create a root directory for a PFS if one does not already exist.
1039 *
1040 * The PFS root stands alone so we must also bump the nlinks count
1041 * to prevent it from being destroyed on release.
1042 */
1043int
1044hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1045 hammer_pseudofs_inmem_t pfsm)
1046{
1047 hammer_inode_t ip;
1048 struct vattr vap;
1049 int error;
1050
1051 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1052 pfsm->localization, 0, &error);
1053 if (ip == NULL) {
1054 vattr_null(&vap);
1055 vap.va_mode = 0755;
1056 vap.va_type = VDIR;
1057 error = hammer_create_inode(trans, &vap, cred,
1058 NULL, NULL, 0,
1059 pfsm, &ip);
1060 if (error == 0) {
1061 ++ip->ino_data.nlinks;
1062 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1063 }
1064 }
1065 if (ip)
1066 hammer_rel_inode(ip, 0);
1067 return(error);
1068}
1069
1070/*
1071 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1072 * if we are unable to disassociate all the inodes.
1073 */
1074static
1075int
1076hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1077{
1078 int res;
1079
1080 hammer_ref(&ip->lock);
1081 if (ip->lock.refs == 2 && ip->vp)
1082 vclean_unlocked(ip->vp);
1083 if (ip->lock.refs == 1 && ip->vp == NULL)
1084 res = 0;
1085 else
1086 res = -1; /* stop, someone is using the inode */
1087 hammer_rel_inode(ip, 0);
1088 return(res);
1089}
1090
1091int
1092hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1093{
1094 int res;
1095 int try;
1096
1097 for (try = res = 0; try < 4; ++try) {
1098 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1099 hammer_inode_pfs_cmp,
1100 hammer_unload_pseudofs_callback,
1101 &localization);
1102 if (res == 0 && try > 1)
1103 break;
1104 hammer_flusher_sync(trans->hmp);
1105 }
1106 if (res != 0)
1107 res = ENOTEMPTY;
1108 return(res);
1109}
1110
1111
1112/*
1113 * Release a reference on a PFS
1114 */
1115void
1116hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1117{
1118 hammer_unref(&pfsm->lock);
1119 if (pfsm->lock.refs == 0) {
1120 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1121 kfree(pfsm, hmp->m_misc);
1122 }
1123}
1124
1125/*
1126 * Called by hammer_sync_inode().
1127 */
1128static int
1129hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1130{
1131 hammer_transaction_t trans = cursor->trans;
1132 hammer_record_t record;
1133 int error;
1134 int redirty;
1135
1136retry:
1137 error = 0;
1138
1139 /*
1140 * If the inode has a presence on-disk then locate it and mark
1141 * it deleted, setting DELONDISK.
1142 *
1143 * The record may or may not be physically deleted, depending on
1144 * the retention policy.
1145 */
1146 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1147 HAMMER_INODE_ONDISK) {
1148 hammer_normalize_cursor(cursor);
1149 cursor->key_beg.localization = ip->obj_localization +
1150 HAMMER_LOCALIZE_INODE;
1151 cursor->key_beg.obj_id = ip->obj_id;
1152 cursor->key_beg.key = 0;
1153 cursor->key_beg.create_tid = 0;
1154 cursor->key_beg.delete_tid = 0;
1155 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1156 cursor->key_beg.obj_type = 0;
1157 cursor->asof = ip->obj_asof;
1158 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1159 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1160 cursor->flags |= HAMMER_CURSOR_BACKEND;
1161
1162 error = hammer_btree_lookup(cursor);
1163 if (hammer_debug_inode)
1164 kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1165
1166 if (error == 0) {
1167 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1168 if (hammer_debug_inode)
1169 kprintf(" error %d\n", error);
1170 if (error == 0) {
1171 ip->flags |= HAMMER_INODE_DELONDISK;
1172 }
1173 if (cursor->node)
1174 hammer_cache_node(&ip->cache[0], cursor->node);
1175 }
1176 if (error == EDEADLK) {
1177 hammer_done_cursor(cursor);
1178 error = hammer_init_cursor(trans, cursor,
1179 &ip->cache[0], ip);
1180 if (hammer_debug_inode)
1181 kprintf("IPDED %p %d\n", ip, error);
1182 if (error == 0)
1183 goto retry;
1184 }
1185 }
1186
1187 /*
1188 * Ok, write out the initial record or a new record (after deleting
1189 * the old one), unless the DELETED flag is set. This routine will
1190 * clear DELONDISK if it writes out a record.
1191 *
1192 * Update our inode statistics if this is the first application of
1193 * the inode on-disk.
1194 */
1195 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1196 /*
1197 * Generate a record and write it to the media. We clean up
1198 * the state before releasing so we do not have to set up
1199 * a flush_group.
1200 */
1201 record = hammer_alloc_mem_record(ip, 0);
1202 record->type = HAMMER_MEM_RECORD_INODE;
1203 record->flush_state = HAMMER_FST_FLUSH;
1204 record->leaf = ip->sync_ino_leaf;
1205 record->leaf.base.create_tid = trans->tid;
1206 record->leaf.data_len = sizeof(ip->sync_ino_data);
1207 record->leaf.create_ts = trans->time32;
1208 record->data = (void *)&ip->sync_ino_data;
1209 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1210
1211 /*
1212 * If this flag is set we cannot sync the new file size
1213 * because we haven't finished related truncations. The
1214 * inode will be flushed in another flush group to finish
1215 * the job.
1216 */
1217 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1218 ip->sync_ino_data.size != ip->ino_data.size) {
1219 redirty = 1;
1220 ip->sync_ino_data.size = ip->ino_data.size;
1221 } else {
1222 redirty = 0;
1223 }
1224
1225 for (;;) {
1226 error = hammer_ip_sync_record_cursor(cursor, record);
1227 if (hammer_debug_inode)
1228 kprintf("GENREC %p rec %08x %d\n",
1229 ip, record->flags, error);
1230 if (error != EDEADLK)
1231 break;
1232 hammer_done_cursor(cursor);
1233 error = hammer_init_cursor(trans, cursor,
1234 &ip->cache[0], ip);
1235 if (hammer_debug_inode)
1236 kprintf("GENREC reinit %d\n", error);
1237 if (error)
1238 break;
1239 }
1240
1241 /*
1242 * Note: The record was never on the inode's record tree
1243 * so just wave our hands importantly and destroy it.
1244 */
1245 record->flags |= HAMMER_RECF_COMMITTED;
1246 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1247 record->flush_state = HAMMER_FST_IDLE;
1248 ++ip->rec_generation;
1249 hammer_rel_mem_record(record);
1250
1251 /*
1252 * Finish up.
1253 */
1254 if (error == 0) {
1255 if (hammer_debug_inode)
1256 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1257 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1258 HAMMER_INODE_ATIME |
1259 HAMMER_INODE_MTIME);
1260 ip->flags &= ~HAMMER_INODE_DELONDISK;
1261 if (redirty)
1262 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1263
1264 /*
1265 * Root volume count of inodes
1266 */
1267 hammer_sync_lock_sh(trans);
1268 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1269 hammer_modify_volume_field(trans,
1270 trans->rootvol,
1271 vol0_stat_inodes);
1272 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1273 hammer_modify_volume_done(trans->rootvol);
1274 ip->flags |= HAMMER_INODE_ONDISK;
1275 if (hammer_debug_inode)
1276 kprintf("NOWONDISK %p\n", ip);
1277 }
1278 hammer_sync_unlock(trans);
1279 }
1280 }
1281
1282 /*
1283 * If the inode has been destroyed, clean out any left-over flags
1284 * that may have been set by the frontend.
1285 */
1286 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
1287 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1288 HAMMER_INODE_ATIME |
1289 HAMMER_INODE_MTIME);
1290 }
1291 return(error);
1292}
1293
1294/*
1295 * Update only the itimes fields.
1296 *
1297 * ATIME can be updated without generating any UNDO. MTIME is updated
1298 * with UNDO so it is guaranteed to be synchronized properly in case of
1299 * a crash.
1300 *
1301 * Neither field is included in the B-Tree leaf element's CRC, which is how
1302 * we can get away with updating ATIME the way we do.
1303 */
1304static int
1305hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1306{
1307 hammer_transaction_t trans = cursor->trans;
1308 int error;
1309
1310retry:
1311 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1312 HAMMER_INODE_ONDISK) {
1313 return(0);
1314 }
1315
1316 hammer_normalize_cursor(cursor);
1317 cursor->key_beg.localization = ip->obj_localization +
1318 HAMMER_LOCALIZE_INODE;
1319 cursor->key_beg.obj_id = ip->obj_id;
1320 cursor->key_beg.key = 0;
1321 cursor->key_beg.create_tid = 0;
1322 cursor->key_beg.delete_tid = 0;
1323 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1324 cursor->key_beg.obj_type = 0;
1325 cursor->asof = ip->obj_asof;
1326 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1327 cursor->flags |= HAMMER_CURSOR_ASOF;
1328 cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1329 cursor->flags |= HAMMER_CURSOR_GET_DATA;
1330 cursor->flags |= HAMMER_CURSOR_BACKEND;
1331
1332 error = hammer_btree_lookup(cursor);
1333 if (error == 0) {
1334 hammer_cache_node(&ip->cache[0], cursor->node);
1335 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1336 /*
1337 * Updating MTIME requires an UNDO. Just cover
1338 * both atime and mtime.
1339 */
1340 hammer_sync_lock_sh(trans);
1341 hammer_modify_buffer(trans, cursor->data_buffer,
1342 HAMMER_ITIMES_BASE(&cursor->data->inode),
1343 HAMMER_ITIMES_BYTES);
1344 cursor->data->inode.atime = ip->sync_ino_data.atime;
1345 cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1346 hammer_modify_buffer_done(cursor->data_buffer);
1347 hammer_sync_unlock(trans);
1348 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1349 /*
1350 * Updating atime only can be done in-place with
1351 * no UNDO.
1352 */
1353 hammer_sync_lock_sh(trans);
1354 hammer_modify_buffer(trans, cursor->data_buffer,
1355 NULL, 0);
1356 cursor->data->inode.atime = ip->sync_ino_data.atime;
1357 hammer_modify_buffer_done(cursor->data_buffer);
1358 hammer_sync_unlock(trans);
1359 }
1360 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1361 }
1362 if (error == EDEADLK) {
1363 hammer_done_cursor(cursor);
1364 error = hammer_init_cursor(trans, cursor,
1365 &ip->cache[0], ip);
1366 if (error == 0)
1367 goto retry;
1368 }
1369 return(error);
1370}
1371
1372/*
1373 * Release a reference on an inode, flush as requested.
1374 *
1375 * On the last reference we queue the inode to the flusher for its final
1376 * disposition.
1377 */
1378void
1379hammer_rel_inode(struct hammer_inode *ip, int flush)
1380{
1381 /*hammer_mount_t hmp = ip->hmp;*/
1382
1383 /*
1384 * Handle disposition when dropping the last ref.
1385 */
1386 for (;;) {
1387 if (ip->lock.refs == 1) {
1388 /*
1389 * Determine whether on-disk action is needed for
1390 * the inode's final disposition.
1391 */
1392 KKASSERT(ip->vp == NULL);
1393 hammer_inode_unloadable_check(ip, 0);
1394 if (ip->flags & HAMMER_INODE_MODMASK) {
1395 hammer_flush_inode(ip, 0);
1396 } else if (ip->lock.refs == 1) {
1397 hammer_unload_inode(ip);
1398 break;
1399 }
1400 } else {
1401 if (flush)
1402 hammer_flush_inode(ip, 0);
1403
1404 /*
1405 * The inode still has multiple refs, try to drop
1406 * one ref.
1407 */
1408 KKASSERT(ip->lock.refs >= 1);
1409 if (ip->lock.refs > 1) {
1410 hammer_unref(&ip->lock);
1411 break;
1412 }
1413 }
1414 }
1415}
1416
1417/*
1418 * Unload and destroy the specified inode. Must be called with one remaining
1419 * reference. The reference is disposed of.
1420 *
1421 * The inode must be completely clean.
1422 */
1423static int
1424hammer_unload_inode(struct hammer_inode *ip)
1425{
1426 hammer_mount_t hmp = ip->hmp;
1427
1428 KASSERT(ip->lock.refs == 1,
1429 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1430 KKASSERT(ip->vp == NULL);
1431 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1432 KKASSERT(ip->cursor_ip_refs == 0);
1433 KKASSERT(ip->lock.lockcount == 0);
1434 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1435
1436 KKASSERT(RB_EMPTY(&ip->rec_tree));
1437 KKASSERT(TAILQ_EMPTY(&ip->target_list));
1438
1439 RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1440
1441 hammer_free_inode(ip);
1442 return(0);
1443}
1444
1445/*
1446 * Called during unmounting if a critical error occurred. The in-memory
1447 * inode and all related structures are destroyed.
1448 *
1449 * If a critical error did not occur the unmount code calls the standard
1450 * release and asserts that the inode is gone.
1451 */
1452int
1453hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1454{
1455 hammer_record_t rec;
1456
1457 /*
1458 * Get rid of the inode's in-memory records, regardless of their
1459 * state, and clear the mod-mask.
1460 */
1461 while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1462 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1463 rec->target_ip = NULL;
1464 if (rec->flush_state == HAMMER_FST_SETUP)
1465 rec->flush_state = HAMMER_FST_IDLE;
1466 }
1467 while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1468 if (rec->flush_state == HAMMER_FST_FLUSH)
1469 --rec->flush_group->refs;
1470 else
1471 hammer_ref(&rec->lock);
1472 KKASSERT(rec->lock.refs == 1);
1473 rec->flush_state = HAMMER_FST_IDLE;
1474 rec->flush_group = NULL;
1475 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1476 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1477 ++ip->rec_generation;
1478 hammer_rel_mem_record(rec);
1479 }
1480 ip->flags &= ~HAMMER_INODE_MODMASK;
1481 ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1482 KKASSERT(ip->vp == NULL);
1483
1484 /*
1485 * Remove the inode from any flush group, force it idle. FLUSH
1486 * and SETUP states have an inode ref.
1487 */
1488 switch(ip->flush_state) {
1489 case HAMMER_FST_FLUSH:
1490 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
1491 --ip->flush_group->refs;
1492 ip->flush_group = NULL;
1493 /* fall through */
1494 case HAMMER_FST_SETUP:
1495 hammer_unref(&ip->lock);
1496 ip->flush_state = HAMMER_FST_IDLE;
1497 /* fall through */
1498 case HAMMER_FST_IDLE:
1499 break;
1500 }
1501
1502 /*
1503 * There shouldn't be any associated vnode. The unload needs at
1504 * least one ref; if we do have a vp, steal its ip ref.
1505 */
1506 if (ip->vp) {
1507 kprintf("hammer_destroy_inode_callback: Unexpected "
1508 "vnode association ip %p vp %p\n", ip, ip->vp);
1509 ip->vp->v_data = NULL;
1510 ip->vp = NULL;
1511 } else {
1512 hammer_ref(&ip->lock);
1513 }
1514 hammer_unload_inode(ip);
1515 return(0);
1516}
1517
1518/*
1519 * Called on mount -u when switching from RW to RO or vice-versa. Adjust
1520 * the read-only flag for cached inodes.
1521 *
1522 * This routine is called from a RB_SCAN().
1523 */
1524int
1525hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1526{
1527 hammer_mount_t hmp = ip->hmp;
1528
1529 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1530 ip->flags |= HAMMER_INODE_RO;
1531 else
1532 ip->flags &= ~HAMMER_INODE_RO;
1533 return(0);
1534}
1535
1536/*
1537 * A transaction has modified an inode, requiring updates as specified by
1538 * the passed flags.
1539 *
1540 * HAMMER_INODE_DDIRTY: Inode data has been updated
1541 * HAMMER_INODE_XDIRTY: Dirty in-memory records
1542 * HAMMER_INODE_BUFS: Dirty buffer cache buffers
1543 * HAMMER_INODE_DELETED: Inode record/data must be deleted
1544 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1545 */
1546void
1547hammer_modify_inode(hammer_inode_t ip, int flags)
1548{
1549 /*
1550 * ronly of 0 or 2 does not trigger the assertion;
1551 * 2 is a special error state.
1552 */
1553 KKASSERT(ip->hmp->ronly != 1 ||
1554 (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1555 HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1556 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1557 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1558 ip->flags |= HAMMER_INODE_RSV_INODES;
1559 ++ip->hmp->rsv_inodes;
1560 }
1561
1562 ip->flags |= flags;
1563}
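/*
 * Typical frontend pattern (illustrative; mirrors the sequence in
 * hammer_mkroot_pseudofs() above): modify the in-memory inode data,
 * then flag the inode so the backend rewrites its inode record.
 */
static __inline void
example_bump_nlinks(hammer_inode_t ip)
{
	++ip->ino_data.nlinks;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
}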
1564
1565/*
1566 * Request that an inode be flushed. This whole mess cannot block and may
1567 * recurse (if not synchronous). Once requested HAMMER will attempt to
1568 * actively flush the inode until the flush can be done.
1569 *
1570 * The inode may already be flushing, or may be in a setup state. We can
1571 * place the inode in a flushing state if it is currently idle and flag it
1572 * to reflush if it is currently flushing.
1573 *
1574 * Upon return if the inode could not be flushed due to a setup
1575 * dependency, then it will be automatically flushed when the dependency
1576 * is satisfied.
1577 */
1578void
1579hammer_flush_inode(hammer_inode_t ip, int flags)
1580{
1581 hammer_mount_t hmp;
1582 hammer_flush_group_t flg;
1583 int good;
1584
1585 /*
1586 * next_flush_group is the first flush group we can place the inode
1587 * in. It may be NULL. If it becomes full we append a new flush
1588 * group and make that the next_flush_group.
1589 */
1590 hmp = ip->hmp;
1591 while ((flg = hmp->next_flush_group) != NULL) {
1592 KKASSERT(flg->running == 0);
1593 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
1594 break;
1595 hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
1596 hammer_flusher_async(ip->hmp, flg);
1597 }
1598 if (flg == NULL) {
1599 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1600 hmp->next_flush_group = flg;
1601 TAILQ_INIT(&flg->flush_list);
1602 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1603 }
1604
1605 /*
1606 * Trivial 'nothing to flush' case. If the inode is in a SETUP
1607 * state we have to put it back into an IDLE state so we can
1608 * drop the extra ref.
1609 *
1610 * If we have a parent dependency we must still fall through
1611 * so we can run it.
1612 */
1613 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1614 if (ip->flush_state == HAMMER_FST_SETUP &&
1615 TAILQ_EMPTY(&ip->target_list)) {
1616 ip->flush_state = HAMMER_FST_IDLE;
1617 hammer_rel_inode(ip, 0);
1618 }
1619 if (ip->flush_state == HAMMER_FST_IDLE)
1620 return;
1621 }
1622
1623 /*
1624 * Our flush action will depend on the current state.
1625 */
1626 switch(ip->flush_state) {
1627 case HAMMER_FST_IDLE:
1628 /*
1629 * We have no dependencies and can flush immediately. Some of
1630 * our children may not be flushable so we have to re-test
1631 * with that additional knowledge.
1632 */
1633 hammer_flush_inode_core(ip, flg, flags);
1634 break;
1635 case HAMMER_FST_SETUP:
1636 /*
1637 * Recurse upwards through dependencies via target_list
1638 * and start their flusher actions going if possible.
1639 *
1640 * 'good' is our connectivity. -1 means we have none and
1641 * can't flush, 0 means there weren't any dependencies, and
1642 * 1 means we have good connectivity.
1643 */
1644 good = hammer_setup_parent_inodes(ip, 0, flg);
1645
1646 if (good >= 0) {
1647 /*
1648 * We can continue if good >= 0. Determine how
1649 * many records under our inode can be flushed (and
1650 * mark them).
1651 */
1652 hammer_flush_inode_core(ip, flg, flags);
1653 } else {
1654 /*
1655 * Parent has no connectivity, tell it to flush
1656 * us as soon as it does.
1657 *
1658 * The REFLUSH flag is also needed to trigger
1659 * dependency wakeups.
1660 */
1661 ip->flags |= HAMMER_INODE_CONN_DOWN |
1662 HAMMER_INODE_REFLUSH;
1663 if (flags & HAMMER_FLUSH_SIGNAL) {
1664 ip->flags |= HAMMER_INODE_RESIGNAL;
1665 hammer_flusher_async(ip->hmp, flg);
1666 }
1667 }
1668 break;
1669 case HAMMER_FST_FLUSH:
1670 /*
1671 * We are already flushing, flag the inode to reflush
1672 * if needed after it completes its current flush.
1673 *
1674 * The REFLUSH flag is also needed to trigger
1675 * dependency wakeups.
1676 */
1677 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1678 ip->flags |= HAMMER_INODE_REFLUSH;
1679 if (flags & HAMMER_FLUSH_SIGNAL) {
1680 ip->flags |= HAMMER_INODE_RESIGNAL;
1681 hammer_flusher_async(ip->hmp, flg);
1682 }
1683 break;
1684 }
1685}
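/*
 * Illustrative request (names hypothetical): flush a dirty inode and
 * pass HAMMER_FLUSH_SIGNAL so the flusher runs soon rather than
 * waiting for the next filesystem sync.
 */
static __inline void
example_flush_soon(hammer_inode_t ip)
{
	if (ip->flags & HAMMER_INODE_MODMASK)
		hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
}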
1686
1687/*
1688 * Scan ip->target_list, which is a list of records owned by PARENT
1689 * inodes which reference our ip.
1690 *
1691 * XXX This is a huge mess of recursive code, but not one bit of it blocks
1692 * so for now do not ref/deref the structures. Note that if we use the
1693 * ref/rel code later, the rel CAN block.
1694 */
1695static int
1696hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1697 hammer_flush_group_t flg)
1698{
1699 hammer_record_t depend;
1700 int good;
1701 int r;
1702
1703 /*
1704 * If we hit our recursion limit and we have parent dependencies,
1705 * we cannot continue. Returning < 0 will cause us to be flagged
1706 * for reflush. Returning -2 cuts off additional dependency checks
1707 * because they are likely to also hit the depth limit.
1708 *
1709 * We cannot return < 0 if there are no dependencies or there might
1710 * not be anything to wakeup (ip).
1711 */
1712 if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1713 kprintf("HAMMER Warning: depth limit reached on "
1714 "setup recursion, inode %p %016llx\n",
1715 ip, (long long)ip->obj_id);
1716 return(-2);
1717 }
1718
1719 /*
1720 * Scan dependencies
1721 */
1722 good = 0;
1723 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1724 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1725 KKASSERT(depend->target_ip == ip);
1726 if (r < 0 && good == 0)
1727 good = -1;
1728 if (r > 0)
1729 good = 1;
1730
1731 /*
1732 * If we failed due to the recursion depth limit then stop
1733 * now.
1734 */
1735 if (r == -2)
1736 break;
1737 }
1738 return(good);
1739}
1740
1741/*
1742 * This helper function takes a record representing the dependency between
1743 * the parent inode and child inode.
1744 *
1745 * record->ip = parent inode
1746 * record->target_ip = child inode
1747 *
1748 * We are asked to recurse upwards and convert the record from SETUP
1749 * to FLUSH if possible.
1750 *
1751 * Return 1 if the record gives us connectivity
1752 *
1753 * Return 0 if the record is not relevant
1754 *
1755 * Return -1 if we can't resolve the dependency and there is no connectivity.
1756 */
1757static int
1758hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1759 hammer_flush_group_t flg)
1760{
1761 hammer_mount_t hmp;
1762 hammer_inode_t pip;
1763 int good;
1764
1765 KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1766 pip = record->ip;
1767 hmp = pip->hmp;
1768
1769 /*
1770 * If the record is already flushing, is it in our flush group?
1771 *
1772 * If it is in our flush group but it is a general record or a
1773 * delete-on-disk, it does not improve our connectivity (return 0),
1774 * and if the target inode is not trying to destroy itself we can't
1775 * allow the operation yet anyway (the second return -1).
1776 */
1777 if (record->flush_state == HAMMER_FST_FLUSH) {
1778 /*
1779 * If not in our flush group ask the parent to reflush
1780 * us as soon as possible.
1781 */
1782 if (record->flush_group != flg) {
1783 pip->flags |= HAMMER_INODE_REFLUSH;
1784 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1785 return(-1);
1786 }
1787
1788 /*
1789 * If in our flush group everything is already set up,
1790 * just return whether the record will improve our
1791 * visibility or not.
1792 */
1793 if (record->type == HAMMER_MEM_RECORD_ADD)
1794 return(1);
1795 return(0);
1796 }
1797
1798 /*
1799 * It must be a setup record. Try to resolve the setup dependencies
1800 * by recursing upwards so we can place ip on the flush list.
1801 *
1802 * Limit ourselves to 20 levels of recursion to avoid blowing out
1803 * the kernel stack. If we hit the recursion limit we can't flush
1804 * until the parent flushes. The parent will flush independently
1805 * on its own and ultimately a deep recursion will be resolved.
1806 */
1807 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1808
1809 good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1810
1811 /*
1812 * If good < 0 the parent has no connectivity and we cannot safely
1813 * flush the directory entry, which also means we can't flush our
1814 * ip. Flag us for downward recursion once the parent's
1815 * connectivity is resolved. Flag the parent for [re]flush or it
1816 * may not check for downward recursions.
1817 */
1818 if (good < 0) {
1819 pip->flags |= HAMMER_INODE_REFLUSH;
1820 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1821 return(good);
1822 }
1823
1824 /*
1825 * We are go, place the parent inode in a flushing state so we can
1826 * place its record in a flushing state. Note that the parent
1827 * may already be flushing. The record must be in the same flush
1828 * group as the parent.
1829 */
1830 if (pip->flush_state != HAMMER_FST_FLUSH)
1831 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1832 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1833 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1834
1835#if 0
1836 if (record->type == HAMMER_MEM_RECORD_DEL &&
1837 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1838 /*
1839 * Regardless of flushing state we cannot sync this path if the
1840 * record represents a delete-on-disk but the target inode
1841 * is not ready to sync its own deletion.
1842 *
1843 * XXX need to count effective nlinks to determine whether
1844 * the flush is ok, otherwise removing a hardlink will
1845 * just leave the DEL record to rot.
1846 */
1847 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1848 return(-1);
1849 } else
1850#endif
1851 if (pip->flush_group == flg) {
1852 /*
1853 * Because we have not calculated nlinks yet we can just
1854 * set records to the flush state if the parent is in
1855 * the same flush group as we are.
1856 */
1857 record->flush_state = HAMMER_FST_FLUSH;
1858 record->flush_group = flg;
1859 ++record->flush_group->refs;
1860 hammer_ref(&record->lock);
1861
1862 /*
1863 * A general directory-add contributes to our visibility.
1864 *
1865 * Otherwise it is probably a directory-delete or
1866 * delete-on-disk record and does not contribute to our
1867 * visibility (but we can still flush it).
1868 */
1869 if (record->type == HAMMER_MEM_RECORD_ADD)
1870 return(1);
1871 return(0);
1872 } else {
1873 /*
1874 * If the parent is not in our flush group we cannot
1875 * flush this record yet, there is no visibility.
1876 * We tell the parent to reflush and mark ourselves
1877 * so the parent knows it should flush us too.
1878 */
1879 pip->flags |= HAMMER_INODE_REFLUSH;
1880 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1881 return(-1);
1882 }
1883}
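/*
 * Illustrative sketch, not part of the build: the depth-limited upward
 * recursion used by the setup path above, reduced to its skeleton.
 * The 20-level cap itself is enforced in hammer_setup_parent_inodes();
 * all names below are hypothetical.
 */
#if 0
#define PARENT_RECURSION_LIMIT	20

typedef struct pnode {
	struct pnode *parent;
} pnode_t;

static int
resolve_upwards(pnode_t *node, int depth)
{
	/*
	 * Past the cap we return -1: "cannot flush yet, the parent
	 * will flush independently and resolve the deep chain".
	 */
	if (depth > PARENT_RECURSION_LIMIT)
		return(-1);
	if (node->parent == NULL)
		return(1);
	return(resolve_upwards(node->parent, depth + 1));
}
#endif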
1884
1885/*
1886 * This is the core routine placing an inode into the FST_FLUSH state.
1887 */
1888static void
1889hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1890{
1891 int go_count;
1892
1893 /*
1894 * Set flush state and prevent the flusher from cycling into
1895 * the next flush group. Do not place the ip on the list yet.
1896 * Inodes not in the idle state get an extra reference.
1897 */
1898 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1899 if (ip->flush_state == HAMMER_FST_IDLE)
1900 hammer_ref(&ip->lock);
1901 ip->flush_state = HAMMER_FST_FLUSH;
1902 ip->flush_group = flg;
1903 ++ip->hmp->flusher.group_lock;
1904 ++ip->hmp->count_iqueued;
1905 ++hammer_count_iqueued;
1906 ++flg->total_count;
1907
1908 /*
1909 * If the flush group reaches the autoflush limit we want to signal
1910 * the flusher. This is particularly important for remove()s.
1911 */
1912 if (flg->total_count == hammer_autoflush)
1913 flags |= HAMMER_FLUSH_SIGNAL;
1914
1915 /*
1916 * We need to be able to vfsync/truncate from the backend.
1917 */
1918 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1919 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1920 ip->flags |= HAMMER_INODE_VHELD;
1921 vref(ip->vp);
1922 }
1923
1924 /*
1925 * Figure out how many in-memory records we can actually flush
1926 * (not including inode meta-data, buffers, etc).
1927 */
1928 KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
1929 if (flags & HAMMER_FLUSH_RECURSION) {
1930 /*
1931 * If this is an upwards recursion we do not want to
1932 * recurse down again!
1933 */
1934 go_count = 1;
1935#if 0
1936 } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1937 /*
1938 * No new records are added if we must complete a flush
1939 * from a previous cycle, but we do have to move the records
1940 * from the previous cycle to the current one.
1941 */
1942#if 0
1943 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1944 hammer_syncgrp_child_callback, NULL);
1945#endif
1946 go_count = 1;
1947#endif
1948 } else {
1949 /*
1950 * Normal flush, scan records and bring them into the flush.
1951 * Directory adds and deletes are usually skipped (they are
1952 * grouped with the related inode rather than with the
1953 * directory).
1954 *
1955 * go_count can be negative, which means the scan aborted
1956 * due to the flush group being over-full and we should
1957 * flush what we have.
1958 */
1959 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1960 hammer_setup_child_callback, NULL);
1961 }
1962
1963 /*
1964 * This is a more involved test that includes go_count. If we
1965 * can't flush, flag the inode and return. If go_count is 0 we
1966 * are unable to flush any records in our rec_tree and
1967 * must ignore the XDIRTY flag.
1968 */
1969 if (go_count == 0) {
1970 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1971 --ip->hmp->count_iqueued;
1972 --hammer_count_iqueued;
1973
1974 --flg->total_count;
1975 ip->flush_state = HAMMER_FST_SETUP;
1976 ip->flush_group = NULL;
1977 if (ip->flags & HAMMER_INODE_VHELD) {
1978 ip->flags &= ~HAMMER_INODE_VHELD;
1979 vrele(ip->vp);
1980 }
1981
1982 /*
1983 * REFLUSH is needed to trigger dependancy wakeups
1984 * when an inode is in SETUP.
1985 */
1986 ip->flags |= HAMMER_INODE_REFLUSH;
1987 if (flags & HAMMER_FLUSH_SIGNAL) {
1988 ip->flags |= HAMMER_INODE_RESIGNAL;
1989 hammer_flusher_async(ip->hmp, flg);
1990 }
1991 if (--ip->hmp->flusher.group_lock == 0)
1992 wakeup(&ip->hmp->flusher.group_lock);
1993 return;
1994 }
1995 }
1996
1997 /*
1998 * Snapshot the state of the inode for the backend flusher.
1999 *
2000 * We continue to retain save_trunc_off even when all truncations
2001 * have been resolved as an optimization to determine if we can
2002 * skip the B-Tree lookup for overwrite deletions.
2003 *
2004 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2005 * and stays in ip->flags. Once set, it stays set until the
2006 * inode is destroyed.
2007 */
2008 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2009 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2010 ip->sync_trunc_off = ip->trunc_off;
2011 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2012 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2013 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2014
2015 /*
2016 * The save_trunc_off used to cache whether the B-Tree
2017 * holds any records past that point is not used until
2018 * after the truncation has succeeded, so we can safely
2019 * set it now.
2020 */
2021 if (ip->save_trunc_off > ip->sync_trunc_off)
2022 ip->save_trunc_off = ip->sync_trunc_off;
2023 }
2024 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2025 ~HAMMER_INODE_TRUNCATED);
2026 ip->sync_ino_leaf = ip->ino_leaf;
2027 ip->sync_ino_data = ip->ino_data;
2028 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2029#ifdef DEBUG_TRUNCATE
2030 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2031 kprintf("truncateS %016llx\n", (long long)ip->sync_trunc_off);
2032#endif
2033
2034 /*
2035 * The flusher list inherits our inode and reference.
2036 */
2037 KKASSERT(flg->running == 0);
2038 TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
2039 if (--ip->hmp->flusher.group_lock == 0)
2040 wakeup(&ip->hmp->flusher.group_lock);
2041
2042 if (flags & HAMMER_FLUSH_SIGNAL) {
2043 hammer_flusher_async(ip->hmp, flg);
2044 }
2045}
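/*
 * Illustrative sketch, not part of the build: the enqueue/rollback
 * symmetry relied upon above.  Every counter bumped when an inode
 * enters FST_FLUSH must be un-bumped on the go_count == 0 abort path,
 * or the flush group's accounting drifts.  Hypothetical names.
 */
#if 0
typedef struct counters {
	int iqueued;
	int total_count;
} counters_t;

static void
enqueue(counters_t *c)
{
	++c->iqueued;
	++c->total_count;
}

static void
rollback(counters_t *c)
{
	/* the exact mirror of enqueue(), nothing more */
	--c->iqueued;
	--c->total_count;
}
#endif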
2046
2047/*
2048 * Callback for scan of ip->rec_tree. Try to include each record in our
2049 * flush. ip->flush_group has been set but the inode has not yet been
2050 * moved into a flushing state.
2051 *
2052 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2053 * both inodes.
2054 *
2055 * We return 1 for any record placed or found in FST_FLUSH, which prevents
2056 * the caller from shortcutting the flush.
2057 */
2058static int
2059hammer_setup_child_callback(hammer_record_t rec, void *data)
2060{
2061 hammer_flush_group_t flg;
2062 hammer_inode_t target_ip;
2063 hammer_inode_t ip;
2064 int r;
2065
2066 /*
2067 * Records deleted or committed by the backend are ignored.
2068 * Note that the flush detects deleted frontend records at
2069 * multiple points to deal with races. This is just the first
2070 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot
2071 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2072 * messes up link-count calculations.
2073 *
2074 * NOTE: Don't get confused between record deletion and, say,
2075 * directory entry deletion. The deletion of a directory entry
2076 * which is on-media has nothing to do with the record deletion
2077 * flags.
2078 */
2079 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2080 HAMMER_RECF_COMMITTED)) {
2081 if (rec->flush_state == HAMMER_FST_FLUSH) {
2082 KKASSERT(rec->flush_group == rec->ip->flush_group);
2083 r = 1;
2084 } else {
2085 r = 0;
2086 }
2087 return(r);
2088 }
2089
2090 /*
2091 * If the record is in an idle state it has no dependencies and
2092 * can be flushed.
2093 */
2094 ip = rec->ip;
2095 flg = ip->flush_group;
2096 r = 0;
2097
2098 switch(rec->flush_state) {
2099 case HAMMER_FST_IDLE:
2100 /*
2101 * The record has no setup dependency, we can flush it.
2102 */
2103 KKASSERT(rec->target_ip == NULL);
2104 rec->flush_state = HAMMER_FST_FLUSH;
2105 rec->flush_group = flg;
2106 ++flg->refs;
2107 hammer_ref(&rec->lock);
2108 r = 1;
2109 break;
2110 case HAMMER_FST_SETUP:
2111 /*
2112 * The record has a setup dependency. These are typically
2113 * directory entry adds and deletes. Such entries will be
2114 * flushed when their inodes are flushed so we do not
2115 * usually have to add them to the flush here. However,
2116 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2117 * it is asking us to flush this record (and it).
2118 */
2119 target_ip = rec->target_ip;
2120 KKASSERT(target_ip != NULL);
2121 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2122
2123 /*
2124 * If the target IP is already flushing in our group
2125 * we could associate the record, but target_ip has
2126 * already synced ino_data to sync_ino_data and we
2127 * would also have to adjust nlinks. Plus there are
2128 * ordering issues for adds and deletes.
2129 *
2130 * Reflush downward if this is an ADD, and upward if
2131 * this is a DEL.
2132 */
2133 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2134 if (rec->type == HAMMER_MEM_RECORD_ADD)
2135 ip->flags |= HAMMER_INODE_REFLUSH;
2136 else
2137 target_ip->flags |= HAMMER_INODE_REFLUSH;
2138 break;
2139 }
2140
2141 /*
2142 * Target IP is not yet flushing. This can get complex
2143 * because we have to be careful about the recursion.
2144 *
2145 * Directories create an issue for us in that if a flush
2146 * of a directory is requested the expectation is to flush
2147 * any pending directory entries, but this will cause the
2148 * related inodes to recursively flush as well. We can't
2149 * really defer the operation, so we just pull in as many as
2150 * we can and stop the scan if the flush group over-fills.
2151 */
2152#if 0
2153 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2154 (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2155 /*
2156 * We aren't reclaiming and the target ip was not
2157 * previously prevented from flushing due to this
2158 * record dependency. Do not flush this record.
2159 */
2160 /*r = 0;*/
2161 } else
2162#endif
2163 if (flg->total_count + flg->refs >
2164 ip->hmp->undo_rec_limit) {
2165 /*
2166 * Our flush group is over-full and we risk blowing
2167 * out the UNDO FIFO. Stop the scan, flush what we
2168 * have, then reflush the directory.
2169 *
2170 * The directory may be forced through multiple
2171 * flush groups before it can be completely
2172 * flushed.
2173 */
2174 ip->flags |= HAMMER_INODE_RESIGNAL |
2175 HAMMER_INODE_REFLUSH;
2176 r = -1;
2177 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2178 /*
2179 * If the target IP is not flushing we can force
2180 * it to flush, even if it is unable to write out
2181 * any of its own records we have at least one in
2182 * hand that we CAN deal with.
2183 */
2184 rec->flush_state = HAMMER_FST_FLUSH;
2185 rec->flush_group = flg;
2186 ++flg->refs;
2187 hammer_ref(&rec->lock);
2188 hammer_flush_inode_core(target_ip, flg,
2189 HAMMER_FLUSH_RECURSION);
2190 r = 1;
2191 } else {
2192 /*
2193 * General or delete-on-disk record.
2194 *
2195 * XXX this needs help. If a delete-on-disk we could
2196 * disconnect the target. If the target has its own
2197 * dependencies they really need to be flushed.
2198 *
2199 * XXX
2200 */
2201 rec->flush_state = HAMMER_FST_FLUSH;
2202 rec->flush_group = flg;
2203 ++flg->refs;
2204 hammer_ref(&rec->lock);
2205 hammer_flush_inode_core(target_ip, flg,
2206 HAMMER_FLUSH_RECURSION);
2207 r = 1;
2208 }
2209 break;
2210 case HAMMER_FST_FLUSH:
2211 /*
2212 * The flush_group should already match.
2213 */
2214 KKASSERT(rec->flush_group == flg);
2215 r = 1;
2216 break;
2217 }
2218 return(r);
2219}
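/*
 * Illustrative sketch, not part of the build: the return-value
 * protocol the callback above relies on.  Assuming RB_SCAN() sums the
 * non-negative callback returns and aborts on a negative return,
 * go_count ends up as the number of records pulled into the flush, or
 * negative if the scan stopped early (e.g. because the flush group
 * over-filled).  Hypothetical model code.
 */
#if 0
static int
scan_model(int (*callback)(int), int *recs, int nrecs)
{
	int total = 0;
	int r;
	int i;

	for (i = 0; i < nrecs; ++i) {
		r = callback(recs[i]);
		if (r < 0)
			return(r);	/* abort the scan */
		total += r;		/* 0 = skipped, 1 = included */
	}
	return(total);
}
#endif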
2220
2221#if 0
2222/*
2223 * This version just moves records already in a flush state to the new
2224 * flush group and that is it.
2225 */
2226static int
2227hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2228{
2229 hammer_inode_t ip = rec->ip;
2230
2231 switch(rec->flush_state) {
2232 case HAMMER_FST_FLUSH:
2233 KKASSERT(rec->flush_group == ip->flush_group);
2234 break;
2235 default:
2236 break;
2237 }
2238 return(0);
2239}
2240#endif
2241
2242/*
2243 * Wait for a previously queued flush to complete.
2244 *
2245 * If a critical error occurred we don't try to wait.
2246 */
2247void
2248hammer_wait_inode(hammer_inode_t ip)
2249{
2250 hammer_flush_group_t flg;
2251
2252 flg = NULL;
2253 if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2254 while (ip->flush_state != HAMMER_FST_IDLE &&
2255 (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2256 if (ip->flush_state == HAMMER_FST_SETUP)
2257 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2258 if (ip->flush_state != HAMMER_FST_IDLE) {
2259 ip->flags |= HAMMER_INODE_FLUSHW;
2260 tsleep(&ip->flags, 0, "hmrwin", 0);
2261 }
2262 }
2263 }
2264}
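/*
 * Illustrative sketch, not part of the build: the FLUSHW handshake
 * between hammer_wait_inode() above and hammer_flush_inode_done()
 * below.  The waiter flags itself and sleeps on the flags field; the
 * completion path clears the flag and wakes the same address.  The
 * enclosing while loop re-tests the state, which guards against early
 * or spurious wakeups.
 */
#if 0
/* waiter */
ip->flags |= HAMMER_INODE_FLUSHW;
tsleep(&ip->flags, 0, "hmrwin", 0);

/* waker */
if (ip->flags & HAMMER_INODE_FLUSHW) {
	ip->flags &= ~HAMMER_INODE_FLUSHW;
	wakeup(&ip->flags);
}
#endif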
2265
2266/*
2267 * Called by the backend code when a flush has been completed.
2268 * The inode has already been removed from the flush list.
2269 *
2270 * A pipelined flush can occur, in which case we must re-enter the
2271 * inode on the list and re-copy its fields.
2272 */
2273void
2274hammer_flush_inode_done(hammer_inode_t ip, int error)
2275{
2276 hammer_mount_t hmp;
2277 int dorel;
2278
2279 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2280
2281 hmp = ip->hmp;
2282
2283 /*
2284 * Auto-reflush if the backend could not completely flush
2285 * the inode. This fixes a case where a deferred buffer flush
2286 * could cause fsync to return early.
2287 */
2288 if (ip->sync_flags & HAMMER_INODE_MODMASK)
2289 ip->flags |= HAMMER_INODE_REFLUSH;
2290
2291 /*
2292 * Merge left-over flags back into the frontend and fix the state.
2293 * Incomplete truncations are retained by the backend.
2294 */
2295 ip->error = error;
2296 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2297 ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2298
2299 /*
2300 * The backend may have adjusted nlinks, so if the adjusted nlinks
2301 * does not match the frontend, set the frontend's DDIRTY flag again.
2302 */
2303 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2304 ip->flags |= HAMMER_INODE_DDIRTY;
2305
2306 /*
2307 * Fix up the dirty buffer status.
2308 */
2309 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2310 ip->flags |= HAMMER_INODE_BUFS;
2311 }
2312
2313 /*
2314 * Re-set the XDIRTY flag if some of the inode's in-memory records
2315 * could not be flushed.
2316 */
2317 KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2318 (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2319 (!RB_EMPTY(&ip->rec_tree) &&
2320 (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2321
2322 /*
2323 * Do not lose track of inodes which no longer have vnode
2324 * associations, otherwise they may never get flushed again.
2325 *
2326 * The reflush flag can be set superfluously, causing extra pain
2327 * for no reason. If the inode is no longer modified it no longer
2328 * needs to be flushed.
2329 */
2330 if (ip->flags & HAMMER_INODE_MODMASK) {
2331 if (ip->vp == NULL)
2332 ip->flags |= HAMMER_INODE_REFLUSH;
2333 } else {
2334 ip->flags &= ~HAMMER_INODE_REFLUSH;
2335 }
2336
2337 /*
2338 * Adjust the flush state.
2339 */
2340 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2341 /*
2342 * We were unable to flush out all our records, leave the
2343 * inode in a flush state and in the current flush group.
2344 * The flush group will be re-run.
2345 *
2346 * This occurs if the UNDO block gets too full or there is
2347 * too much dirty meta-data and allows the flusher to
2348 * finalize the UNDO block and then re-flush.
2349 */
2350 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2351 dorel = 0;
2352 } else {
2353 /*
2354 * Remove from the flush_group
2355 */
2356 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
2357 ip->flush_group = NULL;
2358
2359 /*
2360 * Clean up the vnode ref and tracking counts.
2361 */
2362 if (ip->flags & HAMMER_INODE_VHELD) {
2363 ip->flags &= ~HAMMER_INODE_VHELD;
2364 vrele(ip->vp);
2365 }
2366 --hmp->count_iqueued;
2367 --hammer_count_iqueued;
2368
2369 /*
2370 * And adjust the state.
2371 */
2372 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2373 ip->flush_state = HAMMER_FST_IDLE;
2374 dorel = 1;
2375 } else {
2376 ip->flush_state = HAMMER_FST_SETUP;
2377 dorel = 0;
2378 }
2379
2380 /*
2381 * If the frontend is waiting for a flush to complete,
2382 * wake it up.
2383 */
2384 if (ip->flags & HAMMER_INODE_FLUSHW) {
2385 ip->flags &= ~HAMMER_INODE_FLUSHW;
2386 wakeup(&ip->flags);
2387 }
2388
2389 /*
2390 * If the frontend made more changes and requested another
2391 * flush, then try to get it running.
2392 *
2393 * Reflushes are aborted when the inode is errored out.
2394 */
2395 if (ip->flags & HAMMER_INODE_REFLUSH) {
2396 ip->flags &= ~HAMMER_INODE_REFLUSH;
2397 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2398 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2399 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2400 } else {
2401 hammer_flush_inode(ip, 0);
2402 }
2403 }
2404 }
2405
2406 /*
2407 * If we have no parent dependencies we can clear CONN_DOWN
2408 */
2409 if (TAILQ_EMPTY(&ip->target_list))
2410 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2411
2412 /*
2413 * If the inode is now clean drop the space reservation.
2414 */
2415 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2416 (ip->flags & HAMMER_INODE_RSV_INODES)) {
2417 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2418 --hmp->rsv_inodes;
2419 }
2420
2421 if (dorel)
2422 hammer_rel_inode(ip, 0);
2423}
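/*
 * Illustrative sketch, not part of the build: the terminal state
 * decision above as a pure function.  An inode with no dependent
 * directory entries and no in-memory records can return to IDLE (the
 * caller then drops the flush reference); anything else parks in
 * SETUP awaiting the next flush.  Hypothetical helper.
 */
#if 0
static int
next_flush_state(int target_list_empty, int rec_tree_empty)
{
	if (target_list_empty && rec_tree_empty)
		return(HAMMER_FST_IDLE);	/* dorel = 1 */
	return(HAMMER_FST_SETUP);		/* dorel = 0 */
}
#endif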
2424
2425/*
2426 * Called from hammer_sync_inode() to synchronize in-memory records
2427 * to the media.
2428 */
2429static int
2430hammer_sync_record_callback(hammer_record_t record, void *data)
2431{
2432 hammer_cursor_t cursor = data;
2433 hammer_transaction_t trans = cursor->trans;
2434 hammer_mount_t hmp = trans->hmp;
2435 int error;
2436
2437 /*
2438 * Skip records that do not belong to the current flush.
2439 */
2440 ++hammer_stats_record_iterations;
2441 if (record->flush_state != HAMMER_FST_FLUSH)
2442 return(0);
2443
2444#if 1
2445 if (record->flush_group != record->ip->flush_group) {
2446 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2447 Debugger("blah2");
2448 return(0);
2449 }
2450#endif
2451 KKASSERT(record->flush_group == record->ip->flush_group);
2452
2453 /*
2454 * Interlock the record using the BE flag. Once BE is set the
2455 * frontend cannot change the state of FE.
2456 *
2457 * NOTE: If FE is set prior to us setting BE we still sync the
2458 * record out, but the flush completion code converts it to
2459 * a delete-on-disk record instead of destroying it.
2460 */
2461 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2462 record->flags |= HAMMER_RECF_INTERLOCK_BE;
2463
2464 /*
2465 * The backend has already disposed of the record.
2466 */
2467 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2468 error = 0;
2469 goto done;
2470 }
2471
2472 /*
2473 * If the whole inode is being deleted, all on-disk records will
2474 * be deleted very soon and we can't sync any new records to disk
2475 * because they will be deleted in the same transaction they were
2476 * created in (delete_tid == create_tid), which will assert.
2477 *
2478 * XXX There may be a case with RECORD_ADD with DELETED_FE set
2479 * that we currently panic on.
2480 */
2481 if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2482 switch(record->type) {
2483 case HAMMER_MEM_RECORD_DATA:
2484 /*
2485 * We don't have to do anything, if the record was
2486 * committed the space will have been accounted for
2487 * in the blockmap.
2488 */
2489 /* fall through */
2490 case HAMMER_MEM_RECORD_GENERAL:
2491 /*
2492 * Set deleted-by-backend flag. Do not set the
2493 * backend committed flag, because we are throwing
2494 * the record away.
2495 */
2496 record->flags |= HAMMER_RECF_DELETED_BE;
2497 ++record->ip->rec_generation;
2498 error = 0;
2499 goto done;
2500 case HAMMER_MEM_RECORD_ADD:
2501 panic("hammer_sync_record_callback: illegal add "
2502 "during inode deletion record %p", record);
2503 break; /* NOT REACHED */
2504 case HAMMER_MEM_RECORD_INODE:
2505 panic("hammer_sync_record_callback: attempt to "
2506 "sync inode record %p?", record);
2507 break; /* NOT REACHED */
2508 case HAMMER_MEM_RECORD_DEL:
2509 /*
2510 * Follow through and issue the on-disk deletion
2511 */
2512 break;
2513 }
2514 }
2515
2516 /*
2517 * If DELETED_FE is set, special handling is needed for directory
2518 * entries. Dependent pieces related to the directory entry may
2519 * have already been synced to disk. If this occurs we have to
2520 * sync the directory entry and then change the in-memory record
2521 * from an ADD to a DELETE to cover the fact that it's been
2522 * deleted by the frontend.
2523 *
2524 * A directory delete covering record (MEM_RECORD_DEL) can never
2525 * be deleted by the frontend.
2526 *
2527 * Any other record type (aka DATA) can be deleted by the frontend.
2528 * XXX At the moment the flusher must skip it because there may
2529 * be another data record in the flush group for the same block,
2530 * meaning that some frontend data changes can leak into the backend's
2531 * synchronization point.
2532 */
2533 if (record->flags & HAMMER_RECF_DELETED_FE) {
2534 if (record->type == HAMMER_MEM_RECORD_ADD) {
2535 /*
2536 * Convert a front-end deleted directory-add to
2537 * a directory-delete entry later.
2538 */
2539 record->flags |= HAMMER_RECF_CONVERT_DELETE;
2540 } else {
2541 /*
2542 * Dispose of the record (race case). Mark as
2543 * deleted by backend (and not committed).
2544 */
2545 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2546 record->flags |= HAMMER_RECF_DELETED_BE;
2547 ++record->ip->rec_generation;
2548 error = 0;
2549 goto done;
2550 }
2551 }
2552
2553 /*
2554 * Assign the create_tid for new records. Deletions already
2555 * have the record's entire key properly set up.
2556 */
2557 if (record->type != HAMMER_MEM_RECORD_DEL) {
2558 record->leaf.base.create_tid = trans->tid;
2559 record->leaf.create_ts = trans->time32;
2560 }
2561 for (;;) {
2562 error = hammer_ip_sync_record_cursor(cursor, record);
2563 if (error != EDEADLK)
2564 break;
2565 hammer_done_cursor(cursor);
2566 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2567 record->ip);
2568 if (error)
2569 break;
2570 }
2571 record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2572
2573 if (error)
2574 error = -error;
2575done:
2576 hammer_flush_record_done(record, error);
2577
2578 /*
2579 * Do partial finalization if we have built up too many dirty
2580 * buffers. Otherwise a buffer cache deadlock can occur when
2581 * doing things like creating tens of thousands of tiny files.
2582 *
2583 * We must release our cursor lock to avoid a 3-way deadlock
2584 * due to the exclusive sync lock the finalizer must get.
2585 */
2586 if (hammer_flusher_meta_limit(hmp)) {
2587 hammer_unlock_cursor(cursor);
2588 hammer_flusher_finalize(trans, 0);
2589 hammer_lock_cursor(cursor);
2590 }
2591
2592 return(error);
2593}
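/*
 * Illustrative sketch, not part of the build: the EDEADLK retry
 * pattern used above.  On deadlock the cursor is torn down and
 * rebuilt from the inode's B-Tree cache hint, then the operation is
 * retried from scratch; any other result ends the loop.
 * hypothetical_op() stands in for hammer_ip_sync_record_cursor().
 */
#if 0
for (;;) {
	error = hypothetical_op(cursor, record);
	if (error != EDEADLK)
		break;
	hammer_done_cursor(cursor);
	error = hammer_init_cursor(trans, cursor,
				   &record->ip->cache[0], record->ip);
	if (error)
		break;
}
#endif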
2594
2595/*
2596 * Backend function called by the flusher to sync an inode to media.
2597 */
2598int
2599hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2600{
2601 struct hammer_cursor cursor;
2602 hammer_node_t tmp_node;
2603 hammer_record_t depend;
2604 hammer_record_t next;
2605 int error, tmp_error;
2606 u_int64_t nlinks;
2607
2608 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2609 return(0);
2610
2611 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2612 if (error)
2613 goto done;
2614
2615 /*
2616 * Any directory records referencing this inode which are not in
2617 * our current flush group must adjust our nlink count for the
2618 * purposes of synchronization to disk.
2619 *
2620 * Records which are in our flush group can be unlinked from our
2621 * inode now, potentially allowing the inode to be physically
2622 * deleted.
2623 *
2624 * This cannot block.
2625 */
2626 nlinks = ip->ino_data.nlinks;
2627 next = TAILQ_FIRST(&ip->target_list);
2628 while ((depend = next) != NULL) {
2629 next = TAILQ_NEXT(depend, target_entry);
2630 if (depend->flush_state == HAMMER_FST_FLUSH &&
2631 depend->flush_group == ip->flush_group) {
2632 /*
2633 * If this is an ADD that was deleted by the frontend
2634 * the frontend nlinks count will have already been
2635 * decremented, but the backend is going to sync its
2636 * directory entry and must account for it. The
2637 * record will be converted to a delete-on-disk when
2638 * it gets synced.
2639 *
2640 * If the ADD was not deleted by the frontend we
2641 * can remove the dependency from our target_list.
2642 */
2643 if (depend->flags & HAMMER_RECF_DELETED_FE) {
2644 ++nlinks;
2645 } else {
2646 TAILQ_REMOVE(&ip->target_list, depend,
2647 target_entry);
2648 depend->target_ip = NULL;
2649 }
2650 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2651 /*
2652 * Not part of our flush group and not deleted by
2653 * the front-end, adjust the link count synced to
2654 * the media (undo what the frontend did when it
2655 * queued the record).
2656 */
2657 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2658 switch(depend->type) {
2659 case HAMMER_MEM_RECORD_ADD:
2660 --nlinks;
2661 break;
2662 case HAMMER_MEM_RECORD_DEL:
2663 ++nlinks;
2664 break;
2665 default:
2666 break;
2667 }
2668 }
2669 }
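#if 0
	/*
	 * Worked example for the loop above, hypothetical numbers:
	 * the frontend count is 3 and one directory ADD for this
	 * inode sits in a later flush group.  The media must not see
	 * that link yet, so 2 is synced; a pending DEL in a later
	 * group is the mirror case and would sync 4.
	 */
	nlinks = 3;
	--nlinks;		/* HAMMER_MEM_RECORD_ADD not in group */
	KKASSERT(nlinks == 2);	/* value that goes to the media */
#endif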
2670
2671 /*
2672 * Set dirty if we had to modify the link count.
2673 */
2674 if (ip->sync_ino_data.nlinks != nlinks) {
2675 KKASSERT((int64_t)nlinks >= 0);
2676 ip->sync_ino_data.nlinks = nlinks;
2677 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2678 }
2679
2680 /*
2681 * If there is a truncation queued, destroy any data past the (aligned)
2682 * truncation point. Userland will have dealt with the buffer
2683 * containing the truncation point for us.
2684 *
2685 * We don't flush pending frontend data buffers until after we've
2686 * dealt with the truncation.
2687 */
2688 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2689 /*
2690 * Interlock trunc_off. The VOP front-end may continue to
2691 * make adjustments to it while we are blocked.
2692 */
2693 off_t trunc_off;
2694 off_t aligned_trunc_off;
2695 int blkmask;
2696
2697 trunc_off = ip->sync_trunc_off;
2698 blkmask = hammer_blocksize(trunc_off) - 1;
2699 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
2700
2701 /*
2702 * Delete any whole blocks on-media. The front-end has
2703 * already cleaned out any partial block and made it
2704 * pending. The front-end may have updated trunc_off
2705 * while we were blocked so we only use sync_trunc_off.
2706 *
2707 * This operation can blow out the buffer cache, EWOULDBLOCK
2708 * means we were unable to complete the deletion. The
2709 * deletion will update sync_trunc_off in that case.
2710 */
2711 error = hammer_ip_delete_range(&cursor, ip,
2712 aligned_trunc_off,
2713 0x7FFFFFFFFFFFFFFFLL, 2);
2714 if (error == EWOULDBLOCK) {
2715 ip->flags |= HAMMER_INODE_WOULDBLOCK;
2716 error = 0;
2717 goto defer_buffer_flush;
2718 }
2719
2720 if (error)
2721 goto done;
2722
2723 /*
2724 * Clear the truncation flag on the backend after we have
2725 * completed the deletions. Backend data is now good again
2726 * (including new records we are about to sync, below).
2727 *
2728 * Leave sync_trunc_off intact. As we write additional
2729 * records the backend will update sync_trunc_off. This
2730 * tells the backend whether it can skip the overwrite
2731 * test. This should work properly even when the backend
2732 * writes full blocks where the truncation point straddles
2733 * the block because the comparison is against the base
2734 * offset of the record.
2735 */
2736 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2737 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2738 } else {
2739 error = 0;
2740 }
2741
2742 /*
2743 * Now sync related records. These will typically be directory
2744 * entries, records tracking direct-writes, or delete-on-disk records.
2745 */
2746 if (error == 0) {
2747 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2748 hammer_sync_record_callback, &cursor);
2749 if (tmp_error < 0)
2750 tmp_error = -tmp_error;
2751 if (tmp_error)
2752 error = tmp_error;
2753 }
2754 hammer_cache_node(&ip->cache[1], cursor.node);
2755
2756 /*
2757 * Re-seek for inode update, assuming our cache hasn't been ripped
2758 * out from under us.
2759 */
2760 if (error == 0) {
2761 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2762 if (tmp_node) {
2763 hammer_cursor_downgrade(&cursor);
2764 hammer_lock_sh(&tmp_node->lock);
2765 if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2766 hammer_cursor_seek(&cursor, tmp_node, 0);
2767 hammer_unlock(&tmp_node->lock);
2768 hammer_rel_node(tmp_node);
2769 }
2770 error = 0;
2771 }
2772
2773 /*
2774 * If we are deleting the inode the frontend had better not have
2775 * any active references on elements making up the inode.
2776 *
2777 * The call to hammer_ip_delete_clean() cleans up auxiliary records
2778 * but not DB or DATA records. Those must have already been deleted
2779 * by the normal truncation mechanic.
2780 */
2781 if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2782 RB_EMPTY(&ip->rec_tree) &&
2783 (ip->sync_flags & HAMMER_INODE_DELETING) &&
2784 (ip->flags & HAMMER_INODE_DELETED) == 0) {
2785 int count1 = 0;
2786
2787 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2788 if (error == 0) {
2789 ip->flags |= HAMMER_INODE_DELETED;
2790 ip->sync_flags &= ~HAMMER_INODE_DELETING;
2791 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2792 KKASSERT(RB_EMPTY(&ip->rec_tree));
2793
2794 /*
2795 * Set delete_tid in both the frontend and backend
2796 * copy of the inode record. The DELETED flag handles
2797 * this, do not set DDIRTY.
2798 */
2799 ip->ino_leaf.base.delete_tid = trans->tid;
2800 ip->sync_ino_leaf.base.delete_tid = trans->tid;
2801 ip->ino_leaf.delete_ts = trans->time32;
2802 ip->sync_ino_leaf.delete_ts = trans->time32;
2803
2805 /*
2806 * Adjust the inode count in the volume header
2807 */
2808 hammer_sync_lock_sh(trans);
2809 if (ip->flags & HAMMER_INODE_ONDISK) {
2810 hammer_modify_volume_field(trans,
2811 trans->rootvol,
2812 vol0_stat_inodes);
2813 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2814 hammer_modify_volume_done(trans->rootvol);
2815 }
2816 hammer_sync_unlock(trans);
2817 }
2818 }
2819
2820 if (error)
2821 goto done;
2822 ip->sync_flags &= ~HAMMER_INODE_BUFS;
2823
2824defer_buffer_flush:
2825 /*
2826 * Now update the inode's on-disk inode-data and/or on-disk record.
2827 * DELETED and ONDISK are managed only in ip->flags.
2828 *
2829 * In the case of a deferred buffer flush we still update the on-disk
2830 * inode to satisfy visibility requirements if there happen to be
2831 * directory dependencies.
2832 */
2833 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2834 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2835 /*
2836 * If deleted and on-disk, don't set any additional flags.
2837 * the delete flag takes care of things.
2838 *
2839 * Clear flags which may have been set by the frontend.
2840 */
2841 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2842 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2843 HAMMER_INODE_DELETING);
2844 break;
2845 case HAMMER_INODE_DELETED:
2846 /*
2847 * Take care of the case where a deleted inode was never
2848 * flushed to the disk in the first place.
2849 *
2850 * Clear flags which may have been set by the frontend.
2851 */
2852 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2853 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2854 HAMMER_INODE_DELETING);
2855 while (RB_ROOT(&ip->rec_tree)) {
2856 hammer_record_t record = RB_ROOT(&ip->rec_tree);
2857 hammer_ref(&record->lock);
2858 KKASSERT(record->lock.refs == 1);
2859 record->flags |= HAMMER_RECF_DELETED_BE;
2860 ++record->ip->rec_generation;
2861 hammer_rel_mem_record(record);
2862 }
2863 break;
2864 case HAMMER_INODE_ONDISK:
2865 /*
2866 * If already on-disk, do not set any additional flags.
2867 */
2868 break;
2869 default:
2870 /*
2871 * If not on-disk and not deleted, set DDIRTY to force
2872 * an initial record to be written.
2873 *
2874 * Also set the create_tid in both the frontend and backend
2875 * copy of the inode record.
2876 */
2877 ip->ino_leaf.base.create_tid = trans->tid;
2878 ip->ino_leaf.create_ts = trans->time32;
2879 ip->sync_ino_leaf.base.create_tid = trans->tid;
2880 ip->sync_ino_leaf.create_ts = trans->time32;
2881 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2882 break;
2883 }
2884
2885 /*
2886 * If DDIRTY is set, write out a new record. If the inode
2887 * is already on-disk the old record is marked as deleted.
2888 *
2889 * If DELETED is set hammer_update_inode() will delete the existing
2890 * record without writing out a new one.
2891 *
2892 * If *ONLY* the ITIMES flag is set we can update the record in-place.
2893 */
2894 if (ip->flags & HAMMER_INODE_DELETED) {
2895 error = hammer_update_inode(&cursor, ip);
2896 } else
2897 if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
2898 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2899 error = hammer_update_itimes(&cursor, ip);
2900 } else
2901 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2902 error = hammer_update_inode(&cursor, ip);
2903 }
2904done:
2905 if (error) {
2906 hammer_critical_error(ip->hmp, ip, error,
2907 "while syncing inode");
2908 }
2909 hammer_done_cursor(&cursor);
2910 return(error);
2911}
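/*
 * Illustrative sketch, not part of the build: the truncation
 * alignment arithmetic used in hammer_sync_inode() above, with
 * concrete (hypothetical) numbers.  For a 16KB block, blkmask is
 * 0x3fff and a trunc_off of 0x5000 rounds up to 0x8000: the partial
 * block at [0x4000,0x8000) was already handled by the frontend, and
 * only whole blocks from 0x8000 on are deleted on-media.
 */
#if 0
static off_t
aligned_trunc(off_t trunc_off, int blksize)
{
	int64_t blkmask = blksize - 1;

	/* round up to the next block boundary */
	return((trunc_off + blkmask) & ~blkmask);
}
/* aligned_trunc(0x5000, 16384) == 0x8000 */
#endif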
2912
2913/*
2914 * This routine is called when the OS is no longer actively referencing
2915 * the inode (but might still be keeping it cached), or when releasing
2916 * the last reference to an inode.
2917 *
2918 * At this point if the inode's nlinks count is zero we want to destroy
2919 * it, which may mean destroying it on-media too.
2920 */
2921void
2922hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2923{
2924 struct vnode *vp;
2925
2926 /*
2927 * Set the DELETING flag when the link count drops to 0 and the
2928 * OS no longer has any opens on the inode.
2929 *
2930 * The backend will clear DELETING (a mod flag) and set DELETED
2931 * (a state flag) when it is actually able to perform the
2932 * operation.
2933 *
2934 * Don't reflag the deletion if the flusher is currently syncing
2935 * one that was already flagged. A previously set DELETING flag
2936 * may bounce around flags and sync_flags until the operation is
2937 * completely done.
2938 */
2939 if (ip->ino_data.nlinks == 0 &&
2940 ((ip->flags | ip->sync_flags) & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2941 ip->flags |= HAMMER_INODE_DELETING;
2942 ip->flags |= HAMMER_INODE_TRUNCATED;
2943 ip->trunc_off = 0;
2944 vp = NULL;
2945 if (getvp) {
2946 if (hammer_get_vnode(ip, &vp) != 0)
2947 return;
2948 }
2949
2950 /*
2951 * Final cleanup
2952 */
2953 if (ip->vp) {
2954 vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2955 vnode_pager_setsize(ip->vp, 0);
2956 }
2957 if (getvp) {
2958 vput(vp);
2959 }
2960 }
2961}
2962
2963/*
2964 * After potentially resolving a dependency the inode is tested
2965 * to determine whether it needs to be reflushed.
2966 */
2967void
2968hammer_test_inode(hammer_inode_t ip)
2969{
2970 if (ip->flags & HAMMER_INODE_REFLUSH) {
2971 ip->flags &= ~HAMMER_INODE_REFLUSH;
2972 hammer_ref(&ip->lock);
2973 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2974 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2975 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2976 } else {
2977 hammer_flush_inode(ip, 0);
2978 }
2979 hammer_rel_inode(ip, 0);
2980 }
2981}
2982
2983/*
2984 * Clear the RECLAIM flag on an inode. This occurs when the inode is
2985 * reassociated with a vp or just before it gets freed.
2986 *
2987 * Pipeline wakeups to threads blocked due to an excessive number of
2988 * detached inodes. The reclaim count generates a bit of negative
2989 * feedback.
2990 */
2991static void
2992hammer_inode_wakereclaims(hammer_inode_t ip, int dowake)
2993{
2994 struct hammer_reclaim *reclaim;
2995 hammer_mount_t hmp = ip->hmp;
2996
2997 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
2998 return;
2999
3000 --hammer_count_reclaiming;
3001 --hmp->inode_reclaims;
3002 ip->flags &= ~HAMMER_INODE_RECLAIM;
3003
3004 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || dowake) {
3005 reclaim = TAILQ_FIRST(&hmp->reclaim_list);
3006 if (reclaim && reclaim->count > 0 && --reclaim->count == 0) {
3007 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3008 wakeup(reclaim);
3009 }
3010 }
3011}
3012
3013/*
3014 * Setup our reclaim pipeline. We only let so many detached (and dirty)
3015 * inodes build up before we start blocking.
3016 *
3017 * When we block we don't care *which* inode has finished reclaiming,
3018 * as long as one does. This is somewhat heuristic... we also put a
3019 * cap on how long we are willing to wait.
3020 */
3021void
3022hammer_inode_waitreclaims(hammer_mount_t hmp)
3023{
3024 struct hammer_reclaim reclaim;
3025 int delay;
3026
3027 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
3028 return;
3029 delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
3030 (HAMMER_RECLAIM_WAIT * 3) + 1;
3031 if (delay > 0) {
3032 reclaim.count = 2;
3033 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3034 tsleep(&reclaim, 0, "hmrrcm", delay);
3035 if (reclaim.count > 0)
3036 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3037 }
3038}
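/*
 * Illustrative sketch, not part of the build: the backoff above with
 * concrete numbers, assuming (hypothetically) that
 * HAMMER_RECLAIM_WAIT is 4000 and hz is 100.  The delay grows
 * linearly with the backlog beyond the watermark.
 */
#if 0
int backlog = 10000;
int delay = (backlog - 4000) * 100 / (4000 * 3) + 1;	/* 51 ticks */
#endif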
3039
3040/*
3041 * A larger than normal backlog of inodes is sitting in the flusher,
3042 * enforce a general slowdown to let it catch up. This routine is only
3043 * called on completion of a non-flusher-related transaction which
3044 * performed B-Tree node I/O.
3045 *
3046 * It is possible for the flusher to stall in a continuous load.
3047 * blogbench -i1000 -o seems to do a good job generating this sort of load.
3048 * If the flusher is unable to catch up the inode count can bloat until
3049 * we run out of kvm.
3050 *
3051 * This is a bit of a hack.
3052 */
3053void
3054hammer_inode_waithard(hammer_mount_t hmp)
3055{
3056 /*
3057 * Hysteresis.
3058 */
3059 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3060 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
3061 hmp->count_iqueued < hmp->count_inodes / 20) {
3062 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3063 return;
3064 }
3065 } else {
3066 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
3067 hmp->count_iqueued < hmp->count_inodes / 10) {
3068 return;
3069 }
3070 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3071 }
3072
3073 /*
3074 * Block for one flush cycle.
3075 */
3076 hammer_flusher_wait_next(hmp);
3077}
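/*
 * Illustrative sketch, not part of the build: the hysteresis gate
 * above, reduced to its skeleton.  The slowdown engages at the high
 * watermark and does not release until the load falls well below it,
 * which prevents flapping around a single threshold.  All names are
 * hypothetical.
 */
#if 0
typedef struct gate {
	int engaged;
} gate_t;

static int
should_throttle(gate_t *g, int load, int hi_wm)
{
	if (g->engaged) {
		if (load < hi_wm / 2)
			g->engaged = 0;		/* low watermark */
	} else {
		if (load >= hi_wm)
			g->engaged = 1;		/* high watermark */
	}
	return(g->engaged);
}
#endif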
3078