HAMMER VFS - Limit recursion for long directory chains, update mtime/ctime
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <vm/vm_extern.h>
39#include <sys/buf.h>
40#include <sys/buf2.h>
41
42static int hammer_unload_inode(struct hammer_inode *ip);
43static void hammer_free_inode(hammer_inode_t ip);
44static void hammer_flush_inode_core(hammer_inode_t ip,
45 hammer_flush_group_t flg, int flags);
46static int hammer_setup_child_callback(hammer_record_t rec, void *data);
47#if 0
48static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
49#endif
50static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
51 hammer_flush_group_t flg);
52static int hammer_setup_parent_inodes_helper(hammer_record_t record,
53 int depth, hammer_flush_group_t flg);
54static void hammer_inode_wakereclaims(hammer_inode_t ip, int dowake);
55
56#ifdef DEBUG_TRUNCATE
57extern struct hammer_inode *HammerTruncIp;
58#endif
59
60/*
61 * RB-Tree support for inode structures
62 */
63int
64hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65{
66 if (ip1->obj_localization < ip2->obj_localization)
67 return(-1);
68 if (ip1->obj_localization > ip2->obj_localization)
69 return(1);
70 if (ip1->obj_id < ip2->obj_id)
71 return(-1);
72 if (ip1->obj_id > ip2->obj_id)
73 return(1);
74 if (ip1->obj_asof < ip2->obj_asof)
75 return(-1);
76 if (ip1->obj_asof > ip2->obj_asof)
77 return(1);
78 return(0);
79}
80
81/*
82 * RB-Tree support for inode structures / special LOOKUP_INFO
83 */
84static int
85hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
86{
87 if (info->obj_localization < ip->obj_localization)
88 return(-1);
89 if (info->obj_localization > ip->obj_localization)
90 return(1);
91 if (info->obj_id < ip->obj_id)
92 return(-1);
93 if (info->obj_id > ip->obj_id)
94 return(1);
95 if (info->obj_asof < ip->obj_asof)
96 return(-1);
97 if (info->obj_asof > ip->obj_asof)
98 return(1);
99 return(0);
100}
101
102/*
103 * Used by hammer_scan_inode_snapshots() to locate all of an object's
104 * snapshots. Note that the asof field is not tested, which we can get
105 * away with because it is the lowest-priority field.
106 */
107static int
108hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
109{
110 hammer_inode_info_t info = data;
111
112 if (ip->obj_localization > info->obj_localization)
113 return(1);
114 if (ip->obj_localization < info->obj_localization)
115 return(-1);
116 if (ip->obj_id > info->obj_id)
117 return(1);
118 if (ip->obj_id < info->obj_id)
119 return(-1);
120 return(0);
121}
122
123/*
124 * Used by hammer_unload_pseudofs() to locate all inodes associated with
125 * a particular PFS.
126 */
127static int
128hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
129{
130 u_int32_t localization = *(u_int32_t *)data;
131 if (ip->obj_localization > localization)
132 return(1);
133 if (ip->obj_localization < localization)
134 return(-1);
135 return(0);
136}
137
138/*
139 * RB-Tree support for pseudofs structures
140 */
141static int
142hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
143{
144 if (p1->localization < p2->localization)
145 return(-1);
146 if (p1->localization > p2->localization)
147 return(1);
148 return(0);
149}
150
151
152RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
153RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
154 hammer_inode_info_cmp, hammer_inode_info_t);
155RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
156 hammer_pfs_rb_compare, u_int32_t, localization);
157
158/*
159 * The kernel is not actively referencing this vnode but is still holding
160 * it cached.
161 *
162 * This is called from the frontend.
163 */
164int
165hammer_vop_inactive(struct vop_inactive_args *ap)
166{
167 struct hammer_inode *ip = VTOI(ap->a_vp);
168
169 /*
170 * Degenerate case
171 */
172 if (ip == NULL) {
173 vrecycle(ap->a_vp);
174 return(0);
175 }
176
177 /*
178 * If the inode no longer has visibility in the filesystem try to
179 * recycle it immediately, even if the inode is dirty. Recycling
180 * it quickly allows the system to reclaim buffer cache and VM
181 * resources which can matter a lot in a heavily loaded system.
182 *
183 * This can deadlock in vfsync() if we aren't careful.
184 *
185 * Do not queue the inode to the flusher if we still have visibility,
186 * otherwise namespace calls such as chmod will unnecessarily generate
187 * multiple inode updates.
188 */
189 hammer_inode_unloadable_check(ip, 0);
190 if (ip->ino_data.nlinks == 0) {
191 if (ip->flags & HAMMER_INODE_MODMASK)
192 hammer_flush_inode(ip, 0);
193 vrecycle(ap->a_vp);
194 }
195 return(0);
196}
197
198/*
199 * Release the vnode association. This is typically (but not always)
200 * the last reference on the inode.
201 *
202 * Once the association is lost we are on our own with regards to
203 * flushing the inode.
204 */
205int
206hammer_vop_reclaim(struct vop_reclaim_args *ap)
207{
208 struct hammer_inode *ip;
209 hammer_mount_t hmp;
210 struct vnode *vp;
211
212 vp = ap->a_vp;
213
214 if ((ip = vp->v_data) != NULL) {
215 hmp = ip->hmp;
216 vp->v_data = NULL;
217 ip->vp = NULL;
218
219 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
220 ++hammer_count_reclaiming;
221 ++hmp->inode_reclaims;
222 ip->flags |= HAMMER_INODE_RECLAIM;
223 }
224 hammer_rel_inode(ip, 1);
225 }
226 return(0);
227}
228
229/*
230 * Return a locked vnode for the specified inode. The inode must be
231 * referenced but NOT LOCKED on entry and will remain referenced on
232 * return.
233 *
234 * Called from the frontend.
235 */
236int
237hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
238{
239 hammer_mount_t hmp;
240 struct vnode *vp;
241 int error = 0;
242 u_int8_t obj_type;
243
244 hmp = ip->hmp;
245
246 for (;;) {
247 if ((vp = ip->vp) == NULL) {
248 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
249 if (error)
250 break;
251 hammer_lock_ex(&ip->lock);
252 if (ip->vp != NULL) {
253 hammer_unlock(&ip->lock);
254 vp->v_type = VBAD;
255 vx_put(vp);
256 continue;
257 }
258 hammer_ref(&ip->lock);
259 vp = *vpp;
260 ip->vp = vp;
261
262 obj_type = ip->ino_data.obj_type;
263 vp->v_type = hammer_get_vnode_type(obj_type);
264
265 hammer_inode_wakereclaims(ip, 0);
266
267 switch(ip->ino_data.obj_type) {
268 case HAMMER_OBJTYPE_CDEV:
269 case HAMMER_OBJTYPE_BDEV:
270 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
271 addaliasu(vp, ip->ino_data.rmajor,
272 ip->ino_data.rminor);
273 break;
274 case HAMMER_OBJTYPE_FIFO:
275 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
276 break;
277 default:
278 break;
279 }
280
281 /*
282 * Only mark as the root vnode if the ip is not
283 * historical, otherwise the VFS cache will get
284 * confused. The other half of the special handling
285 * is in hammer_vop_nlookupdotdot().
286 *
287 * Pseudo-filesystem roots can be accessed via
288 * non-root filesystem paths and setting VROOT may
289 * confuse the namecache. Set VPFSROOT instead.
290 */
291 if (ip->obj_id == HAMMER_OBJID_ROOT &&
292 ip->obj_asof == hmp->asof) {
293 if (ip->obj_localization == 0)
294 vp->v_flag |= VROOT;
295 else
296 vp->v_flag |= VPFSROOT;
297 }
298
299 vp->v_data = (void *)ip;
300 /* vnode locked by getnewvnode() */
301 /* make related vnode dirty if inode dirty? */
302 hammer_unlock(&ip->lock);
303 if (vp->v_type == VREG)
304 vinitvmio(vp, ip->ino_data.size);
305 break;
306 }
307
308 /*
309 * loop if the vget fails (aka races), or if the vp
310 * no longer matches ip->vp.
311 */
312 if (vget(vp, LK_EXCLUSIVE) == 0) {
313 if (vp == ip->vp)
314 break;
315 vput(vp);
316 }
317 }
318 *vpp = vp;
319 return(error);
320}
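/*
 * Editor's sketch (not part of the original file): a minimal frontend
 * sequence under the contracts documented above -- obtain a referenced
 * inode, attach a locked vnode, then drop our inode ref. The function
 * name example_lookup_vnode and its error handling are assumptions.
 */
#if 0
static int
example_lookup_vnode(hammer_transaction_t trans, int64_t obj_id,
		     struct vnode **vpp)
{
	struct hammer_inode *ip;
	int error;

	ip = hammer_get_inode(trans, NULL, obj_id, trans->hmp->asof,
			      HAMMER_DEF_LOCALIZATION, 0, &error);
	if (ip == NULL)
		return(error);
	/* returns a locked vnode in *vpp; the inode ref is retained */
	error = hammer_get_vnode(ip, vpp);
	hammer_rel_inode(ip, 0);
	return(error);
}
#endif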
321
322/*
323 * Locate all copies of the inode for obj_id compatible with the specified
324 * asof, reference, and issue the related call-back. This routine is used
325 * for direct-io invalidation and does not create any new inodes.
326 */
327void
328hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
329 int (*callback)(hammer_inode_t ip, void *data),
330 void *data)
331{
332 hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
333 hammer_inode_info_cmp_all_history,
334 callback, iinfo);
335}
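/*
 * Editor's sketch (not part of the original file): how a caller might
 * drive the scan above. Because hammer_inode_info_cmp_all_history()
 * ignores asof, the callback fires once per cached snapshot of the
 * object. Note that as written the RB_SCAN hands iinfo, not the data
 * argument, to the callback. Names below are assumptions.
 */
#if 0
static int
example_per_snapshot(hammer_inode_t ip, void *data)
{
	/* inspect or invalidate this cached snapshot here */
	return(0);	/* returning 0 continues the scan */
}

static void
example_scan(hammer_mount_t hmp, int64_t obj_id, u_int32_t localization)
{
	struct hammer_inode_info iinfo;

	iinfo.obj_id = obj_id;
	iinfo.obj_localization = localization;
	/* iinfo.obj_asof is ignored by the all-history compare */
	hammer_scan_inode_snapshots(hmp, &iinfo, example_per_snapshot, NULL);
}
#endif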
336
337/*
338 * Acquire a HAMMER inode. The returned inode is not locked. These functions
339 * do not attach or detach the related vnode (use hammer_get_vnode() for
340 * that).
341 *
342 * The flags argument is only applied for newly created inodes, and only
343 * certain flags are inherited.
344 *
345 * Called from the frontend.
346 */
347struct hammer_inode *
348hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
349 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
350 int flags, int *errorp)
351{
352 hammer_mount_t hmp = trans->hmp;
353 struct hammer_inode_info iinfo;
354 struct hammer_cursor cursor;
355 struct hammer_inode *ip;
356
357
358 /*
359 * Determine if we already have an inode cached. If we do then
360 * we are golden.
361 *
362 * If we find an inode with no vnode we have to mark the
363 * transaction such that hammer_inode_waitreclaims() is
364 * called later on to avoid building up an infinite number
 365 * of inodes. Otherwise we can continue to add new inodes
 366 * faster than they can be disposed of, even with the tsleep
367 * delay.
368 *
369 * If we find a dummy inode we return a failure so dounlink
370 * (which does another lookup) doesn't try to mess with the
371 * link count. hammer_vop_nresolve() uses hammer_get_dummy_inode()
372 * to ref dummy inodes.
373 */
374 iinfo.obj_id = obj_id;
375 iinfo.obj_asof = asof;
376 iinfo.obj_localization = localization;
377loop:
378 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
379 if (ip) {
380 if (ip->flags & HAMMER_INODE_DUMMY) {
381 *errorp = ENOENT;
382 return(NULL);
383 }
384 hammer_ref(&ip->lock);
385 *errorp = 0;
386 return(ip);
387 }
388
389 /*
390 * Allocate a new inode structure and deal with races later.
391 */
392 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
393 ++hammer_count_inodes;
394 ++hmp->count_inodes;
395 ip->obj_id = obj_id;
396 ip->obj_asof = iinfo.obj_asof;
397 ip->obj_localization = localization;
398 ip->hmp = hmp;
399 ip->flags = flags & HAMMER_INODE_RO;
400 ip->cache[0].ip = ip;
401 ip->cache[1].ip = ip;
402 if (hmp->ronly)
403 ip->flags |= HAMMER_INODE_RO;
404 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
405 0x7FFFFFFFFFFFFFFFLL;
406 RB_INIT(&ip->rec_tree);
407 TAILQ_INIT(&ip->target_list);
408 hammer_ref(&ip->lock);
409
410 /*
411 * Locate the on-disk inode. If this is a PFS root we always
412 * access the current version of the root inode and (if it is not
413 * a master) always access information under it with a snapshot
414 * TID.
415 */
416retry:
417 hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
418 cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
419 cursor.key_beg.obj_id = ip->obj_id;
420 cursor.key_beg.key = 0;
421 cursor.key_beg.create_tid = 0;
422 cursor.key_beg.delete_tid = 0;
423 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
424 cursor.key_beg.obj_type = 0;
425
426 cursor.asof = iinfo.obj_asof;
427 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
428 HAMMER_CURSOR_ASOF;
429
430 *errorp = hammer_btree_lookup(&cursor);
431 if (*errorp == EDEADLK) {
432 hammer_done_cursor(&cursor);
433 goto retry;
434 }
435
436 /*
437 * On success the B-Tree lookup will hold the appropriate
438 * buffer cache buffers and provide a pointer to the requested
439 * information. Copy the information to the in-memory inode
440 * and cache the B-Tree node to improve future operations.
441 */
442 if (*errorp == 0) {
443 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
444 ip->ino_data = cursor.data->inode;
445
446 /*
447 * cache[0] tries to cache the location of the object inode.
448 * The assumption is that it is near the directory inode.
449 *
450 * cache[1] tries to cache the location of the object data.
451 * The assumption is that it is near the directory data.
452 */
453 hammer_cache_node(&ip->cache[0], cursor.node);
454 if (dip && dip->cache[1].node)
455 hammer_cache_node(&ip->cache[1], dip->cache[1].node);
456
457 /*
458 * The file should not contain any data past the file size
459 * stored in the inode. Setting save_trunc_off to the
460 * file size instead of max reduces B-Tree lookup overheads
461 * on append by allowing the flusher to avoid checking for
462 * record overwrites.
463 */
464 ip->save_trunc_off = ip->ino_data.size;
465
466 /*
467 * Locate and assign the pseudofs management structure to
468 * the inode.
469 */
470 if (dip && dip->obj_localization == ip->obj_localization) {
471 ip->pfsm = dip->pfsm;
472 hammer_ref(&ip->pfsm->lock);
473 } else {
474 ip->pfsm = hammer_load_pseudofs(trans,
475 ip->obj_localization,
476 errorp);
477 *errorp = 0; /* ignore ENOENT */
478 }
479 }
480
481 /*
482 * The inode is placed on the red-black tree and will be synced to
483 * the media when flushed or by the filesystem sync. If this races
484 * another instantiation/lookup the insertion will fail.
485 */
486 if (*errorp == 0) {
487 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
488 hammer_free_inode(ip);
489 hammer_done_cursor(&cursor);
490 goto loop;
491 }
492 ip->flags |= HAMMER_INODE_ONDISK;
493 } else {
494 if (ip->flags & HAMMER_INODE_RSV_INODES) {
495 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
496 --hmp->rsv_inodes;
497 }
498
499 hammer_free_inode(ip);
500 ip = NULL;
501 }
502 hammer_done_cursor(&cursor);
503 trans->flags |= HAMMER_TRANSF_NEWINODE;
504 return (ip);
505}
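/*
 * Editor's sketch (not part of the original file): the EDEADLK retry
 * idiom used above recurs throughout this file, distilled here with
 * placeholder comments:
 */
#if 0
retry:
	hammer_init_cursor(trans, &cursor, cache, ip);
	/* fill in cursor.key_beg, cursor.asof and cursor.flags here */
	error = hammer_btree_lookup(&cursor);
	if (error == EDEADLK) {
		/* releases all locks held by the cursor */
		hammer_done_cursor(&cursor);
		goto retry;
	}
	/* on success cursor.node/cursor.data reference held buffers */
	hammer_done_cursor(&cursor);
#endif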
506
507/*
508 * Get a dummy inode to placemark a broken directory entry.
509 */
510struct hammer_inode *
511hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
512 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
513 int flags, int *errorp)
514{
515 hammer_mount_t hmp = trans->hmp;
516 struct hammer_inode_info iinfo;
517 struct hammer_inode *ip;
518
519 /*
520 * Determine if we already have an inode cached. If we do then
521 * we are golden.
522 *
523 * If we find an inode with no vnode we have to mark the
524 * transaction such that hammer_inode_waitreclaims() is
525 * called later on to avoid building up an infinite number
 526 * of inodes. Otherwise we can continue to add new inodes
 527 * faster than they can be disposed of, even with the tsleep
528 * delay.
529 *
530 * If we find a non-fake inode we return an error. Only fake
531 * inodes can be returned by this routine.
532 */
533 iinfo.obj_id = obj_id;
534 iinfo.obj_asof = asof;
535 iinfo.obj_localization = localization;
536loop:
537 *errorp = 0;
538 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
539 if (ip) {
540 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
541 *errorp = ENOENT;
542 return(NULL);
543 }
544 hammer_ref(&ip->lock);
545 return(ip);
546 }
547
548 /*
549 * Allocate a new inode structure and deal with races later.
550 */
551 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
552 ++hammer_count_inodes;
553 ++hmp->count_inodes;
554 ip->obj_id = obj_id;
555 ip->obj_asof = iinfo.obj_asof;
556 ip->obj_localization = localization;
557 ip->hmp = hmp;
558 ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
559 ip->cache[0].ip = ip;
560 ip->cache[1].ip = ip;
561 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
562 0x7FFFFFFFFFFFFFFFLL;
563 RB_INIT(&ip->rec_tree);
564 TAILQ_INIT(&ip->target_list);
565 hammer_ref(&ip->lock);
566
567 /*
568 * Populate the dummy inode. Leave everything zero'd out.
569 *
570 * (ip->ino_leaf and ip->ino_data)
571 *
572 * Make the dummy inode a FIFO object which most copy programs
573 * will properly ignore.
574 */
575 ip->save_trunc_off = ip->ino_data.size;
576 ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
577
578 /*
579 * Locate and assign the pseudofs management structure to
580 * the inode.
581 */
582 if (dip && dip->obj_localization == ip->obj_localization) {
583 ip->pfsm = dip->pfsm;
584 hammer_ref(&ip->pfsm->lock);
585 } else {
586 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
587 errorp);
588 *errorp = 0; /* ignore ENOENT */
589 }
590
591 /*
592 * The inode is placed on the red-black tree and will be synced to
593 * the media when flushed or by the filesystem sync. If this races
594 * another instantiation/lookup the insertion will fail.
595 *
596 * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake.
597 */
598 if (*errorp == 0) {
599 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
600 hammer_free_inode(ip);
601 goto loop;
602 }
603 } else {
604 if (ip->flags & HAMMER_INODE_RSV_INODES) {
605 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
606 --hmp->rsv_inodes;
607 }
608 hammer_free_inode(ip);
609 ip = NULL;
610 }
611 trans->flags |= HAMMER_TRANSF_NEWINODE;
612 return (ip);
613}
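/*
 * Editor's sketch (not part of the original file): the two lookup
 * routines above have mirrored ENOENT semantics -- hammer_get_inode()
 * fails when it finds a dummy, hammer_get_dummy_inode() fails when it
 * finds a real inode. A hedged outline of a resolve path using both
 * (the surrounding control flow is an assumption):
 */
#if 0
	ip = hammer_get_inode(trans, dip, obj_id, asof, localization,
			      flags, &error);
	if (ip == NULL && error == ENOENT) {
		/* placemark the broken directory entry with a dummy */
		ip = hammer_get_dummy_inode(trans, dip, obj_id, asof,
					    localization, flags, &error);
	}
#endif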
614
615/*
616 * Create a new filesystem object, returning the inode in *ipp. The
617 * returned inode will be referenced. The inode is created in-memory.
618 *
619 * If pfsm is non-NULL the caller wishes to create the root inode for
620 * a master PFS.
621 */
622int
623hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
624 struct ucred *cred, hammer_inode_t dip,
625 hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
626{
627 hammer_mount_t hmp;
628 hammer_inode_t ip;
629 uid_t xuid;
630 int error;
631
632 hmp = trans->hmp;
633
634 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
635 ++hammer_count_inodes;
636 ++hmp->count_inodes;
637 trans->flags |= HAMMER_TRANSF_NEWINODE;
638
639 if (pfsm) {
640 KKASSERT(pfsm->localization != 0);
641 ip->obj_id = HAMMER_OBJID_ROOT;
642 ip->obj_localization = pfsm->localization;
643 } else {
644 KKASSERT(dip != NULL);
645 ip->obj_id = hammer_alloc_objid(hmp, dip);
646 ip->obj_localization = dip->obj_localization;
647 }
648
649 KKASSERT(ip->obj_id != 0);
650 ip->obj_asof = hmp->asof;
651 ip->hmp = hmp;
652 ip->flush_state = HAMMER_FST_IDLE;
653 ip->flags = HAMMER_INODE_DDIRTY |
654 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
655 ip->cache[0].ip = ip;
656 ip->cache[1].ip = ip;
657
658 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
659 /* ip->save_trunc_off = 0; (already zero) */
660 RB_INIT(&ip->rec_tree);
661 TAILQ_INIT(&ip->target_list);
662
663 ip->ino_data.atime = trans->time;
664 ip->ino_data.mtime = trans->time;
665 ip->ino_data.size = 0;
666 ip->ino_data.nlinks = 0;
667
668 /*
669 * A nohistory designator on the parent directory is inherited by
670 * the child. We will do this even for pseudo-fs creation... the
671 * sysad can turn it off.
672 */
673 if (dip) {
674 ip->ino_data.uflags = dip->ino_data.uflags &
675 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
676 }
677
678 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
679 ip->ino_leaf.base.localization = ip->obj_localization +
680 HAMMER_LOCALIZE_INODE;
681 ip->ino_leaf.base.obj_id = ip->obj_id;
682 ip->ino_leaf.base.key = 0;
683 ip->ino_leaf.base.create_tid = 0;
684 ip->ino_leaf.base.delete_tid = 0;
685 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
686 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
687
688 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
689 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
690 ip->ino_data.mode = vap->va_mode;
691 ip->ino_data.ctime = trans->time;
692
693 /*
694 * If we are running version 2 or greater we use dirhash algorithm #1
695 * which is semi-sorted. Algorithm #0 was just a pure crc.
696 */
697 if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
698 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
699 ip->ino_data.cap_flags |= HAMMER_INODE_CAP_DIRHASH_ALG1;
700 }
701 }
702
703 /*
704 * Setup the ".." pointer. This only needs to be done for directories
705 * but we do it for all objects as a recovery aid.
706 */
707 if (dip)
708 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
709#if 0
710 /*
711 * The parent_obj_localization field only applies to pseudo-fs roots.
712 * XXX this is no longer applicable, PFSs are no longer directly
713 * tied into the parent's directory structure.
714 */
715 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
716 ip->obj_id == HAMMER_OBJID_ROOT) {
717 ip->ino_data.ext.obj.parent_obj_localization =
718 dip->obj_localization;
719 }
720#endif
721
722 switch(ip->ino_leaf.base.obj_type) {
723 case HAMMER_OBJTYPE_CDEV:
724 case HAMMER_OBJTYPE_BDEV:
725 ip->ino_data.rmajor = vap->va_rmajor;
726 ip->ino_data.rminor = vap->va_rminor;
727 break;
728 default:
729 break;
730 }
731
732 /*
733 * Calculate default uid/gid and overwrite with information from
734 * the vap.
735 */
736 if (dip) {
737 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
738 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
739 xuid, cred, &vap->va_mode);
740 } else {
741 xuid = 0;
742 }
743 ip->ino_data.mode = vap->va_mode;
744
745 if (vap->va_vaflags & VA_UID_UUID_VALID)
746 ip->ino_data.uid = vap->va_uid_uuid;
747 else if (vap->va_uid != (uid_t)VNOVAL)
748 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
749 else
750 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
751
752 if (vap->va_vaflags & VA_GID_UUID_VALID)
753 ip->ino_data.gid = vap->va_gid_uuid;
754 else if (vap->va_gid != (gid_t)VNOVAL)
755 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
756 else if (dip)
757 ip->ino_data.gid = dip->ino_data.gid;
758
759 hammer_ref(&ip->lock);
760
761 if (pfsm) {
762 ip->pfsm = pfsm;
763 hammer_ref(&pfsm->lock);
764 error = 0;
765 } else if (dip->obj_localization == ip->obj_localization) {
766 ip->pfsm = dip->pfsm;
767 hammer_ref(&ip->pfsm->lock);
768 error = 0;
769 } else {
770 ip->pfsm = hammer_load_pseudofs(trans,
771 ip->obj_localization,
772 &error);
773 error = 0; /* ignore ENOENT */
774 }
775
776 if (error) {
777 hammer_free_inode(ip);
778 ip = NULL;
779 } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
780 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
781 /* not reached */
782 hammer_free_inode(ip);
783 }
784 *ipp = ip;
785 return(error);
786}
787
788/*
789 * Final cleanup / freeing of an inode structure
790 */
791static void
792hammer_free_inode(hammer_inode_t ip)
793{
794 struct hammer_mount *hmp;
795
796 hmp = ip->hmp;
797 KKASSERT(ip->lock.refs == 1);
798 hammer_uncache_node(&ip->cache[0]);
799 hammer_uncache_node(&ip->cache[1]);
800 hammer_inode_wakereclaims(ip, 1);
801 if (ip->objid_cache)
802 hammer_clear_objid(ip);
803 --hammer_count_inodes;
804 --hmp->count_inodes;
805 if (ip->pfsm) {
806 hammer_rel_pseudofs(hmp, ip->pfsm);
807 ip->pfsm = NULL;
808 }
809 kfree(ip, hmp->m_inodes);
810 ip = NULL;
811}
812
813/*
814 * Retrieve pseudo-fs data. NULL will never be returned.
815 *
816 * If an error occurs *errorp will be set and a default template is returned,
817 * otherwise *errorp is set to 0. Typically when an error occurs it will
818 * be ENOENT.
819 */
820hammer_pseudofs_inmem_t
821hammer_load_pseudofs(hammer_transaction_t trans,
822 u_int32_t localization, int *errorp)
823{
824 hammer_mount_t hmp = trans->hmp;
825 hammer_inode_t ip;
826 hammer_pseudofs_inmem_t pfsm;
827 struct hammer_cursor cursor;
828 int bytes;
829
830retry:
831 pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
832 if (pfsm) {
833 hammer_ref(&pfsm->lock);
834 *errorp = 0;
835 return(pfsm);
836 }
837
838 /*
839 * PFS records are stored in the root inode (not the PFS root inode,
840 * but the real root). Avoid an infinite recursion if loading
841 * the PFS for the real root.
842 */
843 if (localization) {
844 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
845 HAMMER_MAX_TID,
846 HAMMER_DEF_LOCALIZATION, 0, errorp);
847 } else {
848 ip = NULL;
849 }
850
851 pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
852 pfsm->localization = localization;
853 pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
854 pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
855
856 hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
857 cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
858 HAMMER_LOCALIZE_MISC;
859 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
860 cursor.key_beg.create_tid = 0;
861 cursor.key_beg.delete_tid = 0;
862 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
863 cursor.key_beg.obj_type = 0;
864 cursor.key_beg.key = localization;
865 cursor.asof = HAMMER_MAX_TID;
866 cursor.flags |= HAMMER_CURSOR_ASOF;
867
868 if (ip)
869 *errorp = hammer_ip_lookup(&cursor);
870 else
871 *errorp = hammer_btree_lookup(&cursor);
872 if (*errorp == 0) {
873 *errorp = hammer_ip_resolve_data(&cursor);
874 if (*errorp == 0) {
875 if (cursor.data->pfsd.mirror_flags &
876 HAMMER_PFSD_DELETED) {
877 *errorp = ENOENT;
878 } else {
879 bytes = cursor.leaf->data_len;
880 if (bytes > sizeof(pfsm->pfsd))
881 bytes = sizeof(pfsm->pfsd);
882 bcopy(cursor.data, &pfsm->pfsd, bytes);
883 }
884 }
885 }
886 hammer_done_cursor(&cursor);
887
888 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
889 hammer_ref(&pfsm->lock);
890 if (ip)
891 hammer_rel_inode(ip, 0);
892 if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
893 kfree(pfsm, hmp->m_misc);
894 goto retry;
895 }
896 return(pfsm);
897}
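/*
 * Editor's sketch (not part of the original file): the routine above is
 * an instance of the optimistic lookup-or-create pattern used for both
 * PFS structures and inodes. All names below are placeholders:
 */
#if 0
retry:
	obj = RB_LOOKUP(example_tree, root, key);
	if (obj) {
		hammer_ref(&obj->lock);
		return(obj);
	}
	obj = kmalloc(sizeof(*obj), mtype, M_WAITOK | M_ZERO);
	/* initialization, which may block, goes here */
	hammer_ref(&obj->lock);
	if (RB_INSERT(example_tree, root, obj)) {
		kfree(obj, mtype);	/* lost the race, retry lookup */
		goto retry;
	}
	return(obj);
#endif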
898
899/*
900 * Store pseudo-fs data. The backend will automatically delete any prior
901 * on-disk pseudo-fs data but we have to delete in-memory versions.
902 */
903int
904hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
905{
906 struct hammer_cursor cursor;
907 hammer_record_t record;
908 hammer_inode_t ip;
909 int error;
910
911 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
912 HAMMER_DEF_LOCALIZATION, 0, &error);
913retry:
914 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
915 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
916 cursor.key_beg.localization = ip->obj_localization +
917 HAMMER_LOCALIZE_MISC;
918 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
919 cursor.key_beg.create_tid = 0;
920 cursor.key_beg.delete_tid = 0;
921 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
922 cursor.key_beg.obj_type = 0;
923 cursor.key_beg.key = pfsm->localization;
924 cursor.asof = HAMMER_MAX_TID;
925 cursor.flags |= HAMMER_CURSOR_ASOF;
926
927 /*
928 * Replace any in-memory version of the record.
929 */
930 error = hammer_ip_lookup(&cursor);
931 if (error == 0 && hammer_cursor_inmem(&cursor)) {
932 record = cursor.iprec;
933 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
934 KKASSERT(cursor.deadlk_rec == NULL);
935 hammer_ref(&record->lock);
936 cursor.deadlk_rec = record;
937 error = EDEADLK;
938 } else {
939 record->flags |= HAMMER_RECF_DELETED_FE;
940 error = 0;
941 }
942 }
943
944 /*
945 * Allocate replacement general record. The backend flush will
946 * delete any on-disk version of the record.
947 */
948 if (error == 0 || error == ENOENT) {
949 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
950 record->type = HAMMER_MEM_RECORD_GENERAL;
951
952 record->leaf.base.localization = ip->obj_localization +
953 HAMMER_LOCALIZE_MISC;
954 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
955 record->leaf.base.key = pfsm->localization;
956 record->leaf.data_len = sizeof(pfsm->pfsd);
957 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
958 error = hammer_ip_add_record(trans, record);
959 }
960 hammer_done_cursor(&cursor);
961 if (error == EDEADLK)
962 goto retry;
963 hammer_rel_inode(ip, 0);
964 return(error);
965}
966
967/*
 968 * Create a root directory for a PFS if one does not already exist.
969 *
970 * The PFS root stands alone so we must also bump the nlinks count
971 * to prevent it from being destroyed on release.
972 */
973int
974hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
975 hammer_pseudofs_inmem_t pfsm)
976{
977 hammer_inode_t ip;
978 struct vattr vap;
979 int error;
980
981 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
982 pfsm->localization, 0, &error);
983 if (ip == NULL) {
984 vattr_null(&vap);
985 vap.va_mode = 0755;
986 vap.va_type = VDIR;
987 error = hammer_create_inode(trans, &vap, cred, NULL, pfsm, &ip);
988 if (error == 0) {
989 ++ip->ino_data.nlinks;
990 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
991 }
992 }
993 if (ip)
994 hammer_rel_inode(ip, 0);
995 return(error);
996}
997
998/*
999 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1000 * if we are unable to disassociate all the inodes.
1001 */
1002static
1003int
1004hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1005{
1006 int res;
1007
1008 hammer_ref(&ip->lock);
1009 if (ip->lock.refs == 2 && ip->vp)
1010 vclean_unlocked(ip->vp);
1011 if (ip->lock.refs == 1 && ip->vp == NULL)
1012 res = 0;
1013 else
1014 res = -1; /* stop, someone is using the inode */
1015 hammer_rel_inode(ip, 0);
1016 return(res);
1017}
1018
1019int
1020hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1021{
1022 int res;
1023 int try;
1024
1025 for (try = res = 0; try < 4; ++try) {
1026 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1027 hammer_inode_pfs_cmp,
1028 hammer_unload_pseudofs_callback,
1029 &localization);
1030 if (res == 0 && try > 1)
1031 break;
1032 hammer_flusher_sync(trans->hmp);
1033 }
1034 if (res != 0)
1035 res = ENOTEMPTY;
1036 return(res);
1037}
1038
1039
1040/*
1041 * Release a reference on a PFS
1042 */
1043void
1044hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1045{
1046 hammer_unref(&pfsm->lock);
1047 if (pfsm->lock.refs == 0) {
1048 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1049 kfree(pfsm, hmp->m_misc);
1050 }
1051}
1052
1053/*
1054 * Called by hammer_sync_inode().
1055 */
1056static int
1057hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1058{
1059 hammer_transaction_t trans = cursor->trans;
1060 hammer_record_t record;
1061 int error;
1062 int redirty;
1063
1064retry:
1065 error = 0;
1066
1067 /*
1068 * If the inode has a presence on-disk then locate it and mark
1069 * it deleted, setting DELONDISK.
1070 *
1071 * The record may or may not be physically deleted, depending on
1072 * the retention policy.
1073 */
1074 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1075 HAMMER_INODE_ONDISK) {
1076 hammer_normalize_cursor(cursor);
1077 cursor->key_beg.localization = ip->obj_localization +
1078 HAMMER_LOCALIZE_INODE;
1079 cursor->key_beg.obj_id = ip->obj_id;
1080 cursor->key_beg.key = 0;
1081 cursor->key_beg.create_tid = 0;
1082 cursor->key_beg.delete_tid = 0;
1083 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1084 cursor->key_beg.obj_type = 0;
1085 cursor->asof = ip->obj_asof;
1086 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1087 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1088 cursor->flags |= HAMMER_CURSOR_BACKEND;
1089
1090 error = hammer_btree_lookup(cursor);
1091 if (hammer_debug_inode)
1092 kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1093
1094 if (error == 0) {
1095 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1096 if (hammer_debug_inode)
1097 kprintf(" error %d\n", error);
1098 if (error == 0) {
1099 ip->flags |= HAMMER_INODE_DELONDISK;
1100 }
1101 if (cursor->node)
1102 hammer_cache_node(&ip->cache[0], cursor->node);
1103 }
1104 if (error == EDEADLK) {
1105 hammer_done_cursor(cursor);
1106 error = hammer_init_cursor(trans, cursor,
1107 &ip->cache[0], ip);
1108 if (hammer_debug_inode)
1109 kprintf("IPDED %p %d\n", ip, error);
1110 if (error == 0)
1111 goto retry;
1112 }
1113 }
1114
1115 /*
1116 * Ok, write out the initial record or a new record (after deleting
1117 * the old one), unless the DELETED flag is set. This routine will
1118 * clear DELONDISK if it writes out a record.
1119 *
1120 * Update our inode statistics if this is the first application of
1121 * the inode on-disk.
1122 */
1123 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1124 /*
1125 * Generate a record and write it to the media. We clean-up
1126 * the state before releasing so we do not have to set-up
1127 * a flush_group.
1128 */
1129 record = hammer_alloc_mem_record(ip, 0);
1130 record->type = HAMMER_MEM_RECORD_INODE;
1131 record->flush_state = HAMMER_FST_FLUSH;
1132 record->leaf = ip->sync_ino_leaf;
1133 record->leaf.base.create_tid = trans->tid;
1134 record->leaf.data_len = sizeof(ip->sync_ino_data);
1135 record->leaf.create_ts = trans->time32;
1136 record->data = (void *)&ip->sync_ino_data;
1137 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1138
1139 /*
1140 * If this flag is set we cannot sync the new file size
1141 * because we haven't finished related truncations. The
1142 * inode will be flushed in another flush group to finish
1143 * the job.
1144 */
1145 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1146 ip->sync_ino_data.size != ip->ino_data.size) {
1147 redirty = 1;
1148 ip->sync_ino_data.size = ip->ino_data.size;
1149 } else {
1150 redirty = 0;
1151 }
1152
1153 for (;;) {
1154 error = hammer_ip_sync_record_cursor(cursor, record);
1155 if (hammer_debug_inode)
1156 kprintf("GENREC %p rec %08x %d\n",
1157 ip, record->flags, error);
1158 if (error != EDEADLK)
1159 break;
1160 hammer_done_cursor(cursor);
1161 error = hammer_init_cursor(trans, cursor,
1162 &ip->cache[0], ip);
1163 if (hammer_debug_inode)
1164 kprintf("GENREC reinit %d\n", error);
1165 if (error)
1166 break;
1167 }
1168
1169 /*
1170 * Note: The record was never on the inode's record tree
1171 * so just wave our hands importantly and destroy it.
1172 */
1173 record->flags |= HAMMER_RECF_COMMITTED;
1174 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1175 record->flush_state = HAMMER_FST_IDLE;
1176 ++ip->rec_generation;
1177 hammer_rel_mem_record(record);
1178
1179 /*
1180 * Finish up.
1181 */
1182 if (error == 0) {
1183 if (hammer_debug_inode)
1184 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1185 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1186 HAMMER_INODE_ATIME |
1187 HAMMER_INODE_MTIME);
1188 ip->flags &= ~HAMMER_INODE_DELONDISK;
1189 if (redirty)
1190 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1191
1192 /*
1193 * Root volume count of inodes
1194 */
1195 hammer_sync_lock_sh(trans);
1196 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1197 hammer_modify_volume_field(trans,
1198 trans->rootvol,
1199 vol0_stat_inodes);
1200 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1201 hammer_modify_volume_done(trans->rootvol);
1202 ip->flags |= HAMMER_INODE_ONDISK;
1203 if (hammer_debug_inode)
1204 kprintf("NOWONDISK %p\n", ip);
1205 }
1206 hammer_sync_unlock(trans);
1207 }
1208 }
1209
1210 /*
1211 * If the inode has been destroyed, clean out any left-over flags
1212 * that may have been set by the frontend.
1213 */
1214 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
1215 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1216 HAMMER_INODE_ATIME |
1217 HAMMER_INODE_MTIME);
1218 }
1219 return(error);
1220}
1221
1222/*
1223 * Update only the itimes fields.
1224 *
1225 * ATIME can be updated without generating any UNDO. MTIME is updated
1226 * with UNDO so it is guaranteed to be synchronized properly in case of
1227 * a crash.
1228 *
1229 * Neither field is included in the B-Tree leaf element's CRC, which is how
1230 * we can get away with updating ATIME the way we do.
1231 */
1232static int
1233hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1234{
1235 hammer_transaction_t trans = cursor->trans;
1236 int error;
1237
1238retry:
1239 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1240 HAMMER_INODE_ONDISK) {
1241 return(0);
1242 }
1243
1244 hammer_normalize_cursor(cursor);
1245 cursor->key_beg.localization = ip->obj_localization +
1246 HAMMER_LOCALIZE_INODE;
1247 cursor->key_beg.obj_id = ip->obj_id;
1248 cursor->key_beg.key = 0;
1249 cursor->key_beg.create_tid = 0;
1250 cursor->key_beg.delete_tid = 0;
1251 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1252 cursor->key_beg.obj_type = 0;
1253 cursor->asof = ip->obj_asof;
1254 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1255 cursor->flags |= HAMMER_CURSOR_ASOF;
1256 cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1257 cursor->flags |= HAMMER_CURSOR_GET_DATA;
1258 cursor->flags |= HAMMER_CURSOR_BACKEND;
1259
1260 error = hammer_btree_lookup(cursor);
1261 if (error == 0) {
1262 hammer_cache_node(&ip->cache[0], cursor->node);
1263 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1264 /*
1265 * Updating MTIME requires an UNDO. Just cover
1266 * both atime and mtime.
1267 */
1268 hammer_sync_lock_sh(trans);
1269 hammer_modify_buffer(trans, cursor->data_buffer,
1270 HAMMER_ITIMES_BASE(&cursor->data->inode),
1271 HAMMER_ITIMES_BYTES);
1272 cursor->data->inode.atime = ip->sync_ino_data.atime;
1273 cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1274 hammer_modify_buffer_done(cursor->data_buffer);
1275 hammer_sync_unlock(trans);
1276 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1277 /*
1278 * Updating atime only can be done in-place with
1279 * no UNDO.
1280 */
1281 hammer_sync_lock_sh(trans);
1282 hammer_modify_buffer(trans, cursor->data_buffer,
1283 NULL, 0);
1284 cursor->data->inode.atime = ip->sync_ino_data.atime;
1285 hammer_modify_buffer_done(cursor->data_buffer);
1286 hammer_sync_unlock(trans);
1287 }
1288 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1289 }
1290 if (error == EDEADLK) {
1291 hammer_done_cursor(cursor);
1292 error = hammer_init_cursor(trans, cursor,
1293 &ip->cache[0], ip);
1294 if (error == 0)
1295 goto retry;
1296 }
1297 return(error);
1298}
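/*
 * Editor's summary (not part of the original file): the UNDO convention
 * exploited above, in sketch form:
 */
#if 0
	/* generate UNDO covering [base, base+bytes) -- crash-safe mtime */
	hammer_modify_buffer(trans, buffer, base, bytes);
	/* no UNDO generated -- acceptable for atime only */
	hammer_modify_buffer(trans, buffer, NULL, 0);
#endif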
1299
1300/*
1301 * Release a reference on an inode, flush as requested.
1302 *
1303 * On the last reference we queue the inode to the flusher for its final
1304 * disposition.
1305 */
1306void
1307hammer_rel_inode(struct hammer_inode *ip, int flush)
1308{
1309 /*hammer_mount_t hmp = ip->hmp;*/
1310
1311 /*
1312 * Handle disposition when dropping the last ref.
1313 */
1314 for (;;) {
1315 if (ip->lock.refs == 1) {
1316 /*
1317 * Determine whether on-disk action is needed for
1318 * the inode's final disposition.
1319 */
1320 KKASSERT(ip->vp == NULL);
1321 hammer_inode_unloadable_check(ip, 0);
1322 if (ip->flags & HAMMER_INODE_MODMASK) {
1323 hammer_flush_inode(ip, 0);
1324 } else if (ip->lock.refs == 1) {
1325 hammer_unload_inode(ip);
1326 break;
1327 }
1328 } else {
1329 if (flush)
1330 hammer_flush_inode(ip, 0);
1331
1332 /*
1333 * The inode still has multiple refs, try to drop
1334 * one ref.
1335 */
1336 KKASSERT(ip->lock.refs >= 1);
1337 if (ip->lock.refs > 1) {
1338 hammer_unref(&ip->lock);
1339 break;
1340 }
1341 }
1342 }
1343}
1344
1345/*
1346 * Unload and destroy the specified inode. Must be called with one remaining
1347 * reference. The reference is disposed of.
1348 *
1349 * The inode must be completely clean.
1350 */
1351static int
1352hammer_unload_inode(struct hammer_inode *ip)
1353{
1354 hammer_mount_t hmp = ip->hmp;
1355
1356 KASSERT(ip->lock.refs == 1,
1357 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1358 KKASSERT(ip->vp == NULL);
1359 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1360 KKASSERT(ip->cursor_ip_refs == 0);
1361 KKASSERT(ip->lock.lockcount == 0);
1362 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1363
1364 KKASSERT(RB_EMPTY(&ip->rec_tree));
1365 KKASSERT(TAILQ_EMPTY(&ip->target_list));
1366
1367 RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1368
1369 hammer_free_inode(ip);
1370 return(0);
1371}
1372
1373/*
1374 * Called during unmounting if a critical error occurred. The in-memory
1375 * inode and all related structures are destroyed.
1376 *
1377 * If a critical error did not occur the unmount code calls the standard
1378 * release and asserts that the inode is gone.
1379 */
1380int
1381hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1382{
1383 hammer_record_t rec;
1384
1385 /*
1386 * Get rid of the inodes in-memory records, regardless of their
1387 * state, and clear the mod-mask.
1388 */
1389 while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1390 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1391 rec->target_ip = NULL;
1392 if (rec->flush_state == HAMMER_FST_SETUP)
1393 rec->flush_state = HAMMER_FST_IDLE;
1394 }
1395 while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1396 if (rec->flush_state == HAMMER_FST_FLUSH)
1397 --rec->flush_group->refs;
1398 else
1399 hammer_ref(&rec->lock);
1400 KKASSERT(rec->lock.refs == 1);
1401 rec->flush_state = HAMMER_FST_IDLE;
1402 rec->flush_group = NULL;
1403 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1404 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1405 ++ip->rec_generation;
1406 hammer_rel_mem_record(rec);
1407 }
1408 ip->flags &= ~HAMMER_INODE_MODMASK;
1409 ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1410 KKASSERT(ip->vp == NULL);
1411
1412 /*
1413 * Remove the inode from any flush group, force it idle. FLUSH
1414 * and SETUP states have an inode ref.
1415 */
1416 switch(ip->flush_state) {
1417 case HAMMER_FST_FLUSH:
1418 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
1419 --ip->flush_group->refs;
1420 ip->flush_group = NULL;
1421 /* fall through */
1422 case HAMMER_FST_SETUP:
1423 hammer_unref(&ip->lock);
1424 ip->flush_state = HAMMER_FST_IDLE;
1425 /* fall through */
1426 case HAMMER_FST_IDLE:
1427 break;
1428 }
1429
1430 /*
1431 * There shouldn't be any associated vnode. The unload needs at
1432 * least one ref; if we do have a vp, steal its ip ref.
1433 */
1434 if (ip->vp) {
1435 kprintf("hammer_destroy_inode_callback: Unexpected "
1436 "vnode association ip %p vp %p\n", ip, ip->vp);
1437 ip->vp->v_data = NULL;
1438 ip->vp = NULL;
1439 } else {
1440 hammer_ref(&ip->lock);
1441 }
1442 hammer_unload_inode(ip);
1443 return(0);
1444}
1445
1446/*
1447 * Called on mount -u when switching from RW to RO or vice-versa. Adjust
1448 * the read-only flag for cached inodes.
1449 *
1450 * This routine is called from a RB_SCAN().
1451 */
1452int
1453hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1454{
1455 hammer_mount_t hmp = ip->hmp;
1456
1457 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1458 ip->flags |= HAMMER_INODE_RO;
1459 else
1460 ip->flags &= ~HAMMER_INODE_RO;
1461 return(0);
1462}
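/*
 * Editor's sketch (not part of the original file): per the comment
 * above this is an RB_SCAN() callback; the mount-update path presumably
 * walks every cached inode with a NULL scan compare, roughly:
 */
#if 0
	RB_SCAN(hammer_ino_rb_tree, &hmp->rb_inos_root, NULL,
		hammer_reload_inode, NULL);
#endif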
1463
1464/*
1465 * A transaction has modified an inode, requiring updates as specified by
1466 * the passed flags.
1467 *
1468 * HAMMER_INODE_DDIRTY: Inode data has been updated
1469 * HAMMER_INODE_XDIRTY: Dirty in-memory records
1470 * HAMMER_INODE_BUFS: Dirty buffer cache buffers
1471 * HAMMER_INODE_DELETED: Inode record/data must be deleted
1472 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1473 */
1474void
1475hammer_modify_inode(hammer_inode_t ip, int flags)
1476{
1477 /*
1478 * A ronly value of 0 or 2 does not trigger the assertion;
1479 * 2 is a special error state.
1480 */
1481 KKASSERT(ip->hmp->ronly != 1 ||
1482 (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1483 HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1484 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1485 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1486 ip->flags |= HAMMER_INODE_RSV_INODES;
1487 ++ip->hmp->rsv_inodes;
1488 }
1489
1490 ip->flags |= flags;
1491}
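/*
 * Editor's sketch (not part of the original file): callers pair the
 * in-memory field update with the matching flag, as
 * hammer_mkroot_pseudofs() does above for DDIRTY. new_size is a
 * placeholder:
 */
#if 0
	ip->ino_data.mtime = trans->time;
	hammer_modify_inode(ip, HAMMER_INODE_MTIME);

	ip->ino_data.size = new_size;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
#endif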
1492
1493/*
1494 * Request that an inode be flushed. This whole mess cannot block and may
1495 * recurse (if not synchronous). Once requested HAMMER will attempt to
1496 * actively flush the inode until the flush can be done.
1497 *
1498 * The inode may already be flushing, or may be in a setup state. We can
1499 * place the inode in a flushing state if it is currently idle and flag it
1500 * to reflush if it is currently flushing.
1501 *
1502 * Upon return if the inode could not be flushed due to a setup
1503 * dependency, then it will be automatically flushed when the dependency
1504 * is satisfied.
1505 */
1506void
1507hammer_flush_inode(hammer_inode_t ip, int flags)
1508{
1509 hammer_mount_t hmp;
1510 hammer_flush_group_t flg;
1511 int good;
1512
1513 /*
1514 * next_flush_group is the first flush group we can place the inode
1515 * in. It may be NULL. If it becomes full we append a new flush
1516 * group and make that the next_flush_group.
1517 */
1518 hmp = ip->hmp;
1519 while ((flg = hmp->next_flush_group) != NULL) {
1520 KKASSERT(flg->running == 0);
1521 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
1522 break;
1523 hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
1524 hammer_flusher_async(ip->hmp, flg);
1525 }
1526 if (flg == NULL) {
1527 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1528 hmp->next_flush_group = flg;
1529 TAILQ_INIT(&flg->flush_list);
1530 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1531 }
1532
1533 /*
1534 * Trivial 'nothing to flush' case. If the inode is in a SETUP
1535 * state we have to put it back into an IDLE state so we can
1536 * drop the extra ref.
1537 *
1538 * If we have a parent dependency we must still fall through
1539 * so we can run it.
1540 */
1541 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1542 if (ip->flush_state == HAMMER_FST_SETUP &&
1543 TAILQ_EMPTY(&ip->target_list)) {
1544 ip->flush_state = HAMMER_FST_IDLE;
1545 hammer_rel_inode(ip, 0);
1546 }
1547 if (ip->flush_state == HAMMER_FST_IDLE)
1548 return;
1549 }
1550
1551 /*
1552 * Our flush action will depend on the current state.
1553 */
1554 switch(ip->flush_state) {
1555 case HAMMER_FST_IDLE:
1556 /*
1557 * We have no dependencies and can flush immediately. Some of
1558 * our children may not be flushable so we have to re-test
1559 * with that additional knowledge.
1560 */
1561 hammer_flush_inode_core(ip, flg, flags);
1562 break;
1563 case HAMMER_FST_SETUP:
1564 /*
1565 * Recurse upwards through dependencies via target_list
1566 * and start their flusher actions going if possible.
1567 *
1568 * 'good' is our connectivity. -1 means we have none and
1569 * can't flush, 0 means there weren't any dependencies, and
1570 * 1 means we have good connectivity.
1571 */
1572 good = hammer_setup_parent_inodes(ip, 0, flg);
1573
1574 if (good >= 0) {
1575 /*
1576 * We can continue if good >= 0. Determine how
1577 * many records under our inode can be flushed (and
1578 * mark them).
1579 */
1580 hammer_flush_inode_core(ip, flg, flags);
1581 } else {
1582 /*
1583 * Parent has no connectivity, tell it to flush
1584 * us as soon as it does.
1585 *
1586 * The REFLUSH flag is also needed to trigger
1587 * dependency wakeups.
1588 */
1589 ip->flags |= HAMMER_INODE_CONN_DOWN |
1590 HAMMER_INODE_REFLUSH;
1591 if (flags & HAMMER_FLUSH_SIGNAL) {
1592 ip->flags |= HAMMER_INODE_RESIGNAL;
1593 hammer_flusher_async(ip->hmp, flg);
1594 }
1595 }
1596 break;
1597 case HAMMER_FST_FLUSH:
1598 /*
1599 * We are already flushing, flag the inode to reflush
1600 * if needed after it completes its current flush.
1601 *
1602 * The REFLUSH flag is also needed to trigger
1603 * dependency wakeups.
1604 */
1605 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1606 ip->flags |= HAMMER_INODE_REFLUSH;
1607 if (flags & HAMMER_FLUSH_SIGNAL) {
1608 ip->flags |= HAMMER_INODE_RESIGNAL;
1609 hammer_flusher_async(ip->hmp, flg);
1610 }
1611 break;
1612 }
1613}
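/*
 * Editor's sketch (not part of the original file): the two common
 * invocation styles of the routine above:
 */
#if 0
	hammer_flush_inode(ip, 0);			/* passive queue */
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);	/* kick flusher */
#endif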
1614
1615/*
1616 * Scan ip->target_list, which is a list of records owned by PARENT
1617 * inodes of our ip which reference our ip.
1618 *
1619 * XXX This is a huge mess of recursive code, but not one bit of it blocks
1620 * so for now do not ref/deref the structures. Note that if we use the
1621 * ref/rel code later, the rel CAN block.
1622 */
1623static int
1624hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1625 hammer_flush_group_t flg)
1626{
1627 hammer_record_t depend;
1628 int good;
1629 int r;
1630
1631 /*
1632 * If we hit our recursion limit and we have parent dependencies,
1633 * we cannot continue. Returning < 0 will cause us to be flagged
1634 * for reflush. Returning -2 cuts off additional dependency checks
1635 * because they are likely to also hit the depth limit.
1636 *
1637 * We cannot return < 0 if there are no dependencies or there might
1638 * not be anything to wakeup (ip).
1639 */
1640 if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1641 kprintf("HAMMER Warning: depth limit reached on "
1642 "setup recursion, inode %p %016llx\n",
1643 ip, (long long)ip->obj_id);
1644 return(-2);
1645 }
1646
1647 /*
1648 * Scan dependencies
1649 */
1650 good = 0;
1651 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1652 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1653 KKASSERT(depend->target_ip == ip);
1654 if (r < 0 && good == 0)
1655 good = -1;
1656 if (r > 0)
1657 good = 1;
1658
1659 /*
1660 * If we failed due to the recursion depth limit then stop
1661 * now.
1662 */
1663 if (r == -2)
1664 break;
1665 }
1666 return(good);
1667}
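/*
 * Editor's sketch (not part of the original file): the generic shape of
 * the depth cap introduced above. Names are hypothetical, and this is
 * simplified -- the real routine accumulates a connectivity score
 * rather than returning the sentinel directly:
 */
#if 0
static int
example_setup_upward(example_node_t *node, int depth)
{
	int i, r;

	if (depth == 20 && node->nparents)
		return(-2);	/* cap hit; caller flags a reflush */
	for (i = 0; i < node->nparents; ++i) {
		r = example_setup_upward(node->parent[i], depth + 1);
		if (r == -2)
			return(-2);	/* peers would hit the cap too */
	}
	return(0);
}
#endif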
1668
1669/*
1670 * This helper function takes a record representing the dependency between
1671 * the parent inode and child inode.
1672 *
1673 * record->ip = parent inode
1674 * record->target_ip = child inode
1675 *
1676 * We are asked to recurse upwards and convert the record from SETUP
1677 * to FLUSH if possible.
1678 *
1679 * Return 1 if the record gives us connectivity
1680 *
1681 * Return 0 if the record is not relevant
1682 *
1683 * Return -1 if we can't resolve the dependency and there is no connectivity.
1684 */
1685static int
1686hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1687 hammer_flush_group_t flg)
1688{
1689 hammer_mount_t hmp;
1690 hammer_inode_t pip;
1691 int good;
1692
1693 KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1694 pip = record->ip;
1695 hmp = pip->hmp;
1696
1697 /*
1698 * If the record is already flushing, is it in our flush group?
1699 *
1700 * If it is in our flush group but it is a general record or a
1701 * delete-on-disk, it does not improve our connectivity (return 0),
1702 * and if the target inode is not trying to destroy itself we can't
1703 * allow the operation yet anyway (the second return -1).
1704 */
1705 if (record->flush_state == HAMMER_FST_FLUSH) {
1706 /*
1707 * If not in our flush group ask the parent to reflush
1708 * us as soon as possible.
1709 */
1710 if (record->flush_group != flg) {
1711 pip->flags |= HAMMER_INODE_REFLUSH;
1712 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1713 return(-1);
1714 }
1715
1716 /*
1717 * If in our flush group everything is already set up,
1718 * just return whether the record will improve our
1719 * visibility or not.
1720 */
1721 if (record->type == HAMMER_MEM_RECORD_ADD)
1722 return(1);
1723 return(0);
1724 }
1725
1726 /*
1727 * It must be a setup record. Try to resolve the setup dependencies
1728 * by recursing upwards so we can place ip on the flush list.
1729 *
1730 * Limit ourselves to 20 levels of recursion to avoid blowing out
1731 * the kernel stack. If we hit the recursion limit we can't flush
1732 * until the parent flushes. The parent will flush independently
1733 * on its own and ultimately a deep recursion will be resolved.
1734 */
1735 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1736
1737 good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1738
1739 /*
1740 * If good < 0 the parent has no connectivity and we cannot safely
1741 * flush the directory entry, which also means we can't flush our
1742 * ip. Flag us for downward recursion once the parent's
1743 * connectivity is resolved. Flag the parent for [re]flush or it
1744 * may not check for downward recursions.
1745 */
1746 if (good < 0) {
1747 pip->flags |= HAMMER_INODE_REFLUSH;
1748 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1749 return(good);
1750 }
1751
1752 /*
1753 * We are go, place the parent inode in a flushing state so we can
1754 * place its record in a flushing state. Note that the parent
1755 * may already be flushing. The record must be in the same flush
1756 * group as the parent.
1757 */
1758 if (pip->flush_state != HAMMER_FST_FLUSH)
1759 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1760 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1761 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1762
1763#if 0
1764 if (record->type == HAMMER_MEM_RECORD_DEL &&
1765 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1766 /*
1767 * Regardless of flushing state we cannot sync this path if the
1768 * record represents a delete-on-disk but the target inode
1769 * is not ready to sync its own deletion.
1770 *
1771 * XXX need to count effective nlinks to determine whether
1772 * the flush is ok, otherwise removing a hardlink will
1773 * just leave the DEL record to rot.
1774 */
1775 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1776 return(-1);
1777 } else
1778#endif
1779 if (pip->flush_group == flg) {
1780 /*
1781 * Because we have not calculated nlinks yet we can just
1782 * set records to the flush state if the parent is in
1783 * the same flush group as we are.
1784 */
1785 record->flush_state = HAMMER_FST_FLUSH;
1786 record->flush_group = flg;
1787 ++record->flush_group->refs;
1788 hammer_ref(&record->lock);
1789
1790 /*
1791 * A general directory-add contributes to our visibility.
1792 *
1793 * Otherwise it is probably a directory-delete or
1794 * delete-on-disk record and does not contribute to our
1795 * visibility (but we can still flush it).
1796 */
1797 if (record->type == HAMMER_MEM_RECORD_ADD)
1798 return(1);
1799 return(0);
1800 } else {
1801 /*
1802 * If the parent is not in our flush group we cannot
1803 * flush this record yet, there is no visibility.
1804 * We tell the parent to reflush and mark ourselves
1805 * so the parent knows it should flush us too.
1806 */
1807 pip->flags |= HAMMER_INODE_REFLUSH;
1808 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1809 return(-1);
1810 }
1811}
1812
1813/*
1814 * This is the core routine placing an inode into the FST_FLUSH state.
1815 */
1816static void
1817hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1818{
1819 int go_count;
1820
1821 /*
1822 * Set flush state and prevent the flusher from cycling into
1823 * the next flush group. Do not place the ip on the list yet.
1824 * Inodes not in the idle state get an extra reference.
1825 */
1826 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1827 if (ip->flush_state == HAMMER_FST_IDLE)
1828 hammer_ref(&ip->lock);
1829 ip->flush_state = HAMMER_FST_FLUSH;
1830 ip->flush_group = flg;
1831 ++ip->hmp->flusher.group_lock;
1832 ++ip->hmp->count_iqueued;
1833 ++hammer_count_iqueued;
1834 ++flg->total_count;
1835
1836 /*
1837 * If the flush group reaches the autoflush limit we want to signal
1838 * the flusher. This is particularly important for remove()s.
1839 */
1840 if (flg->total_count == hammer_autoflush)
1841 flags |= HAMMER_FLUSH_SIGNAL;
1842
1843 /*
1844 * We need to be able to vfsync/truncate from the backend.
1845 */
1846 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1847 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1848 ip->flags |= HAMMER_INODE_VHELD;
1849 vref(ip->vp);
1850 }
1851
1852 /*
1853 * Figure out how many in-memory records we can actually flush
1854 * (not including inode meta-data, buffers, etc).
1855 */
1856 KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
1857 if (flags & HAMMER_FLUSH_RECURSION) {
1858 /*
1859 * If this is an upwards recursion we do not want to
1860 * recurse down again!
1861 */
1862 go_count = 1;
1863#if 0
1864 } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1865 /*
1866 * No new records are added if we must complete a flush
1867 * from a previous cycle, but we do have to move the records
1868 * from the previous cycle to the current one.
1869 */
1870#if 0
1871 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1872 hammer_syncgrp_child_callback, NULL);
1873#endif
1874 go_count = 1;
1875#endif
1876 } else {
1877 /*
1878 * Normal flush, scan records and bring them into the flush.
1879 * Directory adds and deletes are usually skipped (they are
1880 * grouped with the related inode rather than with the
1881 * directory).
1882 *
1883 * go_count can be negative, which means the scan aborted
1884 * due to the flush group being over-full and we should
1885 * flush what we have.
1886 */
1887 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1888 hammer_setup_child_callback, NULL);
1889 }
1890
1891 /*
1892 * This is a more involved test that includes go_count. If we
1893 * can't flush, flag the inode and return. If go_count is 0 we
1894 * are unable to flush any records in our rec_tree and
1895 * must ignore the XDIRTY flag.
1896 */
1897 if (go_count == 0) {
1898 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1899 --ip->hmp->count_iqueued;
1900 --hammer_count_iqueued;
1901
1902 --flg->total_count;
1903 ip->flush_state = HAMMER_FST_SETUP;
1904 ip->flush_group = NULL;
1905 if (ip->flags & HAMMER_INODE_VHELD) {
1906 ip->flags &= ~HAMMER_INODE_VHELD;
1907 vrele(ip->vp);
1908 }
1909
1910 /*
1911 * REFLUSH is needed to trigger dependency wakeups
1912 * when an inode is in SETUP.
1913 */
1914 ip->flags |= HAMMER_INODE_REFLUSH;
1915 if (flags & HAMMER_FLUSH_SIGNAL) {
1916 ip->flags |= HAMMER_INODE_RESIGNAL;
1917 hammer_flusher_async(ip->hmp, flg);
1918 }
1919 if (--ip->hmp->flusher.group_lock == 0)
1920 wakeup(&ip->hmp->flusher.group_lock);
1921 return;
1922 }
1923 }
1924
1925 /*
1926 * Snapshot the state of the inode for the backend flusher.
1927 *
1928 * We continue to retain save_trunc_off even when all truncations
1929 * have been resolved as an optimization to determine if we can
1930 * skip the B-Tree lookup for overwrite deletions.
1931 *
1932 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
1933 * and stays in ip->flags. Once set, it stays set until the
1934 * inode is destroyed.
1935 */
1936 if (ip->flags & HAMMER_INODE_TRUNCATED) {
1937 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
1938 ip->sync_trunc_off = ip->trunc_off;
1939 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1940 ip->flags &= ~HAMMER_INODE_TRUNCATED;
1941 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
1942
1943 /*
1944 * The save_trunc_off used to cache whether the B-Tree
1945 * holds any records past that point is not used until
1946 * after the truncation has succeeded, so we can safely
1947 * set it now.
1948 */
1949 if (ip->save_trunc_off > ip->sync_trunc_off)
1950 ip->save_trunc_off = ip->sync_trunc_off;
1951 }
1952 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
1953 ~HAMMER_INODE_TRUNCATED);
1954 ip->sync_ino_leaf = ip->ino_leaf;
1955 ip->sync_ino_data = ip->ino_data;
1956 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
1957#ifdef DEBUG_TRUNCATE
1958 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
1959 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
1960#endif
1961
1962 /*
1963 * The flusher list inherits our inode and reference.
1964 */
1965 KKASSERT(flg->running == 0);
1966 TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
1967 if (--ip->hmp->flusher.group_lock == 0)
1968 wakeup(&ip->hmp->flusher.group_lock);
1969
1970 if (flags & HAMMER_FLUSH_SIGNAL) {
1971 hammer_flusher_async(ip->hmp, flg);
1972 }
1973}
1974
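/*
 * Sketch of how the flags are used: a frontend-initiated flush enters
 * without HAMMER_FLUSH_RECURSION and scans the rec_tree, while
 * dependency-driven re-entry (see the calls above and below) passes
 * HAMMER_FLUSH_RECURSION so we do not recurse downward again.
 * HAMMER_FLUSH_SIGNAL additionally wakes the flusher thread.
 */
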
1975/*
1976 * Callback for scan of ip->rec_tree. Try to include each record in our
1977 * flush. ip->flush_group has been set but the inode has not yet been
1978 * moved into a flushing state.
1979 *
1980 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
1981 * both inodes.
1982 *
1983 * We return 1 for any record placed or found in FST_FLUSH, which prevents
1984 * the caller from shortcutting the flush.
1985 */
1986static int
1987hammer_setup_child_callback(hammer_record_t rec, void *data)
1988{
1989 hammer_flush_group_t flg;
1990 hammer_inode_t target_ip;
1991 hammer_inode_t ip;
1992 int r;
1993
1994 /*
1995 * Records deleted or committed by the backend are ignored.
1996 * Note that the flush detects deleted frontend records at
1997 * multiple points to deal with races. This is just the first
1998 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot
1999 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2000 * messes up link-count calculations.
2001 *
2002 * NOTE: Don't get confused between record deletion and, say,
2003 * directory entry deletion. The deletion of a directory entry
2004 * which is on-media has nothing to do with the record deletion
2005 * flags.
2006 */
2007 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2008 HAMMER_RECF_COMMITTED)) {
2009 if (rec->flush_state == HAMMER_FST_FLUSH) {
2010 KKASSERT(rec->flush_group == rec->ip->flush_group);
2011 r = 1;
2012 } else {
2013 r = 0;
2014 }
2015 return(r);
2016 }
2017
2018 /*
2019 * If the record is in an idle state it has no dependencies and
2020 * can be flushed.
2021 */
2022 ip = rec->ip;
2023 flg = ip->flush_group;
2024 r = 0;
2025
2026 switch(rec->flush_state) {
2027 case HAMMER_FST_IDLE:
2028 /*
2029 * The record has no setup dependency; we can flush it.
2030 */
2031 KKASSERT(rec->target_ip == NULL);
2032 rec->flush_state = HAMMER_FST_FLUSH;
2033 rec->flush_group = flg;
2034 ++flg->refs;
2035 hammer_ref(&rec->lock);
2036 r = 1;
2037 break;
2038 case HAMMER_FST_SETUP:
2039 /*
2040 * The record has a setup dependency. These are typically
2041 * directory entry adds and deletes. Such entries will be
2042 * flushed when their inodes are flushed so we do not
2043 * usually have to add them to the flush here. However,
2044 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2045 * it is asking us to flush this record (and itself).
2046 */
2047 target_ip = rec->target_ip;
2048 KKASSERT(target_ip != NULL);
2049 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2050
2051 /*
2052 * If the target IP is already flushing in our group
2053 * we could associate the record, but target_ip has
2054 * already synced ino_data to sync_ino_data and we
2055 * would also have to adjust nlinks. Plus there are
2056 * ordering issues for adds and deletes.
2057 *
2058 * Reflush downward if this is an ADD, and upward if
2059 * this is a DEL.
2060 */
2061 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2062 if (rec->type == HAMMER_MEM_RECORD_ADD)
2063 ip->flags |= HAMMER_INODE_REFLUSH;
2064 else
2065 target_ip->flags |= HAMMER_INODE_REFLUSH;
2066 break;
2067 }
2068
2069 /*
2070 * Target IP is not yet flushing. This can get complex
2071 * because we have to be careful about the recursion.
2072 *
2073 * Directories create an issue for us in that if a flush
2074 * of a directory is requested the expectation is to flush
2075 * any pending directory entries, but this will cause the
2076 * related inodes to recursively flush as well. We can't
2077 * really defer the operation, so we just pull in as many
2078 * records as we can.
2079 */
2080#if 0
2081 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2082 (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2083 /*
2084 * We aren't reclaiming and the target ip was not
2085 * previously prevented from flushing due to this
2086 * record dependency. Do not flush this record.
2087 */
2088 /*r = 0;*/
2089 } else
2090#endif
2091 if (flg->total_count + flg->refs >
2092 ip->hmp->undo_rec_limit) {
2093 /*
2094 * Our flush group is over-full and we risk blowing
2095 * out the UNDO FIFO. Stop the scan, flush what we
2096 * have, then reflush the directory.
2097 *
2098 * The directory may be forced through multiple
2099 * flush groups before it can be completely
2100 * flushed.
2101 */
2102 ip->flags |= HAMMER_INODE_RESIGNAL |
2103 HAMMER_INODE_REFLUSH;
2104 r = -1;
2105 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2106 /*
2107 * If the target IP is not flushing we can force
2108 * it to flush, even if it is unable to write out
2109 * any of its own records we have at least one in
2110 * hand that we CAN deal with.
2111 */
2112 rec->flush_state = HAMMER_FST_FLUSH;
2113 rec->flush_group = flg;
2114 ++flg->refs;
2115 hammer_ref(&rec->lock);
2116 hammer_flush_inode_core(target_ip, flg,
2117 HAMMER_FLUSH_RECURSION);
2118 r = 1;
2119 } else {
2120 /*
2121 * General or delete-on-disk record.
2122 *
2123 * XXX this needs help. If this is a delete-on-disk record
2124 * we could disconnect the target. If the target has its own
2125 * dependencies they really need to be flushed.
2126 *
2127 * XXX
2128 */
2129 rec->flush_state = HAMMER_FST_FLUSH;
2130 rec->flush_group = flg;
2131 ++flg->refs;
2132 hammer_ref(&rec->lock);
2133 hammer_flush_inode_core(target_ip, flg,
2134 HAMMER_FLUSH_RECURSION);
2135 r = 1;
2136 }
2137 break;
2138 case HAMMER_FST_FLUSH:
2139 /*
2140 * The flush_group should already match.
2141 */
2142 KKASSERT(rec->flush_group == flg);
2143 r = 1;
2144 break;
2145 }
2146 return(r);
2147}
2148
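/*
 * Note on aggregation: RB_SCAN() sums the callback's non-negative
 * returns into go_count in hammer_flush_inode_core(), while a negative
 * return aborts the scan, which is how the over-full flush group case
 * above propagates back as a negative go_count.
 */
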
2149#if 0
2150/*
2151 * This version just moves records already in a flush state to the new
2152 * flush group and that is it.
2153 */
2154static int
2155hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2156{
2157 hammer_inode_t ip = rec->ip;
2158
2159 switch(rec->flush_state) {
2160 case HAMMER_FST_FLUSH:
2161 KKASSERT(rec->flush_group == ip->flush_group);
2162 break;
2163 default:
2164 break;
2165 }
2166 return(0);
2167}
2168#endif
2169
2170/*
2171 * Wait for a previously queued flush to complete.
2172 *
2173 * If a critical error occurred we don't try to wait.
2174 */
2175void
2176hammer_wait_inode(hammer_inode_t ip)
2177{
2178 hammer_flush_group_t flg;
2179
2180 flg = NULL;
2181 if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2182 while (ip->flush_state != HAMMER_FST_IDLE &&
2183 (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2184 if (ip->flush_state == HAMMER_FST_SETUP)
2185 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2186 if (ip->flush_state != HAMMER_FST_IDLE) {
2187 ip->flags |= HAMMER_INODE_FLUSHW;
2188 tsleep(&ip->flags, 0, "hmrwin", 0);
2189 }
2190 }
2191 }
2192}
2193
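/*
 * Sketch of the typical synchronous-flush pairing built on the
 * routines above (roughly what an fsync-style path does; error
 * handling omitted):
 */
#if 0
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); /* queue, wake flusher */
	hammer_wait_inode(ip);			     /* sleep until FST_IDLE */
#endif
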
2194/*
2195 * Called by the backend code when a flush has been completed.
2196 * The inode has already been removed from the flush list.
2197 *
2198 * A pipelined flush can occur, in which case we must re-enter the
2199 * inode on the list and re-copy its fields.
2200 */
2201void
2202hammer_flush_inode_done(hammer_inode_t ip, int error)
2203{
2204 hammer_mount_t hmp;
2205 int dorel;
2206
2207 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2208
2209 hmp = ip->hmp;
2210
2211 /*
2212 * Auto-reflush if the backend could not completely flush
2213 * the inode. This fixes a case where a deferred buffer flush
2214 * could cause fsync to return early.
2215 */
2216 if (ip->sync_flags & HAMMER_INODE_MODMASK)
2217 ip->flags |= HAMMER_INODE_REFLUSH;
2218
2219 /*
2220 * Merge left-over flags back into the frontend and fix the state.
2221 * Incomplete truncations are retained by the backend.
2222 */
2223 ip->error = error;
2224 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2225 ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2226
2227 /*
2228 * The backend may have adjusted nlinks, so if the adjusted nlinks
2229 * does not match the frontend, set the frontend's DDIRTY flag again.
2230 */
2231 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2232 ip->flags |= HAMMER_INODE_DDIRTY;
2233
2234 /*
2235 * Fix up the dirty buffer status.
2236 */
2237 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2238 ip->flags |= HAMMER_INODE_BUFS;
2239 }
2240
2241 /*
2242 * Re-set the XDIRTY flag if some of the inode's in-memory records
2243 * could not be flushed.
2244 */
2245 KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2246 (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2247 (!RB_EMPTY(&ip->rec_tree) &&
2248 (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2249
2250 /*
2251 * Do not lose track of inodes which no longer have vnode
2252 * associations, otherwise they may never get flushed again.
2253 *
2254 * The reflush flag can be set superfluously, causing extra pain
2255 * for no reason. If the inode is no longer modified it no longer
2256 * needs to be flushed.
2257 */
2258 if (ip->flags & HAMMER_INODE_MODMASK) {
2259 if (ip->vp == NULL)
2260 ip->flags |= HAMMER_INODE_REFLUSH;
2261 } else {
2262 ip->flags &= ~HAMMER_INODE_REFLUSH;
2263 }
2264
2265 /*
2266 * Adjust the flush state.
2267 */
2268 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2269 /*
2270 * We were unable to flush out all our records, leave the
2271 * inode in a flush state and in the current flush group.
2272 * The flush group will be re-run.
2273 *
2274 * This occurs if the UNDO block gets too full or there is
2275 * too much dirty meta-data and allows the flusher to
2276 * finalize the UNDO block and then re-flush.
2277 */
2278 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2279 dorel = 0;
2280 } else {
2281 /*
2282 * Remove from the flush_group
2283 */
2284 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
2285 ip->flush_group = NULL;
2286
2287 /*
2288 * Clean up the vnode ref and tracking counts.
2289 */
2290 if (ip->flags & HAMMER_INODE_VHELD) {
2291 ip->flags &= ~HAMMER_INODE_VHELD;
2292 vrele(ip->vp);
2293 }
2294 --hmp->count_iqueued;
2295 --hammer_count_iqueued;
2296
2297 /*
2298 * And adjust the state.
2299 */
2300 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2301 ip->flush_state = HAMMER_FST_IDLE;
2302 dorel = 1;
2303 } else {
2304 ip->flush_state = HAMMER_FST_SETUP;
2305 dorel = 0;
2306 }
2307
2308 /*
2309 * If the frontend is waiting for a flush to complete,
2310 * wake it up.
2311 */
2312 if (ip->flags & HAMMER_INODE_FLUSHW) {
2313 ip->flags &= ~HAMMER_INODE_FLUSHW;
2314 wakeup(&ip->flags);
2315 }
2316
2317 /*
2318 * If the frontend made more changes and requested another
2319 * flush, then try to get it running.
2320 *
2321 * Reflushes are aborted when the inode is errored out.
2322 */
2323 if (ip->flags & HAMMER_INODE_REFLUSH) {
2324 ip->flags &= ~HAMMER_INODE_REFLUSH;
2325 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2326 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2327 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2328 } else {
2329 hammer_flush_inode(ip, 0);
2330 }
2331 }
2332 }
2333
2334 /*
2335 * If we have no parent dependencies we can clear CONN_DOWN
2336 */
2337 if (TAILQ_EMPTY(&ip->target_list))
2338 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2339
2340 /*
2341 * If the inode is now clean drop the space reservation.
2342 */
2343 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2344 (ip->flags & HAMMER_INODE_RSV_INODES)) {
2345 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2346 --hmp->rsv_inodes;
2347 }
2348
2349 if (dorel)
2350 hammer_rel_inode(ip, 0);
2351}
2352
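/*
 * In summary, the state transitions performed above are:
 *
 *	FST_FLUSH -> FST_FLUSH	WOULDBLOCK, stay in group for a re-run
 *	FST_FLUSH -> FST_SETUP	records or dependencies remain
 *	FST_FLUSH -> FST_IDLE	fully flushed, extra reference dropped
 */
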
2353/*
2354 * Called from hammer_sync_inode() to synchronize in-memory records
2355 * to the media.
2356 */
2357static int
2358hammer_sync_record_callback(hammer_record_t record, void *data)
2359{
2360 hammer_cursor_t cursor = data;
2361 hammer_transaction_t trans = cursor->trans;
2362 hammer_mount_t hmp = trans->hmp;
2363 int error;
2364
2365 /*
2366 * Skip records that do not belong to the current flush.
2367 */
2368 ++hammer_stats_record_iterations;
2369 if (record->flush_state != HAMMER_FST_FLUSH)
2370 return(0);
2371
2372#if 1
2373 if (record->flush_group != record->ip->flush_group) {
2374 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2375 Debugger("blah2");
2376 return(0);
2377 }
2378#endif
2379 KKASSERT(record->flush_group == record->ip->flush_group);
2380
2381 /*
2382 * Interlock the record using the BE flag. Once BE is set the
2383 * frontend cannot change the state of FE.
2384 *
2385 * NOTE: If FE is set prior to us setting BE we still sync the
2386 * record out, but the flush completion code converts it to
2387 * a delete-on-disk record instead of destroying it.
2388 */
2389 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2390 record->flags |= HAMMER_RECF_INTERLOCK_BE;
2391
2392 /*
2393 * The backend has already disposed of the record.
2394 */
2395 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2396 error = 0;
2397 goto done;
2398 }
2399
2400 /*
2401 * If the whole inode is being deleted, all on-disk records will
2402 * be deleted very soon, we can't sync any new records to disk
2403 * because they will be deleted in the same transaction they were
2404 * created in (delete_tid == create_tid), which will assert.
2405 *
2406 * XXX There may be a case with RECORD_ADD with DELETED_FE set
2407 * that we currently panic on.
2408 */
2409 if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2410 switch(record->type) {
2411 case HAMMER_MEM_RECORD_DATA:
2412 /*
2413 * We don't have to do anything, if the record was
2414 * committed the space will have been accounted for
2415 * in the blockmap.
2416 */
2417 /* fall through */
2418 case HAMMER_MEM_RECORD_GENERAL:
2419 /*
2420 * Set deleted-by-backend flag. Do not set the
2421 * backend committed flag, because we are throwing
2422 * the record away.
2423 */
2424 record->flags |= HAMMER_RECF_DELETED_BE;
2425 ++record->ip->rec_generation;
2426 error = 0;
2427 goto done;
2428 case HAMMER_MEM_RECORD_ADD:
2429 panic("hammer_sync_record_callback: illegal add "
2430 "during inode deletion record %p", record);
2431 break; /* NOT REACHED */
2432 case HAMMER_MEM_RECORD_INODE:
2433 panic("hammer_sync_record_callback: attempt to "
2434 "sync inode record %p?", record);
2435 break; /* NOT REACHED */
2436 case HAMMER_MEM_RECORD_DEL:
2437 /*
2438 * Follow through and issue the on-disk deletion
2439 */
2440 break;
2441 }
2442 }
2443
2444 /*
2445 * If DELETED_FE is set, special handling is needed for directory
2446 * entries. Dependent pieces related to the directory entry may
2447 * have already been synced to disk. If this occurs we have to
2448 * sync the directory entry and then change the in-memory record
2449 * from an ADD to a DELETE to cover the fact that it's been
2450 * deleted by the frontend.
2451 *
2452 * A directory delete covering record (MEM_RECORD_DEL) can never
2453 * be deleted by the frontend.
2454 *
2455 * Any other record type (aka DATA) can be deleted by the frontend.
2456 * XXX At the moment the flusher must skip it because there may
2457 * be another data record in the flush group for the same block,
2458 * meaning that some frontend data changes can leak into the backend's
2459 * synchronization point.
2460 */
2461 if (record->flags & HAMMER_RECF_DELETED_FE) {
2462 if (record->type == HAMMER_MEM_RECORD_ADD) {
2463 /*
2464 * Convert a front-end deleted directory-add to
2465 * a directory-delete entry later.
2466 */
2467 record->flags |= HAMMER_RECF_CONVERT_DELETE;
2468 } else {
2469 /*
2470 * Dispose of the record (race case). Mark as
2471 * deleted by backend (and not committed).
2472 */
2473 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2474 record->flags |= HAMMER_RECF_DELETED_BE;
2475 ++record->ip->rec_generation;
2476 error = 0;
2477 goto done;
2478 }
2479 }
2480
2481 /*
2482 * Assign the create_tid for new records. Deletions already
2483 * have the record's entire key properly set up.
2484 */
2485 if (record->type != HAMMER_MEM_RECORD_DEL) {
2486 record->leaf.base.create_tid = trans->tid;
2487 record->leaf.create_ts = trans->time32;
2488 }
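	/*
	 * Sync the record.  An EDEADLK from the B-Tree layer means the
	 * cursor collided with another lock holder; rather than risk a
	 * deadlock we tear the cursor down, rebuild it, and retry.
	 */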
2489 for (;;) {
2490 error = hammer_ip_sync_record_cursor(cursor, record);
2491 if (error != EDEADLK)
2492 break;
2493 hammer_done_cursor(cursor);
2494 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2495 record->ip);
2496 if (error)
2497 break;
2498 }
2499 record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2500
2501 if (error)
2502 error = -error;
2503done:
2504 hammer_flush_record_done(record, error);
2505
2506 /*
2507 * Do partial finalization if we have built up too many dirty
2508 * buffers. Otherwise a buffer cache deadlock can occur when
2509 * doing things like creating tens of thousands of tiny files.
2510 *
2511 * We must release our cursor lock to avoid a 3-way deadlock
2512 * due to the exclusive sync lock the finalizer must get.
2513 */
2514 if (hammer_flusher_meta_limit(hmp)) {
2515 hammer_unlock_cursor(cursor);
2516 hammer_flusher_finalize(trans, 0);
2517 hammer_lock_cursor(cursor);
2518 }
2519
2520 return(error);
2521}
2522
2523/*
2524 * Backend function called by the flusher to sync an inode to media.
2525 */
2526int
2527hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2528{
2529 struct hammer_cursor cursor;
2530 hammer_node_t tmp_node;
2531 hammer_record_t depend;
2532 hammer_record_t next;
2533 int error, tmp_error;
2534 u_int64_t nlinks;
2535
2536 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2537 return(0);
2538
2539 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2540 if (error)
2541 goto done;
2542
2543 /*
2544 * Any directory records referencing this inode which are not in
2545 * our current flush group must adjust our nlink count for the
2546 * purposes of synchronization to disk.
2547 *
2548 * Records which are in our flush group can be unlinked from our
2549 * inode now, potentially allowing the inode to be physically
2550 * deleted.
2551 *
2552 * This cannot block.
2553 */
2554 nlinks = ip->ino_data.nlinks;
2555 next = TAILQ_FIRST(&ip->target_list);
2556 while ((depend = next) != NULL) {
2557 next = TAILQ_NEXT(depend, target_entry);
2558 if (depend->flush_state == HAMMER_FST_FLUSH &&
2559 depend->flush_group == ip->flush_group) {
2560 /*
2561 * If this is an ADD that was deleted by the frontend
2562 * the frontend nlinks count will have already been
2563 * decremented, but the backend is going to sync its
2564 * directory entry and must account for it. The
2565 * record will be converted to a delete-on-disk when
2566 * it gets synced.
2567 *
2568 * If the ADD was not deleted by the frontend we
2569 * can remove the dependency from our target_list.
2570 */
2571 if (depend->flags & HAMMER_RECF_DELETED_FE) {
2572 ++nlinks;
2573 } else {
2574 TAILQ_REMOVE(&ip->target_list, depend,
2575 target_entry);
2576 depend->target_ip = NULL;
2577 }
2578 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2579 /*
2580 * Not part of our flush group and not deleted by
2581 * the front-end, adjust the link count synced to
2582 * the media (undo what the frontend did when it
2583 * queued the record).
2584 */
2585 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2586 switch(depend->type) {
2587 case HAMMER_MEM_RECORD_ADD:
2588 --nlinks;
2589 break;
2590 case HAMMER_MEM_RECORD_DEL:
2591 ++nlinks;
2592 break;
2593 default:
2594 break;
2595 }
2596 }
2597 }
2598
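	/*
	 * Example of the adjustment above: a frontend unlink whose
	 * MEM_RECORD_DEL fell into a *later* flush group bumps nlinks
	 * back up here; the media only sees the link go away in the
	 * flush group that carries the DEL record itself.
	 */
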
2599 /*
2600 * Set dirty if we had to modify the link count.
2601 */
2602 if (ip->sync_ino_data.nlinks != nlinks) {
2603 KKASSERT((int64_t)nlinks >= 0);
2604 ip->sync_ino_data.nlinks = nlinks;
2605 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2606 }
2607
2608 /*
2609 * If there is a truncation queued, destroy any data past the (aligned)
2610 * truncation point. Userland will have dealt with the buffer
2611 * containing the truncation point for us.
2612 *
2613 * We don't flush pending frontend data buffers until after we've
2614 * dealt with the truncation.
2615 */
2616 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2617 /*
2618 * Interlock trunc_off. The VOP front-end may continue to
2619 * make adjustments to it while we are blocked.
2620 */
2621 off_t trunc_off;
2622 off_t aligned_trunc_off;
2623 int blkmask;
2624
2625 trunc_off = ip->sync_trunc_off;
2626 blkmask = hammer_blocksize(trunc_off) - 1;
2627 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
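
	/*
	 * e.g. assuming a 16K (0x4000) block size at this offset:
	 * trunc_off 0x4100 gives blkmask 0x3FFF and aligned_trunc_off
	 * 0x8000; the partially truncated block at 0x4000 was already
	 * handled by the frontend.
	 */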
2628
2629 /*
2630 * Delete any whole blocks on-media. The front-end has
2631 * already cleaned out any partial block and made it
2632 * pending. The front-end may have updated trunc_off
2633 * while we were blocked so we only use sync_trunc_off.
2634 *
2635 * This operation can blow out the buffer cache, EWOULDBLOCK
2636 * means we were unable to complete the deletion. The
2637 * deletion will update sync_trunc_off in that case.
2638 */
2639 error = hammer_ip_delete_range(&cursor, ip,
2640 aligned_trunc_off,
2641 0x7FFFFFFFFFFFFFFFLL, 2);
2642 if (error == EWOULDBLOCK) {
2643 ip->flags |= HAMMER_INODE_WOULDBLOCK;
2644 error = 0;
2645 goto defer_buffer_flush;
2646 }
2647
2648 if (error)
2649 goto done;
2650
2651 /*
2652 * Clear the truncation flag on the backend after we have
2653 * completed the deletions. Backend data is now good again
2654 * (including new records we are about to sync, below).
2655 *
2656 * Leave sync_trunc_off intact. As we write additional
2657 * records the backend will update sync_trunc_off. This
2658 * tells the backend whether it can skip the overwrite
2659 * test. This should work properly even when the backend
2660 * writes full blocks where the truncation point straddles
2661 * the block because the comparison is against the base
2662 * offset of the record.
2663 */
2664 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2665 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2666 } else {
2667 error = 0;
2668 }
2669
2670 /*
2671 * Now sync related records. These will typically be directory
2672 * entries, records tracking direct-writes, or delete-on-disk records.
2673 */
2674 if (error == 0) {
2675 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2676 hammer_sync_record_callback, &cursor);
2677 if (tmp_error < 0)
2678 tmp_error = -tmp_error;
2679 if (tmp_error)
2680 error = tmp_error;
2681 }
2682 hammer_cache_node(&ip->cache[1], cursor.node);
2683
2684 /*
2685 * Re-seek for inode update, assuming our cache hasn't been ripped
2686 * out from under us.
2687 */
2688 if (error == 0) {
2689 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2690 if (tmp_node) {
2691 hammer_cursor_downgrade(&cursor);
2692 hammer_lock_sh(&tmp_node->lock);
2693 if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2694 hammer_cursor_seek(&cursor, tmp_node, 0);
2695 hammer_unlock(&tmp_node->lock);
2696 hammer_rel_node(tmp_node);
2697 }
2698 error = 0;
2699 }
2700
2701 /*
2702 * If we are deleting the inode the frontend had better not have
2703 * any active references on elements making up the inode.
2704 *
2705 * The call to hammer_ip_delete_clean() cleans up auxiliary records
2706 * but not DB or DATA records. Those must have already been deleted
2707 * by the normal truncation mechanic.
2708 */
2709 if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2710 RB_EMPTY(&ip->rec_tree) &&
2711 (ip->sync_flags & HAMMER_INODE_DELETING) &&
2712 (ip->flags & HAMMER_INODE_DELETED) == 0) {
2713 int count1 = 0;
2714
2715 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2716 if (error == 0) {
2717 ip->flags |= HAMMER_INODE_DELETED;
2718 ip->sync_flags &= ~HAMMER_INODE_DELETING;
2719 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2720 KKASSERT(RB_EMPTY(&ip->rec_tree));
2721
2722 /*
2723 * Set delete_tid in both the frontend and backend
2724 * copy of the inode record. The DELETED flag handles
2725 * this; do not set DDIRTY.
2726 */
2727 ip->ino_leaf.base.delete_tid = trans->tid;
2728 ip->sync_ino_leaf.base.delete_tid = trans->tid;
2729 ip->ino_leaf.delete_ts = trans->time32;
2730 ip->sync_ino_leaf.delete_ts = trans->time32;
2731
2732
2733 /*
2734 * Adjust the inode count in the volume header
2735 */
2736 hammer_sync_lock_sh(trans);
2737 if (ip->flags & HAMMER_INODE_ONDISK) {
2738 hammer_modify_volume_field(trans,
2739 trans->rootvol,
2740 vol0_stat_inodes);
2741 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2742 hammer_modify_volume_done(trans->rootvol);
2743 }
2744 hammer_sync_unlock(trans);
2745 }
2746 }
2747
2748 if (error)
2749 goto done;
2750 ip->sync_flags &= ~HAMMER_INODE_BUFS;
2751
2752defer_buffer_flush:
2753 /*
2754 * Now update the inode's on-disk inode-data and/or on-disk record.
2755 * DELETED and ONDISK are managed only in ip->flags.
2756 *
2757 * In the case of a deferred buffer flush we still update the on-disk
2758 * inode to satisfy visibility requirements if there happen to be
2759 * directory dependencies.
2760 */
2761 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2762 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2763 /*
2764 * If deleted and on-disk, don't set any additional flags.
2765 * The delete flag takes care of things.
2766 *
2767 * Clear flags which may have been set by the frontend.
2768 */
2769 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2770 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2771 HAMMER_INODE_DELETING);
2772 break;
2773 case HAMMER_INODE_DELETED:
2774 /*
2775 * Take care of the case where a deleted inode was never
2776 * flushed to the disk in the first place.
2777 *
2778 * Clear flags which may have been set by the frontend.
2779 */
2780 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2781 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2782 HAMMER_INODE_DELETING);
2783 while (RB_ROOT(&ip->rec_tree)) {
2784 hammer_record_t record = RB_ROOT(&ip->rec_tree);
2785 hammer_ref(&record->lock);
2786 KKASSERT(record->lock.refs == 1);
2787 record->flags |= HAMMER_RECF_DELETED_BE;
2788 ++record->ip->rec_generation;
2789 hammer_rel_mem_record(record);
2790 }
2791 break;
2792 case HAMMER_INODE_ONDISK:
2793 /*
2794 * If already on-disk, do not set any additional flags.
2795 */
2796 break;
2797 default:
2798 /*
2799 * If not on-disk and not deleted, set DDIRTY to force
2800 * an initial record to be written.
2801 *
2802 * Also set the create_tid in both the frontend and backend
2803 * copy of the inode record.
2804 */
2805 ip->ino_leaf.base.create_tid = trans->tid;
2806 ip->ino_leaf.create_ts = trans->time32;
2807 ip->sync_ino_leaf.base.create_tid = trans->tid;
2808 ip->sync_ino_leaf.create_ts = trans->time32;
2809 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2810 break;
2811 }
2812
2813 /*
2814 * If DDIRTY is set, write out a new record. If the inode
2815 * is already on-disk the old record is marked as deleted.
2816 *
2817 * If DELETED is set hammer_update_inode() will delete the existing
2818 * record without writing out a new one.
2819 *
2820 * If *ONLY* the ITIMES flag is set we can update the record in-place.
2821 */
2822 if (ip->flags & HAMMER_INODE_DELETED) {
2823 error = hammer_update_inode(&cursor, ip);
2824 } else
2825 if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
2826 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2827 error = hammer_update_itimes(&cursor, ip);
2828 } else
2829 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2830 error = hammer_update_inode(&cursor, ip);
2831 }
2832done:
2833 if (error) {
2834 hammer_critical_error(ip->hmp, ip, error,
2835 "while syncing inode");
2836 }
2837 hammer_done_cursor(&cursor);
2838 return(error);
2839}
2840
2841/*
2842 * This routine is called when the OS is no longer actively referencing
2843 * the inode (but might still be keeping it cached), or when releasing
2844 * the last reference to an inode.
2845 *
2846 * At this point if the inode's nlinks count is zero we want to destroy
2847 * it, which may mean destroying it on-media too.
2848 */
2849void
2850hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2851{
2852 struct vnode *vp;
2853
2854 /*
2855 * Set the DELETING flag when the link count drops to 0 and the
2856 * OS no longer has any opens on the inode.
2857 *
2858 * The backend will clear DELETING (a mod flag) and set DELETED
2859 * (a state flag) when it is actually able to perform the
2860 * operation.
2861 *
2862 * Don't reflag the deletion if the flusher is currently syncing
2863 * one that was already flagged. A previously set DELETING flag
2864 * may bounce around flags and sync_flags until the operation is
2865 * completely done.
2866 */
2867 if (ip->ino_data.nlinks == 0 &&
2868 ((ip->flags | ip->sync_flags) & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2869 ip->flags |= HAMMER_INODE_DELETING;
2870 ip->flags |= HAMMER_INODE_TRUNCATED;
2871 ip->trunc_off = 0;
2872 vp = NULL;
2873 if (getvp) {
2874 if (hammer_get_vnode(ip, &vp) != 0)
2875 return;
2876 }
2877
2878 /*
2879 * Final cleanup
2880 */
2881 if (ip->vp) {
2882 vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2883 vnode_pager_setsize(ip->vp, 0);
2884 }
2885 if (getvp) {
2886 vput(vp);
2887 }
2888 }
2889}
2890
2891/*
2892 * After potentially resolving a dependency the inode is tested
2893 * to determine whether it needs to be reflushed.
2894 */
2895void
2896hammer_test_inode(hammer_inode_t ip)
2897{
2898 if (ip->flags & HAMMER_INODE_REFLUSH) {
2899 ip->flags &= ~HAMMER_INODE_REFLUSH;
2900 hammer_ref(&ip->lock);
2901 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2902 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2903 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2904 } else {
2905 hammer_flush_inode(ip, 0);
2906 }
2907 hammer_rel_inode(ip, 0);
2908 }
2909}
2910
2911/*
2912 * Clear the RECLAIM flag on an inode. This occurs when the inode is
2913 * reassociated with a vp or just before it gets freed.
2914 *
2915 * Pipeline wakeups to threads blocked due to an excessive number of
2916 * detached inodes. The reclaim count generates a bit of negative
2917 * feedback.
2918 */
2919static void
2920hammer_inode_wakereclaims(hammer_inode_t ip, int dowake)
2921{
2922 struct hammer_reclaim *reclaim;
2923 hammer_mount_t hmp = ip->hmp;
2924
2925 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
2926 return;
2927
2928 --hammer_count_reclaiming;
2929 --hmp->inode_reclaims;
2930 ip->flags &= ~HAMMER_INODE_RECLAIM;
2931
2932 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || dowake) {
2933 reclaim = TAILQ_FIRST(&hmp->reclaim_list);
2934 if (reclaim && reclaim->count > 0 && --reclaim->count == 0) {
2935 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
2936 wakeup(reclaim);
2937 }
2938 }
2939}
2940
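/*
 * The routine above is the consumer half of a counted-ticket pipeline
 * shared with hammer_inode_waitreclaims() below: waiters enqueue a
 * hammer_reclaim ticket and sleep, and each reclaimed inode decrements
 * the ticket at the head of the queue, issuing the wakeup when the
 * count reaches zero.
 */
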
2941/*
2942 * Set up our reclaim pipeline. We only let so many detached (and dirty)
2943 * inodes build up before we start blocking.
2944 *
2945 * When we block we don't care *which* inode has finished reclaiming,
2946 * as long as one does. This is somewhat heuristic... we also put a
2947 * cap on how long we are willing to wait.
2948 */
2949void
2950hammer_inode_waitreclaims(hammer_mount_t hmp)
2951{
2952 struct hammer_reclaim reclaim;
2953 int delay;
2954
2955 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
2956 return;
2957 delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
2958 (HAMMER_RECLAIM_WAIT * 3) + 1;
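	/*
	 * e.g. at a backlog of 4 * HAMMER_RECLAIM_WAIT the formula
	 * reduces to delay = (3*W)*hz / (3*W) + 1 = hz + 1 ticks,
	 * i.e. a sleep of roughly one second.
	 */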
2959 if (delay > 0) {
2960 reclaim.count = 2;
2961 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
2962 tsleep(&reclaim, 0, "hmrrcm", delay);
2963 if (reclaim.count > 0)
2964 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
2965 }
2966}
2967
2968/*
2969 * A larger than normal backlog of inodes is sitting in the flusher,
2970 * enforce a general slowdown to let it catch up. This routine is only
2971 * called on completion of a non-flusher-related transaction which
2972 * performed B-Tree node I/O.
2973 *
2974 * It is possible for the flusher to stall in a continuous load.
2975 * blogbench -i1000 -o seems to do a good job generating this sort of load.
2976 * If the flusher is unable to catch up the inode count can bloat until
2977 * we run out of kvm.
2978 *
2979 * This is a bit of a hack.
2980 */
2981void
2982hammer_inode_waithard(hammer_mount_t hmp)
2983{
2984 /*
2985 * Hysteresis.
2986 */
2987 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
2988 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
2989 hmp->count_iqueued < hmp->count_inodes / 20) {
2990 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
2991 return;
2992 }
2993 } else {
2994 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
2995 hmp->count_iqueued < hmp->count_inodes / 10) {
2996 return;
2997 }
2998 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
2999 }
3000
3001 /*
3002 * Block for one flush cycle.
3003 */
3004 hammer_flusher_wait_next(hmp);
3005}
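
/*
 * Worked example of the hysteresis above, assuming 10000 cached
 * inodes: the slowdown engages only once the reclaim backlog reaches
 * HAMMER_RECLAIM_WAIT *and* at least 1000 inodes (1/10) are queued to
 * the flusher, and it does not disengage until the backlog halves and
 * the queue drains below 500 (1/20), preventing rapid oscillation.
 */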
3006