HAMMER VFS - REDO implementation base code part 1/many
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <vm/vm_extern.h>
39
40static int hammer_unload_inode(struct hammer_inode *ip);
41static void hammer_free_inode(hammer_inode_t ip);
42static void hammer_flush_inode_core(hammer_inode_t ip,
43 hammer_flush_group_t flg, int flags);
44static int hammer_setup_child_callback(hammer_record_t rec, void *data);
45#if 0
46static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
47#endif
48static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
49 hammer_flush_group_t flg);
50static int hammer_setup_parent_inodes_helper(hammer_record_t record,
51 int depth, hammer_flush_group_t flg);
52static void hammer_inode_wakereclaims(hammer_inode_t ip);
53
54#ifdef DEBUG_TRUNCATE
55extern struct hammer_inode *HammerTruncIp;
56#endif
57
58/*
59 * RB-Tree support for inode structures
60 */
61int
62hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
63{
64 if (ip1->obj_localization < ip2->obj_localization)
65 return(-1);
66 if (ip1->obj_localization > ip2->obj_localization)
67 return(1);
68 if (ip1->obj_id < ip2->obj_id)
69 return(-1);
70 if (ip1->obj_id > ip2->obj_id)
71 return(1);
72 if (ip1->obj_asof < ip2->obj_asof)
73 return(-1);
74 if (ip1->obj_asof > ip2->obj_asof)
75 return(1);
76 return(0);
77}
78
79/*
80 * RB-Tree support for inode structures / special LOOKUP_INFO
81 */
82static int
83hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
84{
85 if (info->obj_localization < ip->obj_localization)
86 return(-1);
87 if (info->obj_localization > ip->obj_localization)
88 return(1);
89 if (info->obj_id < ip->obj_id)
90 return(-1);
91 if (info->obj_id > ip->obj_id)
92 return(1);
93 if (info->obj_asof < ip->obj_asof)
94 return(-1);
95 if (info->obj_asof > ip->obj_asof)
96 return(1);
97 return(0);
98}
99
100/*
101 * Used by hammer_scan_inode_snapshots() to locate all of an object's
102 * snapshots. Note that the asof field is not tested, which we can get
103 * away with because it is the lowest-priority field.
104 */
105static int
106hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
107{
108 hammer_inode_info_t info = data;
109
110 if (ip->obj_localization > info->obj_localization)
111 return(1);
112 if (ip->obj_localization < info->obj_localization)
113 return(-1);
114 if (ip->obj_id > info->obj_id)
115 return(1);
116 if (ip->obj_id < info->obj_id)
117 return(-1);
118 return(0);
119}
120
121/*
122 * Used by hammer_unload_pseudofs() to locate all inodes associated with
123 * a particular PFS.
124 */
125static int
126hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
127{
128 u_int32_t localization = *(u_int32_t *)data;
129 if (ip->obj_localization > localization)
130 return(1);
131 if (ip->obj_localization < localization)
132 return(-1);
133 return(0);
134}
135
136/*
137 * RB-Tree support for pseudofs structures
138 */
139static int
140hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
141{
142 if (p1->localization < p2->localization)
143 return(-1);
144 if (p1->localization > p2->localization)
145 return(1);
146 return(0);
147}
148
149
150RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
151RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
152 hammer_inode_info_cmp, hammer_inode_info_t);
153RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
154 hammer_pfs_rb_compare, u_int32_t, localization);
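
/*
 * Illustrative sketch (not part of the original source): the
 * RB_GENERATE* macros above emit typed tree functions. A cached
 * inode lookup keyed on (localization, obj_id, asof) then looks
 * roughly like this; hammer_get_inode() below does exactly the
 * same thing.
 */
#if 0
static hammer_inode_t
example_lookup_cached_inode(hammer_mount_t hmp, int64_t obj_id,
                            hammer_tid_t asof, u_int32_t localization)
{
        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;
        iinfo.obj_asof = asof;
        iinfo.obj_localization = localization;

        /* generated by RB_GENERATE_XLOOKUP(..., INFO, ...) above */
        return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root,
                                                 &iinfo));
}
#endif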
155
156/*
157 * The kernel is not actively referencing this vnode but is still holding
158 * it cached.
159 *
160 * This is called from the frontend.
161 *
162 * MPALMOSTSAFE
163 */
164int
165hammer_vop_inactive(struct vop_inactive_args *ap)
166{
167 struct hammer_inode *ip = VTOI(ap->a_vp);
168
169 /*
170 * Degenerate case
171 */
172 if (ip == NULL) {
173 vrecycle(ap->a_vp);
174 return(0);
175 }
176
177 /*
178 * If the inode no longer has visibility in the filesystem try to
179 * recycle it immediately, even if the inode is dirty. Recycling
180 * it quickly allows the system to reclaim buffer cache and VM
181 * resources which can matter a lot in a heavily loaded system.
182 *
183 * This can deadlock in vfsync() if we aren't careful.
184 *
185 * Do not queue the inode to the flusher if we still have visibility,
186 * otherwise namespace calls such as chmod will unnecessarily generate
187 * multiple inode updates.
188 */
189 if (ip->ino_data.nlinks == 0) {
190 get_mplock();
191 hammer_inode_unloadable_check(ip, 0);
192 if (ip->flags & HAMMER_INODE_MODMASK)
193 hammer_flush_inode(ip, 0);
194 vrecycle(ap->a_vp);
195 rel_mplock();
196 }
197 return(0);
198}
199
200/*
201 * Release the vnode association. This is typically (but not always)
202 * the last reference on the inode.
203 *
204 * Once the association is lost we are on our own with regards to
205 * flushing the inode.
206 */
207int
208hammer_vop_reclaim(struct vop_reclaim_args *ap)
209{
210 struct hammer_inode *ip;
211 hammer_mount_t hmp;
212 struct vnode *vp;
213
214 vp = ap->a_vp;
215
216 if ((ip = vp->v_data) != NULL) {
217 hmp = ip->hmp;
218 vp->v_data = NULL;
219 ip->vp = NULL;
220
221 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
222 ++hammer_count_reclaiming;
223 ++hmp->inode_reclaims;
224 ip->flags |= HAMMER_INODE_RECLAIM;
225 }
226 hammer_rel_inode(ip, 1);
227 }
228 return(0);
229}
230
231/*
232 * Return a locked vnode for the specified inode. The inode must be
233 * referenced but NOT LOCKED on entry and will remain referenced on
234 * return.
235 *
236 * Called from the frontend.
237 */
238int
239hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
240{
241 hammer_mount_t hmp;
242 struct vnode *vp;
243 int error = 0;
244 u_int8_t obj_type;
245
246 hmp = ip->hmp;
247
248 for (;;) {
249 if ((vp = ip->vp) == NULL) {
250 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
251 if (error)
252 break;
253 hammer_lock_ex(&ip->lock);
254 if (ip->vp != NULL) {
255 hammer_unlock(&ip->lock);
256 vp = *vpp;
257 vp->v_type = VBAD;
258 vx_put(vp);
259 continue;
260 }
261 hammer_ref(&ip->lock);
262 vp = *vpp;
263 ip->vp = vp;
264
265 obj_type = ip->ino_data.obj_type;
266 vp->v_type = hammer_get_vnode_type(obj_type);
267
268 hammer_inode_wakereclaims(ip);
269
270 switch(ip->ino_data.obj_type) {
271 case HAMMER_OBJTYPE_CDEV:
272 case HAMMER_OBJTYPE_BDEV:
273 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
274 addaliasu(vp, ip->ino_data.rmajor,
275 ip->ino_data.rminor);
276 break;
277 case HAMMER_OBJTYPE_FIFO:
278 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
279 break;
280 case HAMMER_OBJTYPE_REGFILE:
281 break;
282 default:
283 break;
284 }
285
286 /*
287 * Only mark as the root vnode if the ip is not
288 * historical, otherwise the VFS cache will get
289 * confused. The other half of the special handling
290 * is in hammer_vop_nlookupdotdot().
291 *
292 * Pseudo-filesystem roots can be accessed via
293 * non-root filesystem paths and setting VROOT may
294 * confuse the namecache. Set VPFSROOT instead.
295 */
296 if (ip->obj_id == HAMMER_OBJID_ROOT &&
297 ip->obj_asof == hmp->asof) {
298 if (ip->obj_localization == 0)
299 vsetflags(vp, VROOT);
300 else
301 vsetflags(vp, VPFSROOT);
302 }
303
304 vp->v_data = (void *)ip;
305 /* vnode locked by getnewvnode() */
306 /* make related vnode dirty if inode dirty? */
307 hammer_unlock(&ip->lock);
308 if (vp->v_type == VREG)
309 vinitvmio(vp, ip->ino_data.size);
310 break;
311 }
312
313 /*
314 * loop if the vget fails (aka races), or if the vp
315 * no longer matches ip->vp.
316 */
317 if (vget(vp, LK_EXCLUSIVE) == 0) {
318 if (vp == ip->vp)
319 break;
320 vput(vp);
321 }
322 }
323 *vpp = vp;
324 return(error);
325}
326
327/*
328 * Locate all copies of the inode for obj_id compatible with the specified
329 * asof, reference, and issue the related call-back. This routine is used
330 * for direct-io invalidation and does not create any new inodes.
331 */
332void
333hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
334 int (*callback)(hammer_inode_t ip, void *data),
335 void *data)
336{
337 hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
338 hammer_inode_info_cmp_all_history,
339 callback, iinfo);
340}
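
/*
 * Illustrative sketch (not part of the original source): as the
 * RB_SCAN call above shows, the callback receives the iinfo pointer
 * as its data argument. A hypothetical invalidation pass could look
 * like this; returning 0 from the callback continues the scan and a
 * negative return aborts it (compare hammer_unload_pseudofs_callback()
 * later in this file).
 */
#if 0
static int
example_snapshot_callback(hammer_inode_t ip, void *data)
{
        hammer_inode_info_t iinfo = data;

        /* operate on one cached snapshot copy of the object */
        return(0);
}

        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;
        iinfo.obj_asof = 0;     /* not tested by the scan comparator */
        iinfo.obj_localization = localization;
        hammer_scan_inode_snapshots(hmp, &iinfo,
                                    example_snapshot_callback, NULL);
#endif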
341
342/*
343 * Acquire a HAMMER inode. The returned inode is not locked. These functions
344 * do not attach or detach the related vnode (use hammer_get_vnode() for
345 * that).
346 *
347 * The flags argument is only applied for newly created inodes, and only
348 * certain flags are inherited.
349 *
350 * Called from the frontend.
351 */
352struct hammer_inode *
353hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
354 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
355 int flags, int *errorp)
356{
357 hammer_mount_t hmp = trans->hmp;
358 struct hammer_node_cache *cachep;
359 struct hammer_inode_info iinfo;
360 struct hammer_cursor cursor;
361 struct hammer_inode *ip;
362
363
364 /*
365 * Determine if we already have an inode cached. If we do then
366 * we are golden.
367 *
368 * If we find an inode with no vnode we have to mark the
369 * transaction such that hammer_inode_waitreclaims() is
370 * called later on to avoid building up an infinite number
371 * of inodes. Otherwise we can continue to add new inodes
372 * faster than they can be disposed of, even with the tsleep
373 * delay.
374 *
375 * If we find a dummy inode we return a failure so dounlink
376 * (which does another lookup) doesn't try to mess with the
377 * link count. hammer_vop_nresolve() uses hammer_get_dummy_inode()
378 * to ref dummy inodes.
379 */
380 iinfo.obj_id = obj_id;
381 iinfo.obj_asof = asof;
382 iinfo.obj_localization = localization;
383loop:
384 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
385 if (ip) {
386 if (ip->flags & HAMMER_INODE_DUMMY) {
387 *errorp = ENOENT;
388 return(NULL);
389 }
390 hammer_ref(&ip->lock);
391 *errorp = 0;
392 return(ip);
393 }
394
395 /*
396 * Allocate a new inode structure and deal with races later.
397 */
398 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
399 ++hammer_count_inodes;
400 ++hmp->count_inodes;
401 ip->obj_id = obj_id;
402 ip->obj_asof = iinfo.obj_asof;
403 ip->obj_localization = localization;
404 ip->hmp = hmp;
405 ip->flags = flags & HAMMER_INODE_RO;
406 ip->cache[0].ip = ip;
407 ip->cache[1].ip = ip;
408 ip->cache[2].ip = ip;
409 ip->cache[3].ip = ip;
410 ip->redo_count = SIZE_T_MAX;
411 if (hmp->ronly)
412 ip->flags |= HAMMER_INODE_RO;
413 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
414 0x7FFFFFFFFFFFFFFFLL;
415 RB_INIT(&ip->rec_tree);
416 TAILQ_INIT(&ip->target_list);
417 hammer_ref(&ip->lock);
418
419 /*
420 * Locate the on-disk inode. If this is a PFS root we always
421 * access the current version of the root inode and (if it is not
422 * a master) always access information under it with a snapshot
423 * TID.
424 *
425 * We cache recent inode lookups in this directory in dip->cache[2].
426 * If we can't find it we assume the inode we are looking for is
427 * close to the directory inode.
428 */
429retry:
430 cachep = NULL;
431 if (dip) {
432 if (dip->cache[2].node)
433 cachep = &dip->cache[2];
434 else
435 cachep = &dip->cache[0];
436 }
437 hammer_init_cursor(trans, &cursor, cachep, NULL);
438 cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
439 cursor.key_beg.obj_id = ip->obj_id;
440 cursor.key_beg.key = 0;
441 cursor.key_beg.create_tid = 0;
442 cursor.key_beg.delete_tid = 0;
443 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
444 cursor.key_beg.obj_type = 0;
445
446 cursor.asof = iinfo.obj_asof;
447 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
448 HAMMER_CURSOR_ASOF;
449
450 *errorp = hammer_btree_lookup(&cursor);
451 if (*errorp == EDEADLK) {
452 hammer_done_cursor(&cursor);
453 goto retry;
454 }
455
456 /*
457 * On success the B-Tree lookup will hold the appropriate
458 * buffer cache buffers and provide a pointer to the requested
459 * information. Copy the information to the in-memory inode
460 * and cache the B-Tree node to improve future operations.
461 */
462 if (*errorp == 0) {
463 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
464 ip->ino_data = cursor.data->inode;
465
466 /*
467 * cache[0] tries to cache the location of the object inode.
468 * The assumption is that it is near the directory inode.
469 *
470 * cache[1] tries to cache the location of the object data.
471 * We might have something in the governing directory from
472 * scan optimizations (see the strategy code in
473 * hammer_vnops.c).
474 *
475 * We update dip->cache[2], if possible, with the location
476 * of the object inode for future directory shortcuts.
477 */
478 hammer_cache_node(&ip->cache[0], cursor.node);
479 if (dip) {
480 if (dip->cache[3].node) {
481 hammer_cache_node(&ip->cache[1],
482 dip->cache[3].node);
483 }
484 hammer_cache_node(&dip->cache[2], cursor.node);
485 }
486
487 /*
488 * The file should not contain any data past the file size
489 * stored in the inode. Setting save_trunc_off to the
490 * file size instead of max reduces B-Tree lookup overheads
491 * on append by allowing the flusher to avoid checking for
492 * record overwrites.
493 */
494 ip->save_trunc_off = ip->ino_data.size;
495
496 /*
497 * Locate and assign the pseudofs management structure to
498 * the inode.
499 */
500 if (dip && dip->obj_localization == ip->obj_localization) {
501 ip->pfsm = dip->pfsm;
502 hammer_ref(&ip->pfsm->lock);
503 } else {
504 ip->pfsm = hammer_load_pseudofs(trans,
505 ip->obj_localization,
506 errorp);
507 *errorp = 0; /* ignore ENOENT */
508 }
509 }
510
511 /*
512 * The inode is placed on the red-black tree and will be synced to
513 * the media when flushed or by the filesystem sync. If this races
514 * another instantiation/lookup the insertion will fail.
515 */
516 if (*errorp == 0) {
517 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
518 hammer_free_inode(ip);
519 hammer_done_cursor(&cursor);
520 goto loop;
521 }
522 ip->flags |= HAMMER_INODE_ONDISK;
523 } else {
524 if (ip->flags & HAMMER_INODE_RSV_INODES) {
525 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
526 --hmp->rsv_inodes;
527 }
528
529 hammer_free_inode(ip);
530 ip = NULL;
531 }
532 hammer_done_cursor(&cursor);
533 trans->flags |= HAMMER_TRANSF_NEWINODE;
534 return (ip);
535}
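
/*
 * Illustrative sketch (not part of the original source): a typical
 * frontend caller pairs hammer_get_inode() with hammer_get_vnode()
 * and then drops its own inode reference, since the vnode
 * association holds a reference of its own (see hammer_vop_reclaim()
 * above).
 */
#if 0
        struct hammer_inode *ip;
        struct vnode *vp;
        int error;

        ip = hammer_get_inode(trans, dip, obj_id, asof, localization,
                              flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, &vp); /* vp returned locked */
                hammer_rel_inode(ip, 0);
        }
#endif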
536
537/*
538 * Get a dummy inode to placemark a broken directory entry.
539 */
540struct hammer_inode *
541hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
542 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
543 int flags, int *errorp)
544{
545 hammer_mount_t hmp = trans->hmp;
546 struct hammer_inode_info iinfo;
547 struct hammer_inode *ip;
548
549 /*
550 * Determine if we already have an inode cached. If we do then
551 * we are golden.
552 *
553 * If we find an inode with no vnode we have to mark the
554 * transaction such that hammer_inode_waitreclaims() is
555 * called later on to avoid building up an infinite number
556 * of inodes. Otherwise we can continue to add new inodes
557 * faster than they can be disposed of, even with the tsleep
558 * delay.
559 *
560 * If we find a non-fake inode we return an error. Only fake
561 * inodes can be returned by this routine.
562 */
563 iinfo.obj_id = obj_id;
564 iinfo.obj_asof = asof;
565 iinfo.obj_localization = localization;
566loop:
567 *errorp = 0;
568 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
569 if (ip) {
570 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
571 *errorp = ENOENT;
572 return(NULL);
573 }
574 hammer_ref(&ip->lock);
575 return(ip);
576 }
577
578 /*
579 * Allocate a new inode structure and deal with races later.
580 */
581 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
582 ++hammer_count_inodes;
583 ++hmp->count_inodes;
584 ip->obj_id = obj_id;
585 ip->obj_asof = iinfo.obj_asof;
586 ip->obj_localization = localization;
587 ip->hmp = hmp;
588 ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
589 ip->cache[0].ip = ip;
590 ip->cache[1].ip = ip;
591 ip->cache[2].ip = ip;
592 ip->cache[3].ip = ip;
593 ip->redo_count = SIZE_T_MAX;
594 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
595 0x7FFFFFFFFFFFFFFFLL;
596 RB_INIT(&ip->rec_tree);
597 TAILQ_INIT(&ip->target_list);
598 hammer_ref(&ip->lock);
599
600 /*
601 * Populate the dummy inode. Leave everything zero'd out.
602 *
603 * (ip->ino_leaf and ip->ino_data)
604 *
605 * Make the dummy inode a FIFO object which most copy programs
606 * will properly ignore.
607 */
608 ip->save_trunc_off = ip->ino_data.size;
609 ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
610
611 /*
612 * Locate and assign the pseudofs management structure to
613 * the inode.
614 */
615 if (dip && dip->obj_localization == ip->obj_localization) {
616 ip->pfsm = dip->pfsm;
617 hammer_ref(&ip->pfsm->lock);
618 } else {
619 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
620 errorp);
621 *errorp = 0; /* ignore ENOENT */
622 }
623
624 /*
625 * The inode is placed on the red-black tree and will be synced to
626 * the media when flushed or by the filesystem sync. If this races
627 * another instantiation/lookup the insertion will fail.
628 *
629 * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake.
630 */
631 if (*errorp == 0) {
632 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
633 hammer_free_inode(ip);
634 goto loop;
635 }
636 } else {
637 if (ip->flags & HAMMER_INODE_RSV_INODES) {
638 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
639 --hmp->rsv_inodes;
640 }
641 hammer_free_inode(ip);
642 ip = NULL;
643 }
644 trans->flags |= HAMMER_TRANSF_NEWINODE;
645 return (ip);
646}
647
648/*
649 * Return a referenced inode only if it is in our inode cache.
650 *
651 * Dummy inodes do not count.
652 */
653struct hammer_inode *
654hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
655 hammer_tid_t asof, u_int32_t localization)
656{
657 hammer_mount_t hmp = trans->hmp;
658 struct hammer_inode_info iinfo;
659 struct hammer_inode *ip;
660
661 iinfo.obj_id = obj_id;
662 iinfo.obj_asof = asof;
663 iinfo.obj_localization = localization;
664
665 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
666 if (ip) {
667 if (ip->flags & HAMMER_INODE_DUMMY)
668 ip = NULL;
669 else
670 hammer_ref(&ip->lock);
671 }
672 return(ip);
673}
674
675/*
676 * Create a new filesystem object, returning the inode in *ipp. The
677 * returned inode will be referenced. The inode is created in-memory.
678 *
679 * If pfsm is non-NULL the caller wishes to create the root inode for
680 * a master PFS.
681 */
682int
683hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
684 struct ucred *cred,
685 hammer_inode_t dip, const char *name, int namelen,
686 hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
687{
688 hammer_mount_t hmp;
689 hammer_inode_t ip;
690 uid_t xuid;
691 int error;
692 int64_t namekey;
693 u_int32_t dummy;
694
695 hmp = trans->hmp;
696
697 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
698 ++hammer_count_inodes;
699 ++hmp->count_inodes;
700 trans->flags |= HAMMER_TRANSF_NEWINODE;
701
702 if (pfsm) {
703 KKASSERT(pfsm->localization != 0);
704 ip->obj_id = HAMMER_OBJID_ROOT;
705 ip->obj_localization = pfsm->localization;
706 } else {
707 KKASSERT(dip != NULL);
708 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
709 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
710 ip->obj_localization = dip->obj_localization;
711 }
712
713 KKASSERT(ip->obj_id != 0);
714 ip->obj_asof = hmp->asof;
715 ip->hmp = hmp;
716 ip->flush_state = HAMMER_FST_IDLE;
717 ip->flags = HAMMER_INODE_DDIRTY |
718 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
719 ip->cache[0].ip = ip;
720 ip->cache[1].ip = ip;
721 ip->cache[2].ip = ip;
722 ip->cache[3].ip = ip;
723 ip->redo_count = SIZE_T_MAX;
724
725 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
726 /* ip->save_trunc_off = 0; (already zero) */
727 RB_INIT(&ip->rec_tree);
728 TAILQ_INIT(&ip->target_list);
729
730 ip->ino_data.atime = trans->time;
731 ip->ino_data.mtime = trans->time;
732 ip->ino_data.size = 0;
733 ip->ino_data.nlinks = 0;
734
735 /*
736 * A nohistory designator on the parent directory is inherited by
737 * the child. We will do this even for pseudo-fs creation... the
738 * sysad can turn it off.
739 */
740 if (dip) {
741 ip->ino_data.uflags = dip->ino_data.uflags &
742 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
743 }
744
745 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
746 ip->ino_leaf.base.localization = ip->obj_localization +
747 HAMMER_LOCALIZE_INODE;
748 ip->ino_leaf.base.obj_id = ip->obj_id;
749 ip->ino_leaf.base.key = 0;
750 ip->ino_leaf.base.create_tid = 0;
751 ip->ino_leaf.base.delete_tid = 0;
752 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
753 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
754
755 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
756 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
757 ip->ino_data.mode = vap->va_mode;
758 ip->ino_data.ctime = trans->time;
759
760 /*
761 * If we are running version 2 or greater directory entries are
762 * inode-localized instead of data-localized.
763 */
764 if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
765 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
766 ip->ino_data.cap_flags |=
767 HAMMER_INODE_CAP_DIR_LOCAL_INO;
768 }
769 }
770
771 /*
772 * Setup the ".." pointer. This only needs to be done for directories
773 * but we do it for all objects as a recovery aid.
774 */
775 if (dip)
776 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
777#if 0
778 /*
779 * The parent_obj_localization field only applies to pseudo-fs roots.
780 * XXX this is no longer applicable, PFSs are no longer directly
781 * tied into the parent's directory structure.
782 */
783 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
784 ip->obj_id == HAMMER_OBJID_ROOT) {
785 ip->ino_data.ext.obj.parent_obj_localization =
786 dip->obj_localization;
787 }
788#endif
789
790 switch(ip->ino_leaf.base.obj_type) {
791 case HAMMER_OBJTYPE_CDEV:
792 case HAMMER_OBJTYPE_BDEV:
793 ip->ino_data.rmajor = vap->va_rmajor;
794 ip->ino_data.rminor = vap->va_rminor;
795 break;
796 default:
797 break;
798 }
799
800 /*
801 * Calculate default uid/gid and overwrite with information from
802 * the vap.
803 */
804 if (dip) {
805 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
806 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
807 xuid, cred, &vap->va_mode);
808 } else {
809 xuid = 0;
810 }
811 ip->ino_data.mode = vap->va_mode;
812
813 if (vap->va_vaflags & VA_UID_UUID_VALID)
814 ip->ino_data.uid = vap->va_uid_uuid;
815 else if (vap->va_uid != (uid_t)VNOVAL)
816 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
817 else
818 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
819
820 if (vap->va_vaflags & VA_GID_UUID_VALID)
821 ip->ino_data.gid = vap->va_gid_uuid;
822 else if (vap->va_gid != (gid_t)VNOVAL)
823 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
824 else if (dip)
825 ip->ino_data.gid = dip->ino_data.gid;
826
827 hammer_ref(&ip->lock);
828
829 if (pfsm) {
830 ip->pfsm = pfsm;
831 hammer_ref(&pfsm->lock);
832 error = 0;
833 } else if (dip->obj_localization == ip->obj_localization) {
834 ip->pfsm = dip->pfsm;
835 hammer_ref(&ip->pfsm->lock);
836 error = 0;
837 } else {
838 ip->pfsm = hammer_load_pseudofs(trans,
839 ip->obj_localization,
840 &error);
841 error = 0; /* ignore ENOENT */
842 }
843
844 if (error) {
845 hammer_free_inode(ip);
846 ip = NULL;
847 } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
848 panic("hammer_create_inode: duplicate obj_id %llx",
849 (long long)ip->obj_id);
850 /* not reached */
851 hammer_free_inode(ip);
852 }
853 *ipp = ip;
854 return(error);
855}
856
857/*
858 * Final cleanup / freeing of an inode structure
859 */
860static void
861hammer_free_inode(hammer_inode_t ip)
862{
863 struct hammer_mount *hmp;
864
865 hmp = ip->hmp;
866 KKASSERT(ip->lock.refs == 1);
867 hammer_uncache_node(&ip->cache[0]);
868 hammer_uncache_node(&ip->cache[1]);
869 hammer_uncache_node(&ip->cache[2]);
870 hammer_uncache_node(&ip->cache[3]);
871 hammer_inode_wakereclaims(ip);
872 if (ip->objid_cache)
873 hammer_clear_objid(ip);
874 --hammer_count_inodes;
875 --hmp->count_inodes;
876 if (ip->pfsm) {
877 hammer_rel_pseudofs(hmp, ip->pfsm);
878 ip->pfsm = NULL;
879 }
880 kfree(ip, hmp->m_inodes);
881 ip = NULL;
882}
883
884/*
885 * Retrieve pseudo-fs data. NULL will never be returned.
886 *
887 * If an error occurs *errorp will be set and a default template is returned,
888 * otherwise *errorp is set to 0. Typically when an error occurs it will
889 * be ENOENT.
890 */
891hammer_pseudofs_inmem_t
892hammer_load_pseudofs(hammer_transaction_t trans,
893 u_int32_t localization, int *errorp)
894{
895 hammer_mount_t hmp = trans->hmp;
896 hammer_inode_t ip;
897 hammer_pseudofs_inmem_t pfsm;
898 struct hammer_cursor cursor;
899 int bytes;
900
901retry:
902 pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
903 if (pfsm) {
904 hammer_ref(&pfsm->lock);
905 *errorp = 0;
906 return(pfsm);
907 }
908
909 /*
910 * PFS records are stored in the root inode (not the PFS root inode,
911 * but the real root). Avoid an infinite recursion if loading
912 * the PFS for the real root.
913 */
914 if (localization) {
915 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
916 HAMMER_MAX_TID,
917 HAMMER_DEF_LOCALIZATION, 0, errorp);
918 } else {
919 ip = NULL;
920 }
921
922 pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
923 pfsm->localization = localization;
924 pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
925 pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
926
927 hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
928 cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
929 HAMMER_LOCALIZE_MISC;
930 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
931 cursor.key_beg.create_tid = 0;
932 cursor.key_beg.delete_tid = 0;
933 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
934 cursor.key_beg.obj_type = 0;
935 cursor.key_beg.key = localization;
936 cursor.asof = HAMMER_MAX_TID;
937 cursor.flags |= HAMMER_CURSOR_ASOF;
938
939 if (ip)
940 *errorp = hammer_ip_lookup(&cursor);
941 else
942 *errorp = hammer_btree_lookup(&cursor);
943 if (*errorp == 0) {
944 *errorp = hammer_ip_resolve_data(&cursor);
945 if (*errorp == 0) {
946 if (cursor.data->pfsd.mirror_flags &
947 HAMMER_PFSD_DELETED) {
948 *errorp = ENOENT;
949 } else {
950 bytes = cursor.leaf->data_len;
951 if (bytes > sizeof(pfsm->pfsd))
952 bytes = sizeof(pfsm->pfsd);
953 bcopy(cursor.data, &pfsm->pfsd, bytes);
954 }
955 }
956 }
957 hammer_done_cursor(&cursor);
958
959 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
960 hammer_ref(&pfsm->lock);
961 if (ip)
962 hammer_rel_inode(ip, 0);
963 if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
964 kfree(pfsm, hmp->m_misc);
965 goto retry;
966 }
967 return(pfsm);
968}
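
/*
 * Illustrative sketch (not part of the original source):
 * hammer_load_pseudofs() never returns NULL. On error the caller
 * gets a default template with *errorp set (typically ENOENT), which
 * callers in this file simply ignore, pairing the load with
 * hammer_rel_pseudofs() when done.
 */
#if 0
        hammer_pseudofs_inmem_t pfsm;
        int error;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        error = 0;              /* ignore ENOENT, template is usable */
        /* ... use pfsm->pfsd ... */
        hammer_rel_pseudofs(trans->hmp, pfsm);
#endif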
969
970/*
971 * Store pseudo-fs data. The backend will automatically delete any prior
972 * on-disk pseudo-fs data but we have to delete in-memory versions.
973 */
974int
975hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
976{
977 struct hammer_cursor cursor;
978 hammer_record_t record;
979 hammer_inode_t ip;
980 int error;
981
982 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
983 HAMMER_DEF_LOCALIZATION, 0, &error);
984retry:
985 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
986 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
987 cursor.key_beg.localization = ip->obj_localization +
988 HAMMER_LOCALIZE_MISC;
989 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
990 cursor.key_beg.create_tid = 0;
991 cursor.key_beg.delete_tid = 0;
992 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
993 cursor.key_beg.obj_type = 0;
994 cursor.key_beg.key = pfsm->localization;
995 cursor.asof = HAMMER_MAX_TID;
996 cursor.flags |= HAMMER_CURSOR_ASOF;
997
998 /*
999 * Replace any in-memory version of the record.
1000 */
1001 error = hammer_ip_lookup(&cursor);
1002 if (error == 0 && hammer_cursor_inmem(&cursor)) {
1003 record = cursor.iprec;
1004 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1005 KKASSERT(cursor.deadlk_rec == NULL);
1006 hammer_ref(&record->lock);
1007 cursor.deadlk_rec = record;
1008 error = EDEADLK;
1009 } else {
1010 record->flags |= HAMMER_RECF_DELETED_FE;
1011 error = 0;
1012 }
1013 }
1014
1015 /*
1016 * Allocate replacement general record. The backend flush will
1017 * delete any on-disk version of the record.
1018 */
1019 if (error == 0 || error == ENOENT) {
1020 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1021 record->type = HAMMER_MEM_RECORD_GENERAL;
1022
1023 record->leaf.base.localization = ip->obj_localization +
1024 HAMMER_LOCALIZE_MISC;
1025 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1026 record->leaf.base.key = pfsm->localization;
1027 record->leaf.data_len = sizeof(pfsm->pfsd);
1028 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1029 error = hammer_ip_add_record(trans, record);
1030 }
1031 hammer_done_cursor(&cursor);
1032 if (error == EDEADLK)
1033 goto retry;
1034 hammer_rel_inode(ip, 0);
1035 return(error);
1036}
1037
1038/*
1039 * Create a root directory for a PFS if one does not already exist.
1040 *
1041 * The PFS root stands alone so we must also bump the nlinks count
1042 * to prevent it from being destroyed on release.
1043 */
1044int
1045hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1046 hammer_pseudofs_inmem_t pfsm)
1047{
1048 hammer_inode_t ip;
1049 struct vattr vap;
1050 int error;
1051
1052 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1053 pfsm->localization, 0, &error);
1054 if (ip == NULL) {
1055 vattr_null(&vap);
1056 vap.va_mode = 0755;
1057 vap.va_type = VDIR;
1058 error = hammer_create_inode(trans, &vap, cred,
1059 NULL, NULL, 0,
1060 pfsm, &ip);
1061 if (error == 0) {
1062 ++ip->ino_data.nlinks;
1063 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1064 }
1065 }
1066 if (ip)
1067 hammer_rel_inode(ip, 0);
1068 return(error);
1069}
1070
1071/*
1072 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1073 * if we are unable to disassociate all the inodes.
1074 */
1075static
1076int
1077hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1078{
1079 int res;
1080
1081 hammer_ref(&ip->lock);
1082 if (ip->lock.refs == 2 && ip->vp)
1083 vclean_unlocked(ip->vp);
1084 if (ip->lock.refs == 1 && ip->vp == NULL)
1085 res = 0;
1086 else
1087 res = -1; /* stop, someone is using the inode */
1088 hammer_rel_inode(ip, 0);
1089 return(res);
1090}
1091
1092int
1093hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1094{
1095 int res;
1096 int try;
1097
1098 for (try = res = 0; try < 4; ++try) {
1099 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1100 hammer_inode_pfs_cmp,
1101 hammer_unload_pseudofs_callback,
1102 &localization);
1103 if (res == 0 && try > 1)
1104 break;
1105 hammer_flusher_sync(trans->hmp);
1106 }
1107 if (res != 0)
1108 res = ENOTEMPTY;
1109 return(res);
1110}
1111
1112
1113/*
1114 * Release a reference on a PFS
1115 */
1116void
1117hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1118{
1119 hammer_unref(&pfsm->lock);
1120 if (pfsm->lock.refs == 0) {
1121 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1122 kfree(pfsm, hmp->m_misc);
1123 }
1124}
1125
1126/*
1127 * Called by hammer_sync_inode().
1128 */
1129static int
1130hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1131{
1132 hammer_transaction_t trans = cursor->trans;
1133 hammer_record_t record;
1134 int error;
1135 int redirty;
1136
1137retry:
1138 error = 0;
1139
1140 /*
1141 * If the inode has a presence on-disk then locate it and mark
1142 * it deleted, setting DELONDISK.
1143 *
1144 * The record may or may not be physically deleted, depending on
1145 * the retention policy.
1146 */
1147 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1148 HAMMER_INODE_ONDISK) {
1149 hammer_normalize_cursor(cursor);
1150 cursor->key_beg.localization = ip->obj_localization +
1151 HAMMER_LOCALIZE_INODE;
1152 cursor->key_beg.obj_id = ip->obj_id;
1153 cursor->key_beg.key = 0;
1154 cursor->key_beg.create_tid = 0;
1155 cursor->key_beg.delete_tid = 0;
1156 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1157 cursor->key_beg.obj_type = 0;
1158 cursor->asof = ip->obj_asof;
1159 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1160 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1161 cursor->flags |= HAMMER_CURSOR_BACKEND;
1162
1163 error = hammer_btree_lookup(cursor);
1164 if (hammer_debug_inode)
1165 kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1166
1167 if (error == 0) {
1168 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1169 if (hammer_debug_inode)
1170 kprintf(" error %d\n", error);
1171 if (error == 0) {
1172 ip->flags |= HAMMER_INODE_DELONDISK;
1173 }
1174 if (cursor->node)
1175 hammer_cache_node(&ip->cache[0], cursor->node);
1176 }
1177 if (error == EDEADLK) {
1178 hammer_done_cursor(cursor);
1179 error = hammer_init_cursor(trans, cursor,
1180 &ip->cache[0], ip);
1181 if (hammer_debug_inode)
1182 kprintf("IPDED %p %d\n", ip, error);
1183 if (error == 0)
1184 goto retry;
1185 }
1186 }
1187
1188 /*
1189 * Ok, write out the initial record or a new record (after deleting
1190 * the old one), unless the DELETED flag is set. This routine will
1191 * clear DELONDISK if it writes out a record.
1192 *
1193 * Update our inode statistics if this is the first application of
1194 * the inode on-disk.
1195 */
1196 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1197 /*
1198 * Generate a record and write it to the media. We clean-up
1199 * the state before releasing so we do not have to set-up
1200 * a flush_group.
1201 */
1202 record = hammer_alloc_mem_record(ip, 0);
1203 record->type = HAMMER_MEM_RECORD_INODE;
1204 record->flush_state = HAMMER_FST_FLUSH;
1205 record->leaf = ip->sync_ino_leaf;
1206 record->leaf.base.create_tid = trans->tid;
1207 record->leaf.data_len = sizeof(ip->sync_ino_data);
1208 record->leaf.create_ts = trans->time32;
1209 record->data = (void *)&ip->sync_ino_data;
1210 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1211
1212 /*
1213 * If this flag is set we cannot sync the new file size
1214 * because we haven't finished related truncations. The
1215 * inode will be flushed in another flush group to finish
1216 * the job.
1217 */
1218 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1219 ip->sync_ino_data.size != ip->ino_data.size) {
1220 redirty = 1;
1221 ip->sync_ino_data.size = ip->ino_data.size;
1222 } else {
1223 redirty = 0;
1224 }
1225
1226 for (;;) {
1227 error = hammer_ip_sync_record_cursor(cursor, record);
1228 if (hammer_debug_inode)
1229 kprintf("GENREC %p rec %08x %d\n",
1230 ip, record->flags, error);
1231 if (error != EDEADLK)
1232 break;
1233 hammer_done_cursor(cursor);
1234 error = hammer_init_cursor(trans, cursor,
1235 &ip->cache[0], ip);
1236 if (hammer_debug_inode)
1237 kprintf("GENREC reinit %d\n", error);
1238 if (error)
1239 break;
1240 }
1241
1242 /*
1243 * Note: The record was never on the inode's record tree
1244 * so just wave our hands importantly and destroy it.
1245 */
1246 record->flags |= HAMMER_RECF_COMMITTED;
1247 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1248 record->flush_state = HAMMER_FST_IDLE;
1249 ++ip->rec_generation;
1250 hammer_rel_mem_record(record);
1251
1252 /*
1253 * Finish up.
1254 */
1255 if (error == 0) {
1256 if (hammer_debug_inode)
1257 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1258 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1259 HAMMER_INODE_SDIRTY |
1260 HAMMER_INODE_ATIME |
1261 HAMMER_INODE_MTIME);
1262 ip->flags &= ~HAMMER_INODE_DELONDISK;
1263 if (redirty)
1264 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1265
1266 /*
1267 * Root volume count of inodes
1268 */
1269 hammer_sync_lock_sh(trans);
1270 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1271 hammer_modify_volume_field(trans,
1272 trans->rootvol,
1273 vol0_stat_inodes);
1274 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1275 hammer_modify_volume_done(trans->rootvol);
1276 ip->flags |= HAMMER_INODE_ONDISK;
1277 if (hammer_debug_inode)
1278 kprintf("NOWONDISK %p\n", ip);
1279 }
1280 hammer_sync_unlock(trans);
1281 }
1282 }
1283
1284 /*
1285 * If the inode has been destroyed, clean out any left-over flags
1286 * that may have been set by the frontend.
1287 */
1288 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
1289 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1290 HAMMER_INODE_SDIRTY |
1291 HAMMER_INODE_ATIME |
1292 HAMMER_INODE_MTIME);
1293 }
1294 return(error);
1295}
1296
1297/*
1298 * Update only the itimes fields.
1299 *
1300 * ATIME can be updated without generating any UNDO. MTIME is updated
1301 * with UNDO so it is guaranteed to be synchronized properly in case of
1302 * a crash.
1303 *
1304 * Neither field is included in the B-Tree leaf element's CRC, which is how
1305 * we can get away with updating ATIME the way we do.
1306 */
1307static int
1308hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1309{
1310 hammer_transaction_t trans = cursor->trans;
1311 int error;
1312
1313retry:
1314 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1315 HAMMER_INODE_ONDISK) {
1316 return(0);
1317 }
1318
1319 hammer_normalize_cursor(cursor);
1320 cursor->key_beg.localization = ip->obj_localization +
1321 HAMMER_LOCALIZE_INODE;
1322 cursor->key_beg.obj_id = ip->obj_id;
1323 cursor->key_beg.key = 0;
1324 cursor->key_beg.create_tid = 0;
1325 cursor->key_beg.delete_tid = 0;
1326 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1327 cursor->key_beg.obj_type = 0;
1328 cursor->asof = ip->obj_asof;
1329 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1330 cursor->flags |= HAMMER_CURSOR_ASOF;
1331 cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1332 cursor->flags |= HAMMER_CURSOR_GET_DATA;
1333 cursor->flags |= HAMMER_CURSOR_BACKEND;
1334
1335 error = hammer_btree_lookup(cursor);
1336 if (error == 0) {
1337 hammer_cache_node(&ip->cache[0], cursor->node);
1338 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1339 /*
1340 * Updating MTIME requires an UNDO. Just cover
1341 * both atime and mtime.
1342 */
1343 hammer_sync_lock_sh(trans);
1344 hammer_modify_buffer(trans, cursor->data_buffer,
1345 HAMMER_ITIMES_BASE(&cursor->data->inode),
1346 HAMMER_ITIMES_BYTES);
1347 cursor->data->inode.atime = ip->sync_ino_data.atime;
1348 cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1349 hammer_modify_buffer_done(cursor->data_buffer);
1350 hammer_sync_unlock(trans);
1351 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1352 /*
1353 * Updating atime only can be done in-place with
1354 * no UNDO.
1355 */
1356 hammer_sync_lock_sh(trans);
1357 hammer_modify_buffer(trans, cursor->data_buffer,
1358 NULL, 0);
1359 cursor->data->inode.atime = ip->sync_ino_data.atime;
1360 hammer_modify_buffer_done(cursor->data_buffer);
1361 hammer_sync_unlock(trans);
1362 }
1363 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1364 }
1365 if (error == EDEADLK) {
1366 hammer_done_cursor(cursor);
1367 error = hammer_init_cursor(trans, cursor,
1368 &ip->cache[0], ip);
1369 if (error == 0)
1370 goto retry;
1371 }
1372 return(error);
1373}
1374
1375/*
1376 * Release a reference on an inode, flush as requested.
1377 *
1378 * On the last reference we queue the inode to the flusher for its final
1379 * disposition.
1380 */
1381void
1382hammer_rel_inode(struct hammer_inode *ip, int flush)
1383{
1384 /*hammer_mount_t hmp = ip->hmp;*/
1385
1386 /*
1387 * Handle disposition when dropping the last ref.
1388 */
1389 for (;;) {
1390 if (ip->lock.refs == 1) {
1391 /*
1392 * Determine whether on-disk action is needed for
1393 * the inode's final disposition.
1394 */
1395 KKASSERT(ip->vp == NULL);
1396 hammer_inode_unloadable_check(ip, 0);
1397 if (ip->flags & HAMMER_INODE_MODMASK) {
1398 hammer_flush_inode(ip, 0);
1399 } else if (ip->lock.refs == 1) {
1400 hammer_unload_inode(ip);
1401 break;
1402 }
1403 } else {
1404 if (flush)
1405 hammer_flush_inode(ip, 0);
1406
1407 /*
1408 * The inode still has multiple refs, try to drop
1409 * one ref.
1410 */
1411 KKASSERT(ip->lock.refs >= 1);
1412 if (ip->lock.refs > 1) {
1413 hammer_unref(&ip->lock);
1414 break;
1415 }
1416 }
1417 }
1418}
1419
1420/*
1421 * Unload and destroy the specified inode. Must be called with one remaining
1422 * reference. The reference is disposed of.
1423 *
1424 * The inode must be completely clean.
1425 */
1426static int
1427hammer_unload_inode(struct hammer_inode *ip)
1428{
1429 hammer_mount_t hmp = ip->hmp;
1430
1431 KASSERT(ip->lock.refs == 1,
1432 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1433 KKASSERT(ip->vp == NULL);
1434 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1435 KKASSERT(ip->cursor_ip_refs == 0);
1436 KKASSERT(hammer_notlocked(&ip->lock));
1437 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1438
1439 KKASSERT(RB_EMPTY(&ip->rec_tree));
1440 KKASSERT(TAILQ_EMPTY(&ip->target_list));
1441
1442 RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1443
1444 hammer_free_inode(ip);
1445 return(0);
1446}
1447
1448/*
1449 * Called during unmounting if a critical error occurred. The in-memory
1450 * inode and all related structures are destroyed.
1451 *
1452 * If a critical error did not occur the unmount code calls the standard
1453 * release and asserts that the inode is gone.
1454 */
1455int
1456hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1457{
1458 hammer_record_t rec;
1459
1460 /*
1461 * Get rid of the inodes in-memory records, regardless of their
1462 * state, and clear the mod-mask.
1463 */
1464 while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1465 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1466 rec->target_ip = NULL;
1467 if (rec->flush_state == HAMMER_FST_SETUP)
1468 rec->flush_state = HAMMER_FST_IDLE;
1469 }
1470 while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1471 if (rec->flush_state == HAMMER_FST_FLUSH)
1472 --rec->flush_group->refs;
1473 else
1474 hammer_ref(&rec->lock);
1475 KKASSERT(rec->lock.refs == 1);
1476 rec->flush_state = HAMMER_FST_IDLE;
1477 rec->flush_group = NULL;
1478 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1479 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1480 ++ip->rec_generation;
1481 hammer_rel_mem_record(rec);
1482 }
1483 ip->flags &= ~HAMMER_INODE_MODMASK;
1484 ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1485 KKASSERT(ip->vp == NULL);
1486
1487 /*
1488 * Remove the inode from any flush group, force it idle. FLUSH
1489 * and SETUP states have an inode ref.
1490 */
1491 switch(ip->flush_state) {
1492 case HAMMER_FST_FLUSH:
1493 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1494 --ip->flush_group->refs;
1495 ip->flush_group = NULL;
1496 /* fall through */
1497 case HAMMER_FST_SETUP:
1498 hammer_unref(&ip->lock);
1499 ip->flush_state = HAMMER_FST_IDLE;
1500 /* fall through */
1501 case HAMMER_FST_IDLE:
1502 break;
1503 }
1504
1505 /*
1506 * There shouldn't be any associated vnode. The unload needs at
1507 * least one ref, if we do have a vp steal its ip ref.
1508 */
1509 if (ip->vp) {
1510 kprintf("hammer_destroy_inode_callback: Unexpected "
1511 "vnode association ip %p vp %p\n", ip, ip->vp);
1512 ip->vp->v_data = NULL;
1513 ip->vp = NULL;
1514 } else {
1515 hammer_ref(&ip->lock);
1516 }
1517 hammer_unload_inode(ip);
1518 return(0);
1519}
1520
1521/*
1522 * Called on mount -u when switching from RW to RO or vice-versa. Adjust
1523 * the read-only flag for cached inodes.
1524 *
1525 * This routine is called from a RB_SCAN().
1526 */
1527int
1528hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1529{
1530 hammer_mount_t hmp = ip->hmp;
1531
1532 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1533 ip->flags |= HAMMER_INODE_RO;
1534 else
1535 ip->flags &= ~HAMMER_INODE_RO;
1536 return(0);
1537}
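
/*
 * Illustrative sketch (not part of the original source, and the real
 * call site lives in the mount code rather than in this file): the
 * read-only flag adjustment is assumed to be applied with a full
 * tree scan along these lines.
 */
#if 0
        RB_SCAN(hammer_ino_rb_tree, &hmp->rb_inos_root, NULL,
                hammer_reload_inode, NULL);
#endif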
1538
1539/*
1540 * A transaction has modified an inode, requiring updates as specified by
1541 * the passed flags.
1542 *
1543 * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
1544 * and not including size changes due to write-append
1545 * (but other size changes are included).
1546 * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1547 * write-append.
1548 * HAMMER_INODE_XDIRTY: Dirty in-memory records
1549 * HAMMER_INODE_BUFS: Dirty buffer cache buffers
1550 * HAMMER_INODE_DELETED: Inode record/data must be deleted
1551 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1552 */
1553void
1554hammer_modify_inode(hammer_inode_t ip, int flags)
1555{
1556 /*
1557 * ronly of 0 or 2 does not trigger assertion.
1558 * 2 is a special error state
1559 */
1560 KKASSERT(ip->hmp->ronly != 1 ||
1561 (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1562 HAMMER_INODE_SDIRTY |
1563 HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1564 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1565 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1566 ip->flags |= HAMMER_INODE_RSV_INODES;
1567 ++ip->hmp->rsv_inodes;
1568 }
1569
1570 ip->flags |= flags;
1571}
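
/*
 * Illustrative sketch (not part of the original source): a typical
 * modification path updates the in-memory inode first and then flags
 * it, exactly as hammer_mkroot_pseudofs() does above.
 */
#if 0
        ++ip->ino_data.nlinks;
        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
#endif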
1572
1573/*
1574 * Request that an inode be flushed. This whole mess cannot block and may
1575 * recurse (if not synchronous). Once requested HAMMER will attempt to
1576 * actively flush the inode until the flush can be done.
1577 *
1578 * The inode may already be flushing, or may be in a setup state. We can
1579 * place the inode in a flushing state if it is currently idle and flag it
1580 * to reflush if it is currently flushing.
1581 *
1582 * Upon return if the inode could not be flushed due to a setup
1583 * dependency, then it will be automatically flushed when the dependency
1584 * is satisfied.
1585 */
1586void
1587hammer_flush_inode(hammer_inode_t ip, int flags)
1588{
1589 hammer_mount_t hmp;
1590 hammer_flush_group_t flg;
1591 int good;
1592
1593 /*
1594 * next_flush_group is the first flush group we can place the inode
1595 * in. It may be NULL. If it becomes full we append a new flush
1596 * group and make that the next_flush_group.
1597 */
1598 hmp = ip->hmp;
1599 while ((flg = hmp->next_flush_group) != NULL) {
1600 KKASSERT(flg->running == 0);
1601 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
1602 break;
1603 hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
1604 hammer_flusher_async(ip->hmp, flg);
1605 }
1606 if (flg == NULL) {
1607 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1608 hmp->next_flush_group = flg;
1609 RB_INIT(&flg->flush_tree);
1610 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1611 }
1612
1613 /*
1614 * Trivial 'nothing to flush' case. If the inode is in a SETUP
1615 * state we have to put it back into an IDLE state so we can
1616 * drop the extra ref.
1617 *
1618 * If we have a parent dependency we must still fall through
1619 * so we can run it.
1620 */
1621 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1622 if (ip->flush_state == HAMMER_FST_SETUP &&
1623 TAILQ_EMPTY(&ip->target_list)) {
1624 ip->flush_state = HAMMER_FST_IDLE;
1625 hammer_rel_inode(ip, 0);
1626 }
1627 if (ip->flush_state == HAMMER_FST_IDLE)
1628 return;
1629 }
1630
1631 /*
1632 * Our flush action will depend on the current state.
1633 */
1634 switch(ip->flush_state) {
1635 case HAMMER_FST_IDLE:
1636 /*
1637 * We have no dependencies and can flush immediately. Some of
1638 * our children may not be flushable, so we have to re-test
1639 * with that additional knowledge.
1640 */
1641 hammer_flush_inode_core(ip, flg, flags);
1642 break;
1643 case HAMMER_FST_SETUP:
1644 /*
1645 * Recurse upwards through dependencies via target_list
1646 * and start their flusher actions going if possible.
1647 *
1648 * 'good' is our connectivity. -1 means we have none and
1649 * can't flush, 0 means there weren't any dependencies, and
1650 * 1 means we have good connectivity.
1651 */
1652 good = hammer_setup_parent_inodes(ip, 0, flg);
1653
1654 if (good >= 0) {
1655 /*
1656 * We can continue if good >= 0. Determine how
1657 * many records under our inode can be flushed (and
1658 * mark them).
1659 */
1660 hammer_flush_inode_core(ip, flg, flags);
1661 } else {
1662 /*
1663 * Parent has no connectivity, tell it to flush
1664 * us as soon as it does.
1665 *
1666 * The REFLUSH flag is also needed to trigger
1667 * dependency wakeups.
1668 */
1669 ip->flags |= HAMMER_INODE_CONN_DOWN |
1670 HAMMER_INODE_REFLUSH;
1671 if (flags & HAMMER_FLUSH_SIGNAL) {
1672 ip->flags |= HAMMER_INODE_RESIGNAL;
1673 hammer_flusher_async(ip->hmp, flg);
1674 }
1675 }
1676 break;
1677 case HAMMER_FST_FLUSH:
1678 /*
1679 * We are already flushing, flag the inode to reflush
1680 * if needed after it completes its current flush.
1681 *
1682 * The REFLUSH flag is also needed to trigger
1683 * dependency wakeups.
1684 */
1685 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1686 ip->flags |= HAMMER_INODE_REFLUSH;
1687 if (flags & HAMMER_FLUSH_SIGNAL) {
1688 ip->flags |= HAMMER_INODE_RESIGNAL;
1689 hammer_flusher_async(ip->hmp, flg);
1690 }
1691 break;
1692 }
1693}
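
/*
 * Illustrative sketch (not part of the original source): callers
 * either queue a passive flush, as hammer_vop_inactive() does above,
 * or signal the flusher explicitly. An fsync-style caller is assumed
 * to wait for completion afterwards via hammer_wait_inode().
 */
#if 0
        if (ip->flags & HAMMER_INODE_MODMASK)
                hammer_flush_inode(ip, 0);              /* passive */

        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);    /* wake flusher */
        hammer_wait_inode(ip);                          /* fsync-style wait */
#endif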
1694
1695/*
1696 * Scan ip->target_list, which is a list of records owned by PARENT
1697 * inodes which reference our ip.
1698 *
1699 * XXX This is a huge mess of recursive code, but not one bit of it blocks
1700 * so for now do not ref/deref the structures. Note that if we use the
1701 * ref/rel code later, the rel CAN block.
1702 */
1703static int
1704hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1705 hammer_flush_group_t flg)
1706{
1707 hammer_record_t depend;
1708 int good;
1709 int r;
1710
1711 /*
1712 * If we hit our recursion limit and we have parent dependencies
1713 * we cannot continue. Returning < 0 will cause us to be flagged
1714 * for reflush. Returning -2 cuts off additional dependency checks
1715 * because they are likely to also hit the depth limit.
1716 *
1717 * We cannot return < 0 if there are no dependencies, as there might
1718 * not be anything to wake up (ip).
1719 */
1720 if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1721 kprintf("HAMMER Warning: depth limit reached on "
1722 "setup recursion, inode %p %016llx\n",
1723 ip, (long long)ip->obj_id);
1724 return(-2);
1725 }
1726
1727 /*
1728 * Scan dependencies
1729 */
1730 good = 0;
1731 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1732 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1733 KKASSERT(depend->target_ip == ip);
1734 if (r < 0 && good == 0)
1735 good = -1;
1736 if (r > 0)
1737 good = 1;
1738
1739 /*
1740 * If we failed due to the recursion depth limit then stop
1741 * now.
1742 */
1743 if (r == -2)
1744 break;
1745 }
1746 return(good);
1747}
1748
1749/*
1750 * This helper function takes a record representing the dependency between
1751 * the parent inode and child inode.
1752 *
1753 * record->ip = parent inode
1754 * record->target_ip = child inode
1755 *
1756 * We are asked to recurse upwards and convert the record from SETUP
1757 * to FLUSH if possible.
1758 *
1759 * Return 1 if the record gives us connectivity
1760 *
1761 * Return 0 if the record is not relevant
1762 *
1763 * Return -1 if we can't resolve the dependency and there is no connectivity.
1764 */
1765static int
1766hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1767 hammer_flush_group_t flg)
1768{
1769 hammer_mount_t hmp;
1770 hammer_inode_t pip;
1771 int good;
1772
1773 KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1774 pip = record->ip;
1775 hmp = pip->hmp;
1776
1777 /*
1778 * If the record is already flushing, is it in our flush group?
1779 *
1780 * If it is in our flush group but it is a general record or a
1781 * delete-on-disk, it does not improve our connectivity (return 0),
1782 * and if the target inode is not trying to destroy itself we can't
1783 * allow the operation yet anyway (the second return -1).
1784 */
1785 if (record->flush_state == HAMMER_FST_FLUSH) {
1786 /*
1787 * If not in our flush group ask the parent to reflush
1788 * us as soon as possible.
1789 */
1790 if (record->flush_group != flg) {
1791 pip->flags |= HAMMER_INODE_REFLUSH;
1792 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1793 return(-1);
1794 }
1795
1796 /*
1797 * If in our flush group everything is already set up,
1798 * just return whether the record will improve our
1799 * visibility or not.
1800 */
1801 if (record->type == HAMMER_MEM_RECORD_ADD)
1802 return(1);
1803 return(0);
1804 }
1805
1806 /*
1807 * It must be a setup record. Try to resolve the setup dependencies
1808 * by recursing upwards so we can place ip on the flush list.
1809 *
1810 * Limit ourselves to 20 levels of recursion to avoid blowing out
1811 * the kernel stack. If we hit the recursion limit we can't flush
1812 * until the parent flushes. The parent will flush independently
1813 * on its own and ultimately a deep recursion will be resolved.
1814 */
1815 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1816
1817 good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1818
1819 /*
1820 * If good < 0 the parent has no connectivity and we cannot safely
1821 * flush the directory entry, which also means we can't flush our
1822 * ip. Flag us for downward recursion once the parent's
1823 * connectivity is resolved. Flag the parent for [re]flush or it
1824 * may not check for downward recursions.
1825 */
1826 if (good < 0) {
1827 pip->flags |= HAMMER_INODE_REFLUSH;
1828 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1829 return(good);
1830 }
1831
1832 /*
1833 * We are go, place the parent inode in a flushing state so we can
1834 * place its record in a flushing state. Note that the parent
1835 * may already be flushing. The record must be in the same flush
1836 * group as the parent.
1837 */
1838 if (pip->flush_state != HAMMER_FST_FLUSH)
1839 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1840 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1841 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1842
1843#if 0
1844 if (record->type == HAMMER_MEM_RECORD_DEL &&
1845 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1846 /*
1847 * Regardless of flushing state we cannot sync this path if the
1848 * record represents a delete-on-disk but the target inode
1849 * is not ready to sync its own deletion.
1850 *
1851 * XXX need to count effective nlinks to determine whether
1852 * the flush is ok, otherwise removing a hardlink will
1853 * just leave the DEL record to rot.
1854 */
1855 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1856 return(-1);
1857 } else
1858#endif
1859 if (pip->flush_group == flg) {
1860 /*
1861 * Because we have not calculated nlinks yet we can just
1862 * set records to the flush state if the parent is in
1863 * the same flush group as we are.
1864 */
1865 record->flush_state = HAMMER_FST_FLUSH;
1866 record->flush_group = flg;
1867 ++record->flush_group->refs;
1868 hammer_ref(&record->lock);
1869
1870 /*
1871 * A general directory-add contributes to our visibility.
1872 *
1873 * Otherwise it is probably a directory-delete or
1874 * delete-on-disk record and does not contribute to our
1875 * visibility (but we can still flush it).
1876 */
1877 if (record->type == HAMMER_MEM_RECORD_ADD)
1878 return(1);
1879 return(0);
1880 } else {
1881 /*
1882 * If the parent is not in our flush group we cannot
1883 * flush this record yet, there is no visibility.
1884 * We tell the parent to reflush and mark ourselves
1885 * so the parent knows it should flush us too.
1886 */
1887 pip->flags |= HAMMER_INODE_REFLUSH;
1888 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1889 return(-1);
1890 }
1891}
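
/*
 * A minimal standalone sketch of the depth-limited upward recursion
 * idiom used above: walk towards the root resolving dependencies, but
 * bail out with a distinct code once a fixed depth is exceeded so the
 * kernel stack cannot be blown out; the deferred ancestors resolve
 * themselves on a later pass. The structure, depth limit, and function
 * names here are illustrative placeholders, not HAMMER definitions.
 */
#if 0
#define SKETCH_MAXDEPTH	20

struct sketch_node {
	struct sketch_node *parent;
	int ready;
};

static int
sketch_resolve(struct sketch_node *node, int depth)
{
	if (depth > SKETCH_MAXDEPTH)
		return(-1);		/* too deep, retry on a later pass */
	if (node->parent && node->parent->ready == 0) {
		if (sketch_resolve(node->parent, depth + 1) < 0)
			return(-1);	/* propagate the deferral */
	}
	node->ready = 1;		/* connectivity established */
	return(1);
}
#endif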
1892
1893/*
1894 * This is the core routine placing an inode into the FST_FLUSH state.
1895 */
1896static void
1897hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1898{
1899 int go_count;
1900
1901 /*
1902 * Set flush state and prevent the flusher from cycling into
1903 * the next flush group. Do not place the ip on the list yet.
1904 * Inodes not in the idle state get an extra reference.
1905 */
1906 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1907 if (ip->flush_state == HAMMER_FST_IDLE)
1908 hammer_ref(&ip->lock);
1909 ip->flush_state = HAMMER_FST_FLUSH;
1910 ip->flush_group = flg;
1911 ++ip->hmp->flusher.group_lock;
1912 ++ip->hmp->count_iqueued;
1913 ++hammer_count_iqueued;
1914 ++flg->total_count;
1915
1916 /*
1917 * If the flush group reaches the autoflush limit we want to signal
1918 * the flusher. This is particularly important for remove()s.
1919 */
1920 if (flg->total_count == hammer_autoflush)
1921 flags |= HAMMER_FLUSH_SIGNAL;
1922
1923 /*
1924 * We need to be able to vfsync/truncate from the backend.
1925 */
1926 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1927 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1928 ip->flags |= HAMMER_INODE_VHELD;
1929 vref(ip->vp);
1930 }
1931
1932 /*
1933 * Figure out how many in-memory records we can actually flush
1934 * (not including inode meta-data, buffers, etc).
1935 */
1936 KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
1937 if (flags & HAMMER_FLUSH_RECURSION) {
1938 /*
1939 * If this is an upwards recursion we do not want to
1940 * recurse down again!
1941 */
1942 go_count = 1;
1943#if 0
1944 } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1945 /*
1946 * No new records are added if we must complete a flush
1947 * from a previous cycle, but we do have to move the records
1948 * from the previous cycle to the current one.
1949 */
1950#if 0
1951 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1952 hammer_syncgrp_child_callback, NULL);
1953#endif
1954 go_count = 1;
1955#endif
1956 } else {
1957 /*
1958 * Normal flush, scan records and bring them into the flush.
1959 * Directory adds and deletes are usually skipped (they are
1960 * grouped with the related inode rather than with the
1961 * directory).
1962 *
1963 * go_count can be negative, which means the scan aborted
1964 * due to the flush group being over-full and we should
1965 * flush what we have.
1966 */
1967 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1968 hammer_setup_child_callback, NULL);
1969 }
1970
1971 /*
1972 * This is a more involved test that includes go_count. If we
1973 * can't flush, flag the inode and return. If go_count is 0 we
1974 * are unable to flush any records in our rec_tree and
1975 * must ignore the XDIRTY flag.
1976 */
1977 if (go_count == 0) {
1978 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1979 --ip->hmp->count_iqueued;
1980 --hammer_count_iqueued;
1981
1982 --flg->total_count;
1983 ip->flush_state = HAMMER_FST_SETUP;
1984 ip->flush_group = NULL;
1985 if (ip->flags & HAMMER_INODE_VHELD) {
1986 ip->flags &= ~HAMMER_INODE_VHELD;
1987 vrele(ip->vp);
1988 }
1989
1990 /*
1991 * REFLUSH is needed to trigger dependency wakeups
1992 * when an inode is in SETUP.
1993 */
1994 ip->flags |= HAMMER_INODE_REFLUSH;
1995 if (flags & HAMMER_FLUSH_SIGNAL) {
1996 ip->flags |= HAMMER_INODE_RESIGNAL;
1997 hammer_flusher_async(ip->hmp, flg);
1998 }
1999 if (--ip->hmp->flusher.group_lock == 0)
2000 wakeup(&ip->hmp->flusher.group_lock);
2001 return;
2002 }
2003 }
2004
2005 /*
2006 * Snapshot the state of the inode for the backend flusher.
2007 *
2008 * We continue to retain save_trunc_off even when all truncations
2009 * have been resolved as an optimization to determine if we can
2010 * skip the B-Tree lookup for overwrite deletions.
2011 *
2012 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2013 * and stays in ip->flags. Once set, it stays set until the
2014 * inode is destroyed.
2015 */
2016 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2017 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2018 ip->sync_trunc_off = ip->trunc_off;
2019 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2020 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2021 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2022
2023 /*
2024 * The save_trunc_off used to cache whether the B-Tree
2025 * holds any records past that point is not used until
2026 * after the truncation has succeeded, so we can safely
2027 * set it now.
2028 */
2029 if (ip->save_trunc_off > ip->sync_trunc_off)
2030 ip->save_trunc_off = ip->sync_trunc_off;
2031 }
2032 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2033 ~HAMMER_INODE_TRUNCATED);
2034 ip->sync_ino_leaf = ip->ino_leaf;
2035 ip->sync_ino_data = ip->ino_data;
2036 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2037#ifdef DEBUG_TRUNCATE
2038 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2039 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2040#endif
2041
2042 /*
2043 * The flusher list inherits our inode and reference.
2044 */
2045 KKASSERT(flg->running == 0);
2046 RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2047 if (--ip->hmp->flusher.group_lock == 0)
2048 wakeup(&ip->hmp->flusher.group_lock);
2049
2050 if (flags & HAMMER_FLUSH_SIGNAL) {
2051 hammer_flusher_async(ip->hmp, flg);
2052 }
2053}
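
/*
 * The snapshot step above distills to a two-line bitmask idiom: the
 * modified-state bits migrate from the frontend flags word into the
 * backend sync_flags word, while the sticky bit (TRUNCATED above) is
 * handled separately. A hedged sketch with placeholder masks, not the
 * real HAMMER_INODE_* definitions:
 */
#if 0
#define SKETCH_MODMASK	0x00FF		/* hypothetical modified-state bits */
#define SKETCH_STICKY	0x0001		/* hypothetical sticky bit in MODMASK */

static void
sketch_snapshot_flags(int *flags, int *sync_flags)
{
	/* hand the non-sticky mod bits to the backend ... */
	*sync_flags |= *flags & SKETCH_MODMASK & ~SKETCH_STICKY;
	/* ... and clear them on the frontend, preserving the sticky bit */
	*flags &= ~SKETCH_MODMASK | SKETCH_STICKY;
}
#endif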
2054
2055/*
2056 * Callback for scan of ip->rec_tree. Try to include each record in our
2057 * flush. ip->flush_group has been set but the inode has not yet been
2058 * moved into a flushing state.
2059 *
2060 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2061 * both inodes.
2062 *
2063 * We return 1 for any record placed or found in FST_FLUSH, which prevents
2064 * the caller from shortcutting the flush.
2065 */
2066static int
2067hammer_setup_child_callback(hammer_record_t rec, void *data)
2068{
2069 hammer_flush_group_t flg;
2070 hammer_inode_t target_ip;
2071 hammer_inode_t ip;
2072 int r;
2073
2074 /*
2075 * Records deleted or committed by the backend are ignored.
2076 * Note that the flush detects deleted frontend records at
2077 * multiple points to deal with races. This is just the first
2078 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot
2079 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2080 * messes up link-count calculations.
2081 *
2082 * NOTE: Don't get confused between record deletion and, say,
2083 * directory entry deletion. The deletion of a directory entry
2084 * which is on-media has nothing to do with the record deletion
2085 * flags.
2086 */
2087 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2088 HAMMER_RECF_COMMITTED)) {
2089 if (rec->flush_state == HAMMER_FST_FLUSH) {
2090 KKASSERT(rec->flush_group == rec->ip->flush_group);
2091 r = 1;
2092 } else {
2093 r = 0;
2094 }
2095 return(r);
2096 }
2097
2098 /*
2099 * If the record is in an idle state it has no dependencies and
2100 * can be flushed.
2101 */
2102 ip = rec->ip;
2103 flg = ip->flush_group;
2104 r = 0;
2105
2106 switch(rec->flush_state) {
2107 case HAMMER_FST_IDLE:
2108 /*
2109 * The record has no setup dependency, we can flush it.
2110 */
2111 KKASSERT(rec->target_ip == NULL);
2112 rec->flush_state = HAMMER_FST_FLUSH;
2113 rec->flush_group = flg;
2114 ++flg->refs;
2115 hammer_ref(&rec->lock);
2116 r = 1;
2117 break;
2118 case HAMMER_FST_SETUP:
2119 /*
2120 * The record has a setup dependency. These are typically
2121 * directory entry adds and deletes. Such entries will be
2122 * flushed when their inodes are flushed so we do not
2123 * usually have to add them to the flush here. However,
2124 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2125 * it is asking us to flush this record (and it).
2126 */
2127 target_ip = rec->target_ip;
2128 KKASSERT(target_ip != NULL);
2129 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2130
2131 /*
2132 * If the target IP is already flushing in our group
2133 * we could associate the record, but target_ip has
2134 * already synced ino_data to sync_ino_data and we
2135 * would also have to adjust nlinks. Plus there are
2136 * ordering issues for adds and deletes.
2137 *
2138 * Reflush downward if this is an ADD, and upward if
2139 * this is a DEL.
2140 */
2141 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2142 if (rec->type == HAMMER_MEM_RECORD_ADD)
2143 ip->flags |= HAMMER_INODE_REFLUSH;
2144 else
2145 target_ip->flags |= HAMMER_INODE_REFLUSH;
2146 break;
2147 }
2148
2149 /*
2150 * Target IP is not yet flushing. This can get complex
2151 * because we have to be careful about the recursion.
2152 *
2153 * Directories create an issue for us in that if a flush
2154 * of a directory is requested the expectation is to flush
2155 * any pending directory entries, but this will cause the
2156 * related inodes to recursively flush as well. We can't
2157 * really defer the operation so just get as many as we
2158 * can and flush the rest in a later group.
2159 */
2160#if 0
2161 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2162 (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2163 /*
2164 * We aren't reclaiming and the target ip was not
2165 * previously prevented from flushing due to this
2166 * record dependency. Do not flush this record.
2167 */
2168 /*r = 0;*/
2169 } else
2170#endif
2171 if (flg->total_count + flg->refs >
2172 ip->hmp->undo_rec_limit) {
2173 /*
2174 * Our flush group is over-full and we risk blowing
2175 * out the UNDO FIFO. Stop the scan, flush what we
2176 * have, then reflush the directory.
2177 *
2178 * The directory may be forced through multiple
2179 * flush groups before it can be completely
2180 * flushed.
2181 */
2182 ip->flags |= HAMMER_INODE_RESIGNAL |
2183 HAMMER_INODE_REFLUSH;
2184 r = -1;
2185 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2186 /*
2187 * If the target IP is not flushing we can force
2188 * it to flush. Even if it is unable to write out
2189 * any of its own records we have at least one in
2190 * hand that we CAN deal with.
2191 */
2192 rec->flush_state = HAMMER_FST_FLUSH;
2193 rec->flush_group = flg;
2194 ++flg->refs;
2195 hammer_ref(&rec->lock);
2196 hammer_flush_inode_core(target_ip, flg,
2197 HAMMER_FLUSH_RECURSION);
2198 r = 1;
2199 } else {
2200 /*
2201 * General or delete-on-disk record.
2202 *
2203 * XXX this needs help. If this is a delete-on-disk we could
2204 * disconnect the target. If the target has its own
2205 * dependencies they really need to be flushed.
2206 *
2207 * XXX
2208 */
2209 rec->flush_state = HAMMER_FST_FLUSH;
2210 rec->flush_group = flg;
2211 ++flg->refs;
2212 hammer_ref(&rec->lock);
2213 hammer_flush_inode_core(target_ip, flg,
2214 HAMMER_FLUSH_RECURSION);
2215 r = 1;
2216 }
2217 break;
2218 case HAMMER_FST_FLUSH:
2219 /*
2220 * The flush_group should already match.
2221 */
2222 KKASSERT(rec->flush_group == flg);
2223 r = 1;
2224 break;
2225 }
2226 return(r);
2227}
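
/*
 * The per-record handling above is a three-state machine. A hedged
 * sketch of the transitions (the enum shadows the HAMMER_FST_*
 * constants; the trigger conditions are paraphrased from the callback):
 */
#if 0
enum sketch_fst { SKETCH_FST_IDLE, SKETCH_FST_SETUP, SKETCH_FST_FLUSH };

static enum sketch_fst
sketch_record_advance(enum sketch_fst state, int dependency_resolved)
{
	switch(state) {
	case SKETCH_FST_IDLE:
		/* no setup dependency, can always join the flush */
		return(SKETCH_FST_FLUSH);
	case SKETCH_FST_SETUP:
		/* joins only once the target inode is in the same group */
		return(dependency_resolved ? SKETCH_FST_FLUSH :
					     SKETCH_FST_SETUP);
	case SKETCH_FST_FLUSH:
		/* owned by the backend until the flush completes */
		return(SKETCH_FST_FLUSH);
	}
	return(state);
}
#endif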
2228
2229#if 0
2230/*
2231 * This version just moves records already in a flush state to the new
2232 * flush group and that is it.
2233 */
2234static int
2235hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2236{
2237 hammer_inode_t ip = rec->ip;
2238
2239 switch(rec->flush_state) {
2240 case HAMMER_FST_FLUSH:
2241 KKASSERT(rec->flush_group == ip->flush_group);
2242 break;
2243 default:
2244 break;
2245 }
2246 return(0);
2247}
2248#endif
2249
2250/*
2251 * Wait for a previously queued flush to complete.
2252 *
2253 * If a critical error occurred we don't try to wait.
2254 */
2255void
2256hammer_wait_inode(hammer_inode_t ip)
2257{
2258 hammer_flush_group_t flg;
2259
2260 flg = NULL;
2261 if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2262 while (ip->flush_state != HAMMER_FST_IDLE &&
2263 (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2264 if (ip->flush_state == HAMMER_FST_SETUP)
2265 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2266 if (ip->flush_state != HAMMER_FST_IDLE) {
2267 ip->flags |= HAMMER_INODE_FLUSHW;
2268 tsleep(&ip->flags, 0, "hmrwin", 0);
2269 }
2270 }
2271 }
2272}
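
/*
 * hammer_wait_inode() pairs with hammer_flush_inode_done() via a
 * classic flag-plus-tsleep()/wakeup() handshake on the address of
 * ip->flags. A condensed sketch of the two halves, assuming only the
 * tsleep() and wakeup() primitives already used in this file (the
 * helper names are illustrative):
 */
#if 0
static void
sketch_waiter_side(hammer_inode_t ip)
{
	while (ip->flush_state != HAMMER_FST_IDLE) {
		ip->flags |= HAMMER_INODE_FLUSHW;	/* announce waiter */
		tsleep(&ip->flags, 0, "hmrwin", 0);
	}
}

static void
sketch_completion_side(hammer_inode_t ip)
{
	if (ip->flags & HAMMER_INODE_FLUSHW) {
		ip->flags &= ~HAMMER_INODE_FLUSHW;
		wakeup(&ip->flags);			/* release waiters */
	}
}
#endif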
2273
2274/*
2275 * Called by the backend code when a flush has been completed.
2276 * The inode has already been removed from the flush list.
2277 *
2278 * A pipelined flush can occur, in which case we must re-enter the
2279 * inode on the list and re-copy its fields.
2280 */
2281void
2282hammer_flush_inode_done(hammer_inode_t ip, int error)
2283{
2284 hammer_mount_t hmp;
2285 int dorel;
2286
2287 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2288
2289 hmp = ip->hmp;
2290
2291 /*
2292 * Auto-reflush if the backend could not completely flush
2293 * the inode. This fixes a case where a deferred buffer flush
2294 * could cause fsync to return early.
2295 */
2296 if (ip->sync_flags & HAMMER_INODE_MODMASK)
2297 ip->flags |= HAMMER_INODE_REFLUSH;
2298
2299 /*
2300 * Merge left-over flags back into the frontend and fix the state.
2301 * Incomplete truncations are retained by the backend.
2302 */
2303 ip->error = error;
2304 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2305 ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2306
2307 /*
2308 * The backend may have adjusted nlinks, so if the adjusted nlinks
2309 * does not match the frontend, set the frontend's DDIRTY flag again.
2310 */
2311 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2312 ip->flags |= HAMMER_INODE_DDIRTY;
2313
2314 /*
2315 * Fix up the dirty buffer status.
2316 */
2317 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2318 ip->flags |= HAMMER_INODE_BUFS;
2319 }
2320
2321 /*
2322 * Re-set the XDIRTY flag if some of the inode's in-memory records
2323 * could not be flushed.
2324 */
2325 KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2326 (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2327 (!RB_EMPTY(&ip->rec_tree) &&
2328 (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2329
2330 /*
2331 * Do not lose track of inodes which no longer have vnode
2332 * associations, otherwise they may never get flushed again.
2333 *
2334 * The reflush flag can be set superfluously, causing extra pain
2335 * for no reason. If the inode is no longer modified it no longer
2336 * needs to be flushed.
2337 */
2338 if (ip->flags & HAMMER_INODE_MODMASK) {
2339 if (ip->vp == NULL)
2340 ip->flags |= HAMMER_INODE_REFLUSH;
2341 } else {
2342 ip->flags &= ~HAMMER_INODE_REFLUSH;
2343 }
2344
2345 /*
2346 * Adjust the flush state.
2347 */
2348 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2349 /*
2350 * We were unable to flush out all our records, leave the
2351 * inode in a flush state and in the current flush group.
2352 * The flush group will be re-run.
2353 *
2354 * This occurs if the UNDO block gets too full or there is
2355 * too much dirty meta-data and allows the flusher to
2356 * finalize the UNDO block and then re-flush.
2357 */
2358 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2359 dorel = 0;
2360 } else {
2361 /*
2362 * Remove from the flush_group
2363 */
2364 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2365 ip->flush_group = NULL;
2366
2367 /*
2368 * Clean up the vnode ref and tracking counts.
2369 */
2370 if (ip->flags & HAMMER_INODE_VHELD) {
2371 ip->flags &= ~HAMMER_INODE_VHELD;
2372 vrele(ip->vp);
2373 }
2374 --hmp->count_iqueued;
2375 --hammer_count_iqueued;
2376
2377 /*
2378 * And adjust the state.
2379 */
2380 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2381 ip->flush_state = HAMMER_FST_IDLE;
2382 dorel = 1;
2383 } else {
2384 ip->flush_state = HAMMER_FST_SETUP;
2385 dorel = 0;
2386 }
2387
2388 /*
2389 * If the frontend is waiting for a flush to complete,
2390 * wake it up.
2391 */
2392 if (ip->flags & HAMMER_INODE_FLUSHW) {
2393 ip->flags &= ~HAMMER_INODE_FLUSHW;
2394 wakeup(&ip->flags);
2395 }
2396
2397 /*
2398 * If the frontend made more changes and requested another
2399 * flush, then try to get it running.
2400 *
2401 * Reflushes are aborted when the inode is errored out.
2402 */
2403 if (ip->flags & HAMMER_INODE_REFLUSH) {
2404 ip->flags &= ~HAMMER_INODE_REFLUSH;
2405 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2406 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2407 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2408 } else {
2409 hammer_flush_inode(ip, 0);
2410 }
2411 }
2412 }
2413
2414 /*
2415 * If we have no parent dependencies we can clear CONN_DOWN
2416 */
2417 if (TAILQ_EMPTY(&ip->target_list))
2418 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2419
2420 /*
2421 * If the inode is now clean drop the space reservation.
2422 */
2423 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2424 (ip->flags & HAMMER_INODE_RSV_INODES)) {
2425 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2426 --hmp->rsv_inodes;
2427 }
2428
2429 if (dorel)
2430 hammer_rel_inode(ip, 0);
2431}
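
/*
 * The post-flush state decision above reduces to two emptiness tests.
 * A hedged restatement as a standalone helper (the function name is
 * illustrative, the fields and constants are the real ones):
 */
#if 0
static int
sketch_post_flush_state(hammer_inode_t ip)
{
	if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree))
		return(HAMMER_FST_IDLE);	/* fully clean, drop ref */
	return(HAMMER_FST_SETUP);		/* dependencies remain */
}
#endif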
2432
2433/*
2434 * Called from hammer_sync_inode() to synchronize in-memory records
2435 * to the media.
2436 */
2437static int
2438hammer_sync_record_callback(hammer_record_t record, void *data)
2439{
2440 hammer_cursor_t cursor = data;
2441 hammer_transaction_t trans = cursor->trans;
2442 hammer_mount_t hmp = trans->hmp;
2443 int error;
2444
2445 /*
2446 * Skip records that do not belong to the current flush.
2447 */
2448 ++hammer_stats_record_iterations;
2449 if (record->flush_state != HAMMER_FST_FLUSH)
2450 return(0);
2451
2452#if 1
2453 if (record->flush_group != record->ip->flush_group) {
2454 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2455 if (hammer_debug_critical)
2456 Debugger("blah2");
2457 return(0);
2458 }
2459#endif
2460 KKASSERT(record->flush_group == record->ip->flush_group);
2461
2462 /*
2463 * Interlock the record using the BE flag. Once BE is set the
2464 * frontend cannot change the state of FE.
2465 *
2466 * NOTE: If FE is set prior to us setting BE we still sync the
2467 * record out, but the flush completion code converts it to
2468 * a delete-on-disk record instead of destroying it.
2469 */
2470 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2471 record->flags |= HAMMER_RECF_INTERLOCK_BE;
2472
2473 /*
2474 * The backend has already disposed of the record.
2475 */
2476 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2477 error = 0;
2478 goto done;
2479 }
2480
2481 /*
2482 * If the whole inode is being deleted, all on-disk records will
2483 * be deleted very soon; we can't sync any new records to disk
2484 * because they will be deleted in the same transaction they were
2485 * created in (delete_tid == create_tid), which will assert.
2486 *
2487 * XXX There may be a case with RECORD_ADD with DELETED_FE set
2488 * that we currently panic on.
2489 */
2490 if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2491 switch(record->type) {
2492 case HAMMER_MEM_RECORD_DATA:
2493 /*
2494 * We don't have to do anything; if the record was
2495 * committed the space will have been accounted for
2496 * in the blockmap.
2497 */
2498 /* fall through */
2499 case HAMMER_MEM_RECORD_GENERAL:
2500 /*
2501 * Set deleted-by-backend flag. Do not set the
2502 * backend committed flag, because we are throwing
2503 * the record away.
2504 */
2505 record->flags |= HAMMER_RECF_DELETED_BE;
2506 ++record->ip->rec_generation;
2507 error = 0;
2508 goto done;
2509 case HAMMER_MEM_RECORD_ADD:
2510 panic("hammer_sync_record_callback: illegal add "
2511 "during inode deletion record %p", record);
2512 break; /* NOT REACHED */
2513 case HAMMER_MEM_RECORD_INODE:
2514 panic("hammer_sync_record_callback: attempt to "
2515 "sync inode record %p?", record);
2516 break; /* NOT REACHED */
2517 case HAMMER_MEM_RECORD_DEL:
2518 /*
2519 * Follow through and issue the on-disk deletion
2520 */
2521 break;
2522 }
2523 }
2524
2525 /*
2526 * If DELETED_FE is set special handling is needed for directory
2527 * entries. Dependent pieces related to the directory entry may
2528 * have already been synced to disk. If this occurs we have to
2529 * sync the directory entry and then change the in-memory record
2530 * from an ADD to a DELETE to cover the fact that it's been
2531 * deleted by the frontend.
2532 *
2533 * A directory delete covering record (MEM_RECORD_DEL) can never
2534 * be deleted by the frontend.
2535 *
2536 * Any other record type (aka DATA) can be deleted by the frontend.
2537 * XXX At the moment the flusher must skip it because there may
2538 * be another data record in the flush group for the same block,
2539 * meaning that some frontend data changes can leak into the backend's
2540 * synchronization point.
2541 */
2542 if (record->flags & HAMMER_RECF_DELETED_FE) {
2543 if (record->type == HAMMER_MEM_RECORD_ADD) {
2544 /*
2545 * Convert a front-end deleted directory-add to
2546 * a directory-delete entry later.
2547 */
2548 record->flags |= HAMMER_RECF_CONVERT_DELETE;
2549 } else {
2550 /*
2551 * Dispose of the record (race case). Mark as
2552 * deleted by backend (and not committed).
2553 */
2554 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2555 record->flags |= HAMMER_RECF_DELETED_BE;
2556 ++record->ip->rec_generation;
2557 error = 0;
2558 goto done;
2559 }
2560 }
2561
2562 /*
2563 * Assign the create_tid for new records. Deletions already
2564 * have the record's entire key properly set up.
2565 */
2566 if (record->type != HAMMER_MEM_RECORD_DEL) {
2567 record->leaf.base.create_tid = trans->tid;
2568 record->leaf.create_ts = trans->time32;
2569 }
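	/*
	 * Sync the record to media. EDEADLK means a competing thread
	 * forced us to back out; tear the cursor down, rebuild it from
	 * the inode's cache hint, and retry. Any other result, success
	 * or hard error, terminates the loop.
	 */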
2570 for (;;) {
2571 error = hammer_ip_sync_record_cursor(cursor, record);
2572 if (error != EDEADLK)
2573 break;
2574 hammer_done_cursor(cursor);
2575 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2576 record->ip);
2577 if (error)
2578 break;
2579 }
2580 record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2581
2582 if (error)
2583 error = -error;
2584done:
2585 hammer_flush_record_done(record, error);
2586
2587 /*
2588 * Do partial finalization if we have built up too many dirty
2589 * buffers. Otherwise a buffer cache deadlock can occur when
2590 * doing things like creating tens of thousands of tiny files.
2591 *
2592 * We must release our cursor lock to avoid a 3-way deadlock
2593 * due to the exclusive sync lock the finalizer must get.
2594 *
2595 * WARNING: See warnings in hammer_unlock_cursor() function.
2596 */
2597 if (hammer_flusher_meta_limit(hmp)) {
2598 hammer_unlock_cursor(cursor);
2599 hammer_flusher_finalize(trans, 0);
2600 hammer_lock_cursor(cursor);
2601 }
2602
2603 return(error);
2604}
2605
2606/*
2607 * Backend function called by the flusher to sync an inode to media.
2608 */
2609int
2610hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2611{
2612 struct hammer_cursor cursor;
2613 hammer_node_t tmp_node;
2614 hammer_record_t depend;
2615 hammer_record_t next;
2616 int error, tmp_error;
2617 u_int64_t nlinks;
2618
2619 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2620 return(0);
2621
2622 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2623 if (error)
2624 goto done;
2625
2626 /*
2627 * Any directory records referencing this inode which are not in
2628 * our current flush group must adjust our nlink count for the
2629 * purposes of synchronization to disk.
2630 *
2631 * Records which are in our flush group can be unlinked from our
2632 * inode now, potentially allowing the inode to be physically
2633 * deleted.
2634 *
2635 * This cannot block.
2636 */
2637 nlinks = ip->ino_data.nlinks;
2638 next = TAILQ_FIRST(&ip->target_list);
2639 while ((depend = next) != NULL) {
2640 next = TAILQ_NEXT(depend, target_entry);
2641 if (depend->flush_state == HAMMER_FST_FLUSH &&
2642 depend->flush_group == ip->flush_group) {
2643 /*
2644 * If this is an ADD that was deleted by the frontend
2645 * the frontend nlinks count will have already been
2646 * decremented, but the backend is going to sync its
2647 * directory entry and must account for it. The
2648 * record will be converted to a delete-on-disk when
2649 * it gets synced.
2650 *
2651 * If the ADD was not deleted by the frontend we
2652 * can remove the dependency from our target_list.
2653 */
2654 if (depend->flags & HAMMER_RECF_DELETED_FE) {
2655 ++nlinks;
2656 } else {
2657 TAILQ_REMOVE(&ip->target_list, depend,
2658 target_entry);
2659 depend->target_ip = NULL;
2660 }
2661 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2662 /*
2663 * Not part of our flush group and not deleted by
2664 * the front-end, adjust the link count synced to
2665 * the media (undo what the frontend did when it
2666 * queued the record).
2667 */
2668 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2669 switch(depend->type) {
2670 case HAMMER_MEM_RECORD_ADD:
2671 --nlinks;
2672 break;
2673 case HAMMER_MEM_RECORD_DEL:
2674 ++nlinks;
2675 break;
2676 default:
2677 break;
2678 }
2679 }
2680 }
2681
2682 /*
2683 * Set dirty if we had to modify the link count.
2684 */
2685 if (ip->sync_ino_data.nlinks != nlinks) {
2686 KKASSERT((int64_t)nlinks >= 0);
2687 ip->sync_ino_data.nlinks = nlinks;
2688 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2689 }
2690
2691 /*
2692 * If there is a truncation queued, destroy any data past the (aligned)
2693 * truncation point. Userland will have dealt with the buffer
2694 * containing the truncation point for us.
2695 *
2696 * We don't flush pending frontend data buffers until after we've
2697 * dealt with the truncation.
2698 */
2699 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2700 /*
2701 * Interlock trunc_off. The VOP front-end may continue to
2702 * make adjustments to it while we are blocked.
2703 */
2704 off_t trunc_off;
2705 off_t aligned_trunc_off;
2706 int blkmask;
2707
2708 trunc_off = ip->sync_trunc_off;
2709 blkmask = hammer_blocksize(trunc_off) - 1;
2710 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
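		/*
		 * aligned_trunc_off rounds the truncation point up to the
		 * next block boundary. Worked example, assuming a 16KiB
		 * block (blkmask 0x3FFF): trunc_off 0x12345 yields
		 * (0x12345 + 0x3FFF) & ~0x3FFF = 0x14000. The partial
		 * block below the boundary was already handled by the
		 * frontend.
		 */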
2711
2712 /*
2713 * Delete any whole blocks on-media. The front-end has
2714 * already cleaned out any partial block and made it
2715 * pending. The front-end may have updated trunc_off
2716 * while we were blocked so we only use sync_trunc_off.
2717 *
2718 * This operation can blow out the buffer cache, EWOULDBLOCK
2719 * means we were unable to complete the deletion. The
2720 * deletion will update sync_trunc_off in that case.
2721 */
2722 error = hammer_ip_delete_range(&cursor, ip,
2723 aligned_trunc_off,
2724 0x7FFFFFFFFFFFFFFFLL, 2);
2725 if (error == EWOULDBLOCK) {
2726 ip->flags |= HAMMER_INODE_WOULDBLOCK;
2727 error = 0;
2728 goto defer_buffer_flush;
2729 }
2730
2731 if (error)
2732 goto done;
2733
2734 /*
2735 * Clear the truncation flag on the backend after we have
2736 * completed the deletions. Backend data is now good again
2737 * (including new records we are about to sync, below).
2738 *
2739 * Leave sync_trunc_off intact. As we write additional
2740 * records the backend will update sync_trunc_off. This
2741 * tells the backend whether it can skip the overwrite
2742 * test. This should work properly even when the backend
2743 * writes full blocks where the truncation point straddles
2744 * the block because the comparison is against the base
2745 * offset of the record.
2746 */
2747 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2748 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2749 } else {
2750 error = 0;
2751 }
2752
2753 /*
2754 * Now sync related records. These will typically be directory
2755 * entries, records tracking direct-writes, or delete-on-disk records.
2756 */
2757 if (error == 0) {
2758 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2759 hammer_sync_record_callback, &cursor);
2760 if (tmp_error < 0)
2761 tmp_error = -tmp_error;
2762 if (tmp_error)
2763 error = tmp_error;
2764 }
2765 hammer_cache_node(&ip->cache[1], cursor.node);
2766
2767 /*
2768 * Re-seek for inode update, assuming our cache hasn't been ripped
2769 * out from under us.
2770 */
2771 if (error == 0) {
2772 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2773 if (tmp_node) {
2774 hammer_cursor_downgrade(&cursor);
2775 hammer_lock_sh(&tmp_node->lock);
2776 if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2777 hammer_cursor_seek(&cursor, tmp_node, 0);
2778 hammer_unlock(&tmp_node->lock);
2779 hammer_rel_node(tmp_node);
2780 }
2781 error = 0;
2782 }
2783
2784 /*
2785 * If we are deleting the inode the frontend had better not have
2786 * any active references on elements making up the inode.
2787 *
2788 * The call to hammer_ip_delete_clean() cleans up auxiliary records
2789 * but not DB or DATA records. Those must have already been deleted
2790 * by the normal truncation mechanic.
2791 */
2792 if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2793 RB_EMPTY(&ip->rec_tree) &&
2794 (ip->sync_flags & HAMMER_INODE_DELETING) &&
2795 (ip->flags & HAMMER_INODE_DELETED) == 0) {
2796 int count1 = 0;
2797
2798 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2799 if (error == 0) {
2800 ip->flags |= HAMMER_INODE_DELETED;
2801 ip->sync_flags &= ~HAMMER_INODE_DELETING;
2802 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2803 KKASSERT(RB_EMPTY(&ip->rec_tree));
2804
2805 /*
2806 * Set delete_tid in both the frontend and backend
2807 * copy of the inode record. The DELETED flag handles
2808 * this, do not set DDIRTY.
2809 */
2810 ip->ino_leaf.base.delete_tid = trans->tid;
2811 ip->sync_ino_leaf.base.delete_tid = trans->tid;
2812 ip->ino_leaf.delete_ts = trans->time32;
2813 ip->sync_ino_leaf.delete_ts = trans->time32;
2814
2815
2816 /*
2817 * Adjust the inode count in the volume header
2818 */
2819 hammer_sync_lock_sh(trans);
2820 if (ip->flags & HAMMER_INODE_ONDISK) {
2821 hammer_modify_volume_field(trans,
2822 trans->rootvol,
2823 vol0_stat_inodes);
2824 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2825 hammer_modify_volume_done(trans->rootvol);
2826 }
2827 hammer_sync_unlock(trans);
2828 }
2829 }
2830
2831 if (error)
2832 goto done;
2833 ip->sync_flags &= ~HAMMER_INODE_BUFS;
2834
2835defer_buffer_flush:
2836 /*
2837 * Now update the inode's on-disk inode-data and/or on-disk record.
2838 * DELETED and ONDISK are managed only in ip->flags.
2839 *
2840 * In the case of a deferred buffer flush we still update the on-disk
2841 * inode to satisfy visibility requirements if there happen to be
2842 * directory dependencies.
2843 */
2844 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2845 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2846 /*
2847 * If deleted and on-disk, don't set any additional flags.
2848 * The delete flag takes care of things.
2849 *
2850 * Clear flags which may have been set by the frontend.
2851 */
2852 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2853 HAMMER_INODE_SDIRTY |
2854 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2855 HAMMER_INODE_DELETING);
2856 break;
2857 case HAMMER_INODE_DELETED:
2858 /*
2859 * Take care of the case where a deleted inode was never
2860 * flushed to the disk in the first place.
2861 *
2862 * Clear flags which may have been set by the frontend.
2863 */
2864 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2865 HAMMER_INODE_SDIRTY |
2866 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2867 HAMMER_INODE_DELETING);
2868 while (RB_ROOT(&ip->rec_tree)) {
2869 hammer_record_t record = RB_ROOT(&ip->rec_tree);
2870 hammer_ref(&record->lock);
2871 KKASSERT(record->lock.refs == 1);
2872 record->flags |= HAMMER_RECF_DELETED_BE;
2873 ++record->ip->rec_generation;
2874 hammer_rel_mem_record(record);
2875 }
2876 break;
2877 case HAMMER_INODE_ONDISK:
2878 /*
2879 * If already on-disk, do not set any additional flags.
2880 */
2881 break;
2882 default:
2883 /*
2884 * If not on-disk and not deleted, set DDIRTY to force
2885 * an initial record to be written.
2886 *
2887 * Also set the create_tid in both the frontend and backend
2888 * copy of the inode record.
2889 */
2890 ip->ino_leaf.base.create_tid = trans->tid;
2891 ip->ino_leaf.create_ts = trans->time32;
2892 ip->sync_ino_leaf.base.create_tid = trans->tid;
2893 ip->sync_ino_leaf.create_ts = trans->time32;
2894 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2895 break;
2896 }
2897
2898 /*
2899 * If DDIRTY or SDIRTY is set, write out a new record.
2900 * If the inode is already on-disk the old record is marked as
2901 * deleted.
2902 *
2903 * If DELETED is set hammer_update_inode() will delete the existing
2904 * record without writing out a new one.
2905 *
2906 * If *ONLY* the ITIMES flag is set we can update the record in-place.
2907 */
2908 if (ip->flags & HAMMER_INODE_DELETED) {
2909 error = hammer_update_inode(&cursor, ip);
2910 } else
2911 if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
2912 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2913 error = hammer_update_itimes(&cursor, ip);
2914 } else
2915 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
2916 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2917 error = hammer_update_inode(&cursor, ip);
2918 }
2919done:
2920 if (error) {
2921 hammer_critical_error(ip->hmp, ip, error,
2922 "while syncing inode");
2923 }
2924 hammer_done_cursor(&cursor);
2925 return(error);
2926}
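
/*
 * The nlink fixup at the top of hammer_sync_inode() reduces to a pure
 * per-record delta. A hedged restatement as a standalone helper (the
 * in_flush_group parameter stands in for the flush_state/flush_group
 * test performed inline above):
 */
#if 0
static int
sketch_nlink_delta(hammer_record_t depend, int in_flush_group)
{
	if (in_flush_group) {
		/*
		 * A frontend-deleted ADD still syncs its directory entry
		 * (it converts to a delete-on-disk later), so the link
		 * the frontend already dropped must be counted back in.
		 */
		return((depend->flags & HAMMER_RECF_DELETED_FE) ? 1 : 0);
	}
	if (depend->flags & HAMMER_RECF_DELETED_FE)
		return(0);
	switch(depend->type) {
	case HAMMER_MEM_RECORD_ADD:
		return(-1);	/* entry not yet on-media, back it out */
	case HAMMER_MEM_RECORD_DEL:
		return(1);	/* deletion not yet on-media, restore it */
	default:
		return(0);
	}
}
#endif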
2927
2928/*
2929 * This routine is called when the OS is no longer actively referencing
2930 * the inode (but might still be keeping it cached), or when releasing
2931 * the last reference to an inode.
2932 *
2933 * At this point if the inode's nlinks count is zero we want to destroy
2934 * it, which may mean destroying it on-media too.
2935 */
2936void
2937hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2938{
2939 struct vnode *vp;
2940
2941 /*
2942 * Set the DELETING flag when the link count drops to 0 and the
2943 * OS no longer has any opens on the inode.
2944 *
2945 * The backend will clear DELETING (a mod flag) and set DELETED
2946 * (a state flag) when it is actually able to perform the
2947 * operation.
2948 *
2949 * Don't reflag the deletion if the flusher is currently syncing
2950 * one that was already flagged. A previously set DELETING flag
2951 * may bounce around flags and sync_flags until the operation is
2952 * completely done.
2953 */
2954 if (ip->ino_data.nlinks == 0 &&
2955 ((ip->flags | ip->sync_flags) & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2956 ip->flags |= HAMMER_INODE_DELETING;
2957 ip->flags |= HAMMER_INODE_TRUNCATED;
2958 ip->trunc_off = 0;
2959 vp = NULL;
2960 if (getvp) {
2961 if (hammer_get_vnode(ip, &vp) != 0)
2962 return;
2963 }
2964
2965 /*
2966 * Final cleanup
2967 */
2968 if (ip->vp) {
2969 vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2970 vnode_pager_setsize(ip->vp, 0);
2971 }
2972 if (getvp) {
2973 vput(vp);
2974 }
2975 }
2976}
2977
2978/*
2979 * After potentially resolving a dependency the inode is tested
2980 * to determine whether it needs to be reflushed.
2981 */
2982void
2983hammer_test_inode(hammer_inode_t ip)
2984{
2985 if (ip->flags & HAMMER_INODE_REFLUSH) {
2986 ip->flags &= ~HAMMER_INODE_REFLUSH;
2987 hammer_ref(&ip->lock);
2988 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2989 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2990 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2991 } else {
2992 hammer_flush_inode(ip, 0);
2993 }
2994 hammer_rel_inode(ip, 0);
2995 }
2996}
2997
2998/*
2999 * Clear the RECLAIM flag on an inode. This occurs when the inode is
3000 * reassociated with a vp or just before it gets freed.
3001 *
3002 * Pipeline wakeups to threads blocked due to an excessive number of
3003 * detached inodes. This typically occurs when atime updates accumulate
3004 * while scanning a directory tree.
3005 */
3006static void
3007hammer_inode_wakereclaims(hammer_inode_t ip)
3008{
3009 struct hammer_reclaim *reclaim;
3010 hammer_mount_t hmp = ip->hmp;
3011
3012 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3013 return;
3014
3015 --hammer_count_reclaiming;
3016 --hmp->inode_reclaims;
3017 ip->flags &= ~HAMMER_INODE_RECLAIM;
3018
3019 while ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3020 if (reclaim->count > 0 && --reclaim->count == 0) {
3021 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3022 wakeup(reclaim);
3023 }
3024 if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
3025 break;
3026 }
3027}
3028
3029/*
3030 * Setup our reclaim pipeline. We only let so many detached (and dirty)
3031 * inodes build up before we start blocking. This routine is called
3032 * if a new inode is created or an inode is loaded from media.
3033 *
3034 * When we block we don't care *which* inode has finished reclaiming,
3035 * as long as one does.
3036 */
3037void
3038hammer_inode_waitreclaims(hammer_mount_t hmp)
3039{
3040 struct hammer_reclaim reclaim;
3041
3042 if (hmp->inode_reclaims < hammer_limit_reclaim)
3043 return;
3044 reclaim.count = 1;
3045 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3046 tsleep(&reclaim, 0, "hmrrcm", hz);
3047 if (reclaim.count > 0)
3048 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3049}
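
/*
 * The reclaim pipeline is a counting handshake: waiters enqueue a
 * one-shot counter and sleep, and each reclaimed inode decrements the
 * counter at the head of the queue, waking that waiter when it reaches
 * zero. A standalone sketch using the queue(3) macros and tsleep()/
 * wakeup() (the struct is modeled on, but is not, struct hammer_reclaim):
 */
#if 0
struct sketch_waiter {
	TAILQ_ENTRY(sketch_waiter) entry;
	int count;
};
static TAILQ_HEAD(, sketch_waiter) sketch_waitq =
	TAILQ_HEAD_INITIALIZER(sketch_waitq);

static void
sketch_wait_for_reclaim(void)
{
	struct sketch_waiter w;

	w.count = 1;
	TAILQ_INSERT_TAIL(&sketch_waitq, &w, entry);
	tsleep(&w, 0, "hmrrcm", hz);		/* bounded one-tick wait */
	if (w.count > 0)			/* timed out, dequeue self */
		TAILQ_REMOVE(&sketch_waitq, &w, entry);
}

static void
sketch_note_reclaim(void)
{
	struct sketch_waiter *w;

	if ((w = TAILQ_FIRST(&sketch_waitq)) != NULL &&
	    w->count > 0 && --w->count == 0) {
		TAILQ_REMOVE(&sketch_waitq, w, entry);
		wakeup(w);			/* that waiter's quota met */
	}
}
#endif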
3050
3051#if 0
3052
3053/*
3054 * XXX not used, doesn't work very well due to the large batching nature
3055 * of flushes.
3056 *
3057 * A larger than normal backlog of inodes is sitting in the flusher,
3058 * enforce a general slowdown to let it catch up. This routine is only
3059 * called on completion of a non-flusher-related transaction which
3060 * performed B-Tree node I/O.
3061 *
3062 * It is possible for the flusher to stall in a continuous load.
3063 * blogbench -i1000 -o seems to do a good job generating this sort of load.
3064 * If the flusher is unable to catch up the inode count can bloat until
3065 * we run out of kvm.
3066 *
3067 * This is a bit of a hack.
3068 */
3069void
3070hammer_inode_waithard(hammer_mount_t hmp)
3071{
3072 /*
3073 * Hysteresis.
3074 */
3075 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3076 if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
3077 hmp->count_iqueued < hmp->count_inodes / 20) {
3078 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3079 return;
3080 }
3081 } else {
3082 if (hmp->inode_reclaims < hammer_limit_reclaim ||
3083 hmp->count_iqueued < hmp->count_inodes / 10) {
3084 return;
3085 }
3086 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3087 }
3088
3089 /*
3090 * Block for one flush cycle.
3091 */
3092 hammer_flusher_wait_next(hmp);
3093}
3094
3095#endif