HAMMER VFS - Fix deadlock which can occur under severe filesystem pressure
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <vm/vm_extern.h>
39
40static int hammer_unload_inode(struct hammer_inode *ip);
41static void hammer_free_inode(hammer_inode_t ip);
42static void hammer_flush_inode_core(hammer_inode_t ip,
43 hammer_flush_group_t flg, int flags);
44static int hammer_setup_child_callback(hammer_record_t rec, void *data);
45#if 0
46static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
47#endif
48static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
49 hammer_flush_group_t flg);
50static int hammer_setup_parent_inodes_helper(hammer_record_t record,
51 int depth, hammer_flush_group_t flg);
52static void hammer_inode_wakereclaims(hammer_inode_t ip);
53static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
54 pid_t pid);
55
56#ifdef DEBUG_TRUNCATE
57extern struct hammer_inode *HammerTruncIp;
58#endif
59
60/*
61 * RB-Tree support for inode structures
62 */
63int
64hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65{
66 if (ip1->obj_localization < ip2->obj_localization)
67 return(-1);
68 if (ip1->obj_localization > ip2->obj_localization)
69 return(1);
70 if (ip1->obj_id < ip2->obj_id)
71 return(-1);
72 if (ip1->obj_id > ip2->obj_id)
73 return(1);
74 if (ip1->obj_asof < ip2->obj_asof)
75 return(-1);
76 if (ip1->obj_asof > ip2->obj_asof)
77 return(1);
78 return(0);
79}
80
81int
82hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
83{
84 if (ip1->redo_fifo_start < ip2->redo_fifo_start)
85 return(-1);
86 if (ip1->redo_fifo_start > ip2->redo_fifo_start)
87 return(1);
88 return(0);
89}
90
91/*
92 * RB-Tree support for inode structures / special LOOKUP_INFO
93 */
94static int
95hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
96{
97 if (info->obj_localization < ip->obj_localization)
98 return(-1);
99 if (info->obj_localization > ip->obj_localization)
100 return(1);
101 if (info->obj_id < ip->obj_id)
102 return(-1);
103 if (info->obj_id > ip->obj_id)
104 return(1);
105 if (info->obj_asof < ip->obj_asof)
106 return(-1);
107 if (info->obj_asof > ip->obj_asof)
108 return(1);
109 return(0);
110}
111
112/*
113 * Used by hammer_scan_inode_snapshots() to locate all of an object's
114 * snapshots. Note that the asof field is not tested, which we can get
115 * away with because it is the lowest-priority field.
116 */
117static int
118hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
119{
120 hammer_inode_info_t info = data;
121
122 if (ip->obj_localization > info->obj_localization)
123 return(1);
124 if (ip->obj_localization < info->obj_localization)
125 return(-1);
126 if (ip->obj_id > info->obj_id)
127 return(1);
128 if (ip->obj_id < info->obj_id)
129 return(-1);
130 return(0);
131}
132
133/*
134 * Used by hammer_unload_pseudofs() to locate all inodes associated with
135 * a particular PFS.
136 */
137static int
138hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
139{
140 u_int32_t localization = *(u_int32_t *)data;
141 if (ip->obj_localization > localization)
142 return(1);
143 if (ip->obj_localization < localization)
144 return(-1);
145 return(0);
146}
147
148/*
149 * RB-Tree support for pseudofs structures
150 */
151static int
152hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
153{
154 if (p1->localization < p2->localization)
155 return(-1);
156 if (p1->localization > p2->localization)
157 return(1);
158 return(0);
159}
160
161
162RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
163RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
164 hammer_inode_info_cmp, hammer_inode_info_t);
165RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
166 hammer_pfs_rb_compare, u_int32_t, localization);
167
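/*
 * The inode tree is keyed on (obj_localization, obj_id, obj_asof), in
 * that priority order, so all as-of snapshots of an object sort next to
 * each other.  A cached-inode lookup is roughly the following sketch
 * (see hammer_get_inode() below for the real sequence):
 *
 *	struct hammer_inode_info iinfo;
 *
 *	iinfo.obj_id = obj_id;
 *	iinfo.obj_asof = asof;
 *	iinfo.obj_localization = localization;
 *	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
 */
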
168/*
169 * The kernel is not actively referencing this vnode but is still holding
170 * it cached.
171 *
172 * This is called from the frontend.
173 *
174 * MPALMOSTSAFE
175 */
176int
177hammer_vop_inactive(struct vop_inactive_args *ap)
178{
179 struct hammer_inode *ip = VTOI(ap->a_vp);
180 hammer_mount_t hmp;
181
182 /*
183 * Degenerate case
184 */
185 if (ip == NULL) {
186 vrecycle(ap->a_vp);
187 return(0);
188 }
189
190 /*
191 * If the inode no longer has visibility in the filesystem try to
192 * recycle it immediately, even if the inode is dirty. Recycling
193 * it quickly allows the system to reclaim buffer cache and VM
194 * resources which can matter a lot in a heavily loaded system.
195 *
196 * This can deadlock in vfsync() if we aren't careful.
197 *
198 * Do not queue the inode to the flusher if we still have visibility,
199 * otherwise namespace calls such as chmod will unnecessarily generate
200 * multiple inode updates.
201 */
202 if (ip->ino_data.nlinks == 0) {
203 hmp = ip->hmp;
204 lwkt_gettoken(&hmp->fs_token);
205 hammer_inode_unloadable_check(ip, 0);
206 if (ip->flags & HAMMER_INODE_MODMASK)
207 hammer_flush_inode(ip, 0);
208 lwkt_reltoken(&hmp->fs_token);
209 vrecycle(ap->a_vp);
210 }
211 return(0);
212}
213
214/*
215 * Release the vnode association. This is typically (but not always)
216 * the last reference on the inode.
217 *
218 * Once the association is lost we are on our own with regards to
219 * flushing the inode.
220 *
221 * We must interlock ip->vp so hammer_get_vnode() can avoid races.
222 */
223int
224hammer_vop_reclaim(struct vop_reclaim_args *ap)
225{
226 struct hammer_inode *ip;
227 hammer_mount_t hmp;
228 struct vnode *vp;
229
230 vp = ap->a_vp;
231
232 if ((ip = vp->v_data) != NULL) {
233 hmp = ip->hmp;
234 lwkt_gettoken(&hmp->fs_token);
235 hammer_lock_ex(&ip->lock);
236 vp->v_data = NULL;
237 ip->vp = NULL;
238
239 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
240 ++hammer_count_reclaiming;
241 ++hmp->inode_reclaims;
242 ip->flags |= HAMMER_INODE_RECLAIM;
243 }
244 hammer_unlock(&ip->lock);
245 hammer_rel_inode(ip, 1);
246 lwkt_reltoken(&hmp->fs_token);
247 }
248 return(0);
249}
250
251/*
252 * Return a locked vnode for the specified inode. The inode must be
253 * referenced but NOT LOCKED on entry and will remain referenced on
254 * return.
255 *
256 * Called from the frontend.
257 */
258int
259hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
260{
261 hammer_mount_t hmp;
262 struct vnode *vp;
263 int error = 0;
264 u_int8_t obj_type;
265
266 hmp = ip->hmp;
267
268 for (;;) {
269 if ((vp = ip->vp) == NULL) {
270 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
271 if (error)
272 break;
273 hammer_lock_ex(&ip->lock);
274 if (ip->vp != NULL) {
275 hammer_unlock(&ip->lock);
276 vp = *vpp;
277 vp->v_type = VBAD;
278 vx_put(vp);
279 continue;
280 }
281 hammer_ref(&ip->lock);
282 vp = *vpp;
283 ip->vp = vp;
284
285 obj_type = ip->ino_data.obj_type;
286 vp->v_type = hammer_get_vnode_type(obj_type);
287
288 hammer_inode_wakereclaims(ip);
289
290 switch(ip->ino_data.obj_type) {
291 case HAMMER_OBJTYPE_CDEV:
292 case HAMMER_OBJTYPE_BDEV:
293 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
294 addaliasu(vp, ip->ino_data.rmajor,
295 ip->ino_data.rminor);
296 break;
297 case HAMMER_OBJTYPE_FIFO:
298 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
299 break;
300 case HAMMER_OBJTYPE_REGFILE:
301 break;
302 default:
303 break;
304 }
305
306 /*
307 * Only mark as the root vnode if the ip is not
308 * historical, otherwise the VFS cache will get
309 * confused. The other half of the special handling
310 * is in hammer_vop_nlookupdotdot().
311 *
312 * Pseudo-filesystem roots can be accessed via
313 * non-root filesystem paths and setting VROOT may
314 * confuse the namecache. Set VPFSROOT instead.
315 */
316 if (ip->obj_id == HAMMER_OBJID_ROOT &&
317 ip->obj_asof == hmp->asof) {
318 if (ip->obj_localization == 0)
319 vsetflags(vp, VROOT);
320 else
321 vsetflags(vp, VPFSROOT);
322 }
323
324 vp->v_data = (void *)ip;
325 /* vnode locked by getnewvnode() */
326 /* make related vnode dirty if inode dirty? */
327 hammer_unlock(&ip->lock);
328 if (vp->v_type == VREG) {
329 vinitvmio(vp, ip->ino_data.size,
330 hammer_blocksize(ip->ino_data.size),
331 hammer_blockoff(ip->ino_data.size));
332 }
333 break;
334 }
335
336 /*
337 * Interlock vnode clearing. This does not prevent the
338 * vnode from going into a reclaimed state but it does
339 * prevent it from being destroyed or reused so the vget()
340 * will properly fail.
341 */
342 hammer_lock_ex(&ip->lock);
343 if ((vp = ip->vp) == NULL) {
344 hammer_unlock(&ip->lock);
345 continue;
346 }
347 vhold_interlocked(vp);
348 hammer_unlock(&ip->lock);
349
350 /*
351 * loop if the vget fails (aka races), or if the vp
352 * no longer matches ip->vp.
353 */
354 if (vget(vp, LK_EXCLUSIVE) == 0) {
355 if (vp == ip->vp) {
356 vdrop(vp);
357 break;
358 }
359 vput(vp);
360 }
361 vdrop(vp);
362 }
363 *vpp = vp;
364 return(error);
365}
366
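/*
 * A typical frontend consumer first obtains a referenced inode and then
 * asks for its vnode, roughly along these lines (illustrative sketch,
 * error handling elided):
 *
 *	ip = hammer_get_inode(trans, dip, obj_id, asof, localization,
 *			      flags, &error);
 *	if (ip) {
 *		error = hammer_get_vnode(ip, &vp);
 *		hammer_rel_inode(ip, 0);
 *	}
 */
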
367/*
368 * Locate all copies of the inode for obj_id compatible with the specified
369 * asof, reference, and issue the related call-back. This routine is used
370 * for direct-io invalidation and does not create any new inodes.
371 */
372void
373hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
374 int (*callback)(hammer_inode_t ip, void *data),
375 void *data)
376{
377 hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
378 hammer_inode_info_cmp_all_history,
379 callback, iinfo);
380}
381
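/*
 * The callback passed to hammer_scan_inode_snapshots() receives each
 * matching in-memory inode plus the opaque data pointer.  A minimal,
 * hypothetical callback sketch:
 *
 *	static int
 *	my_invalidate_callback(hammer_inode_t ip, void *data)
 *	{
 *		... invalidate state held for this snapshot of ip ...
 *		return(0);
 *	}
 */
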
382/*
383 * Acquire a HAMMER inode. The returned inode is not locked. These functions
384 * do not attach or detach the related vnode (use hammer_get_vnode() for
385 * that).
386 *
387 * The flags argument is only applied for newly created inodes, and only
388 * certain flags are inherited.
389 *
390 * Called from the frontend.
391 */
392struct hammer_inode *
393hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
394 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
395 int flags, int *errorp)
396{
397 hammer_mount_t hmp = trans->hmp;
398 struct hammer_node_cache *cachep;
399 struct hammer_inode_info iinfo;
400 struct hammer_cursor cursor;
401 struct hammer_inode *ip;
402
403
404 /*
405 * Determine if we already have an inode cached. If we do then
406 * we are golden.
407 *
408 * If we find an inode with no vnode we have to mark the
409 * transaction such that hammer_inode_waitreclaims() is
410 * called later on to avoid building up an infinite number
411 * of inodes. Otherwise we can continue to add new inodes
412 * faster than they can be disposed of, even with the tsleep
413 * delay.
414 *
415 * If we find a dummy inode we return a failure so dounlink
416 * (which does another lookup) doesn't try to mess with the
417 * link count. hammer_vop_nresolve() uses hammer_get_dummy_inode()
418 * to ref dummy inodes.
419 */
420 iinfo.obj_id = obj_id;
421 iinfo.obj_asof = asof;
422 iinfo.obj_localization = localization;
423loop:
424 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
425 if (ip) {
426 if (ip->flags & HAMMER_INODE_DUMMY) {
427 *errorp = ENOENT;
428 return(NULL);
429 }
430 hammer_ref(&ip->lock);
431 *errorp = 0;
432 return(ip);
433 }
434
435 /*
436 * Allocate a new inode structure and deal with races later.
437 */
438 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
439 ++hammer_count_inodes;
440 ++hmp->count_inodes;
441 ip->obj_id = obj_id;
442 ip->obj_asof = iinfo.obj_asof;
443 ip->obj_localization = localization;
444 ip->hmp = hmp;
445 ip->flags = flags & HAMMER_INODE_RO;
446 ip->cache[0].ip = ip;
447 ip->cache[1].ip = ip;
448 ip->cache[2].ip = ip;
449 ip->cache[3].ip = ip;
450 if (hmp->ronly)
451 ip->flags |= HAMMER_INODE_RO;
452 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
453 0x7FFFFFFFFFFFFFFFLL;
454 RB_INIT(&ip->rec_tree);
455 TAILQ_INIT(&ip->target_list);
456 hammer_ref(&ip->lock);
457
458 /*
459 * Locate the on-disk inode. If this is a PFS root we always
460 * access the current version of the root inode and (if it is not
461 * a master) always access information under it with a snapshot
462 * TID.
463 *
464 * We cache recent inode lookups in this directory in dip->cache[2].
465 * If we can't find it we assume the inode we are looking for is
466 * close to the directory inode.
467 */
468retry:
469 cachep = NULL;
470 if (dip) {
471 if (dip->cache[2].node)
472 cachep = &dip->cache[2];
473 else
474 cachep = &dip->cache[0];
475 }
476 hammer_init_cursor(trans, &cursor, cachep, NULL);
477 cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
478 cursor.key_beg.obj_id = ip->obj_id;
479 cursor.key_beg.key = 0;
480 cursor.key_beg.create_tid = 0;
481 cursor.key_beg.delete_tid = 0;
482 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
483 cursor.key_beg.obj_type = 0;
484
485 cursor.asof = iinfo.obj_asof;
486 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
487 HAMMER_CURSOR_ASOF;
488
489 *errorp = hammer_btree_lookup(&cursor);
490 if (*errorp == EDEADLK) {
491 hammer_done_cursor(&cursor);
492 goto retry;
493 }
494
495 /*
496 * On success the B-Tree lookup will hold the appropriate
497 * buffer cache buffers and provide a pointer to the requested
498 * information. Copy the information to the in-memory inode
499 * and cache the B-Tree node to improve future operations.
500 */
501 if (*errorp == 0) {
502 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
503 ip->ino_data = cursor.data->inode;
504
505 /*
506 * cache[0] tries to cache the location of the object inode.
507 * The assumption is that it is near the directory inode.
508 *
509 * cache[1] tries to cache the location of the object data.
510 * We might have something in the governing directory from
511 * scan optimizations (see the strategy code in
512 * hammer_vnops.c).
513 *
514 * We update dip->cache[2], if possible, with the location
515 * of the object inode for future directory shortcuts.
516 */
517 hammer_cache_node(&ip->cache[0], cursor.node);
518 if (dip) {
519 if (dip->cache[3].node) {
520 hammer_cache_node(&ip->cache[1],
521 dip->cache[3].node);
522 }
523 hammer_cache_node(&dip->cache[2], cursor.node);
524 }
525
526 /*
527 * The file should not contain any data past the file size
528 * stored in the inode. Setting save_trunc_off to the
529 * file size instead of max reduces B-Tree lookup overheads
530 * on append by allowing the flusher to avoid checking for
531 * record overwrites.
532 */
533 ip->save_trunc_off = ip->ino_data.size;
534
535 /*
536 * Locate and assign the pseudofs management structure to
537 * the inode.
538 */
539 if (dip && dip->obj_localization == ip->obj_localization) {
540 ip->pfsm = dip->pfsm;
541 hammer_ref(&ip->pfsm->lock);
542 } else {
543 ip->pfsm = hammer_load_pseudofs(trans,
544 ip->obj_localization,
545 errorp);
546 *errorp = 0; /* ignore ENOENT */
547 }
548 }
549
550 /*
551 * The inode is placed on the red-black tree and will be synced to
552 * the media when flushed or by the filesystem sync. If this races
553 * another instantiation/lookup the insertion will fail.
554 */
555 if (*errorp == 0) {
556 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
557 hammer_free_inode(ip);
558 hammer_done_cursor(&cursor);
559 goto loop;
560 }
561 ip->flags |= HAMMER_INODE_ONDISK;
562 } else {
563 if (ip->flags & HAMMER_INODE_RSV_INODES) {
564 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
565 --hmp->rsv_inodes;
566 }
567
568 hammer_free_inode(ip);
569 ip = NULL;
570 }
571 hammer_done_cursor(&cursor);
572
573 /*
574 * NEWINODE is only set if the inode becomes dirty later,
575 * setting it here just leads to unnecessary stalls.
576 *
577 * trans->flags |= HAMMER_TRANSF_NEWINODE;
578 */
579 return (ip);
580}
581
582/*
583 * Get a dummy inode to placemark a broken directory entry.
584 */
585struct hammer_inode *
586hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
587 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
588 int flags, int *errorp)
589{
590 hammer_mount_t hmp = trans->hmp;
591 struct hammer_inode_info iinfo;
592 struct hammer_inode *ip;
593
594 /*
595 * Determine if we already have an inode cached. If we do then
596 * we are golden.
597 *
598 * If we find an inode with no vnode we have to mark the
599 * transaction such that hammer_inode_waitreclaims() is
600 * called later on to avoid building up an infinite number
601 * of inodes. Otherwise we can continue to add new inodes
602 * faster than they can be disposed of, even with the tsleep
603 * delay.
604 *
605 * If we find a non-fake inode we return an error. Only fake
606 * inodes can be returned by this routine.
607 */
608 iinfo.obj_id = obj_id;
609 iinfo.obj_asof = asof;
610 iinfo.obj_localization = localization;
611loop:
612 *errorp = 0;
613 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
614 if (ip) {
615 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
616 *errorp = ENOENT;
617 return(NULL);
618 }
619 hammer_ref(&ip->lock);
620 return(ip);
621 }
622
623 /*
624 * Allocate a new inode structure and deal with races later.
625 */
626 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
627 ++hammer_count_inodes;
628 ++hmp->count_inodes;
629 ip->obj_id = obj_id;
630 ip->obj_asof = iinfo.obj_asof;
631 ip->obj_localization = localization;
632 ip->hmp = hmp;
633 ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
634 ip->cache[0].ip = ip;
635 ip->cache[1].ip = ip;
636 ip->cache[2].ip = ip;
637 ip->cache[3].ip = ip;
638 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
639 0x7FFFFFFFFFFFFFFFLL;
640 RB_INIT(&ip->rec_tree);
641 TAILQ_INIT(&ip->target_list);
642 hammer_ref(&ip->lock);
643
644 /*
645 * Populate the dummy inode. Leave everything zero'd out.
646 *
647 * (ip->ino_leaf and ip->ino_data)
648 *
649 * Make the dummy inode a FIFO object which most copy programs
650 * will properly ignore.
651 */
652 ip->save_trunc_off = ip->ino_data.size;
653 ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
654
655 /*
656 * Locate and assign the pseudofs management structure to
657 * the inode.
658 */
659 if (dip && dip->obj_localization == ip->obj_localization) {
660 ip->pfsm = dip->pfsm;
661 hammer_ref(&ip->pfsm->lock);
662 } else {
663 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
664 errorp);
665 *errorp = 0; /* ignore ENOENT */
666 }
667
668 /*
669 * The inode is placed on the red-black tree and will be synced to
670 * the media when flushed or by the filesystem sync. If this races
671 * another instantiation/lookup the insertion will fail.
672 *
673 * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake.
674 */
675 if (*errorp == 0) {
676 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
677 hammer_free_inode(ip);
678 goto loop;
679 }
680 } else {
681 if (ip->flags & HAMMER_INODE_RSV_INODES) {
682 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
683 --hmp->rsv_inodes;
684 }
685 hammer_free_inode(ip);
686 ip = NULL;
687 }
688 trans->flags |= HAMMER_TRANSF_NEWINODE;
689 return (ip);
690}
691
692/*
693 * Return a referenced inode only if it is in our inode cache.
694 *
695 * Dummy inodes do not count.
696 */
697struct hammer_inode *
698hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
699 hammer_tid_t asof, u_int32_t localization)
700{
701 hammer_mount_t hmp = trans->hmp;
702 struct hammer_inode_info iinfo;
703 struct hammer_inode *ip;
704
705 iinfo.obj_id = obj_id;
706 iinfo.obj_asof = asof;
707 iinfo.obj_localization = localization;
708
709 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
710 if (ip) {
711 if (ip->flags & HAMMER_INODE_DUMMY)
712 ip = NULL;
713 else
714 hammer_ref(&ip->lock);
715 }
716 return(ip);
717}
718
719/*
720 * Create a new filesystem object, returning the inode in *ipp. The
721 * returned inode will be referenced. The inode is created in-memory.
722 *
723 * If pfsm is non-NULL the caller wishes to create the root inode for
724 * a master PFS.
725 */
726int
727hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
728 struct ucred *cred,
729 hammer_inode_t dip, const char *name, int namelen,
730 hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
731{
732 hammer_mount_t hmp;
733 hammer_inode_t ip;
734 uid_t xuid;
735 int error;
736 int64_t namekey;
737 u_int32_t dummy;
738
739 hmp = trans->hmp;
740
741 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
742 ++hammer_count_inodes;
743 ++hmp->count_inodes;
744 trans->flags |= HAMMER_TRANSF_NEWINODE;
745
746 if (pfsm) {
747 KKASSERT(pfsm->localization != 0);
748 ip->obj_id = HAMMER_OBJID_ROOT;
749 ip->obj_localization = pfsm->localization;
750 } else {
751 KKASSERT(dip != NULL);
752 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
753 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
754 ip->obj_localization = dip->obj_localization;
755 }
756
757 KKASSERT(ip->obj_id != 0);
758 ip->obj_asof = hmp->asof;
759 ip->hmp = hmp;
760 ip->flush_state = HAMMER_FST_IDLE;
761 ip->flags = HAMMER_INODE_DDIRTY |
762 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
763 ip->cache[0].ip = ip;
764 ip->cache[1].ip = ip;
765 ip->cache[2].ip = ip;
766 ip->cache[3].ip = ip;
767
768 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
769 /* ip->save_trunc_off = 0; (already zero) */
770 RB_INIT(&ip->rec_tree);
771 TAILQ_INIT(&ip->target_list);
772
773 ip->ino_data.atime = trans->time;
774 ip->ino_data.mtime = trans->time;
775 ip->ino_data.size = 0;
776 ip->ino_data.nlinks = 0;
777
778 /*
779 * A nohistory designator on the parent directory is inherited by
780 * the child. We will do this even for pseudo-fs creation... the
781 * sysad can turn it off.
782 */
783 if (dip) {
784 ip->ino_data.uflags = dip->ino_data.uflags &
785 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
786 }
787
788 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
789 ip->ino_leaf.base.localization = ip->obj_localization +
790 HAMMER_LOCALIZE_INODE;
791 ip->ino_leaf.base.obj_id = ip->obj_id;
792 ip->ino_leaf.base.key = 0;
793 ip->ino_leaf.base.create_tid = 0;
794 ip->ino_leaf.base.delete_tid = 0;
795 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
796 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
797
798 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
799 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
800 ip->ino_data.mode = vap->va_mode;
801 ip->ino_data.ctime = trans->time;
802
803 /*
804 * If we are running version 2 or greater directory entries are
805 * inode-localized instead of data-localized.
806 */
807 if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
808 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
809 ip->ino_data.cap_flags |=
810 HAMMER_INODE_CAP_DIR_LOCAL_INO;
811 }
812 }
813 if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
814 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
815 ip->ino_data.cap_flags |=
816 HAMMER_INODE_CAP_DIRHASH_ALG1;
817 }
818 }
819
820 /*
821 * Setup the ".." pointer. This only needs to be done for directories
822 * but we do it for all objects as a recovery aid.
823 */
824 if (dip)
825 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
826#if 0
827 /*
828 * The parent_obj_localization field only applies to pseudo-fs roots.
829 * XXX this is no longer applicable, PFSs are no longer directly
830 * tied into the parent's directory structure.
831 */
832 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
833 ip->obj_id == HAMMER_OBJID_ROOT) {
834 ip->ino_data.ext.obj.parent_obj_localization =
835 dip->obj_localization;
836 }
837#endif
838
839 switch(ip->ino_leaf.base.obj_type) {
840 case HAMMER_OBJTYPE_CDEV:
841 case HAMMER_OBJTYPE_BDEV:
842 ip->ino_data.rmajor = vap->va_rmajor;
843 ip->ino_data.rminor = vap->va_rminor;
844 break;
845 default:
846 break;
847 }
848
849 /*
850 * Calculate default uid/gid and overwrite with information from
851 * the vap.
852 */
853 if (dip) {
854 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
855 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
856 xuid, cred, &vap->va_mode);
857 } else {
858 xuid = 0;
859 }
860 ip->ino_data.mode = vap->va_mode;
861
862 if (vap->va_vaflags & VA_UID_UUID_VALID)
863 ip->ino_data.uid = vap->va_uid_uuid;
864 else if (vap->va_uid != (uid_t)VNOVAL)
865 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
866 else
867 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
868
869 if (vap->va_vaflags & VA_GID_UUID_VALID)
870 ip->ino_data.gid = vap->va_gid_uuid;
871 else if (vap->va_gid != (gid_t)VNOVAL)
872 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
873 else if (dip)
874 ip->ino_data.gid = dip->ino_data.gid;
875
876 hammer_ref(&ip->lock);
877
878 if (pfsm) {
879 ip->pfsm = pfsm;
880 hammer_ref(&pfsm->lock);
881 error = 0;
882 } else if (dip->obj_localization == ip->obj_localization) {
883 ip->pfsm = dip->pfsm;
884 hammer_ref(&ip->pfsm->lock);
885 error = 0;
886 } else {
887 ip->pfsm = hammer_load_pseudofs(trans,
888 ip->obj_localization,
889 &error);
890 error = 0; /* ignore ENOENT */
891 }
892
893 if (error) {
894 hammer_free_inode(ip);
895 ip = NULL;
896 } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
897 panic("hammer_create_inode: duplicate obj_id %llx",
898 (long long)ip->obj_id);
899 /* not reached */
900 hammer_free_inode(ip);
901 }
902 *ipp = ip;
903 return(error);
904}
905
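/*
 * A caller typically does something like the following sketch (see
 * hammer_mkroot_pseudofs() below for a concrete in-tree example):
 *
 *	error = hammer_create_inode(trans, &vap, cred, dip, name, namelen,
 *				    NULL, &nip);
 *	if (error == 0) {
 *		... create the directory entry, adjust nlinks ...
 *		hammer_rel_inode(nip, 0);
 *	}
 *
 * pfsm is passed only when creating a PFS root, in which case dip and
 * name are NULL.
 */
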
906/*
907 * Final cleanup / freeing of an inode structure
908 */
909static void
910hammer_free_inode(hammer_inode_t ip)
911{
912 struct hammer_mount *hmp;
913
914 hmp = ip->hmp;
915 KKASSERT(hammer_oneref(&ip->lock));
916 hammer_uncache_node(&ip->cache[0]);
917 hammer_uncache_node(&ip->cache[1]);
918 hammer_uncache_node(&ip->cache[2]);
919 hammer_uncache_node(&ip->cache[3]);
920 hammer_inode_wakereclaims(ip);
921 if (ip->objid_cache)
922 hammer_clear_objid(ip);
923 --hammer_count_inodes;
924 --hmp->count_inodes;
925 if (ip->pfsm) {
926 hammer_rel_pseudofs(hmp, ip->pfsm);
927 ip->pfsm = NULL;
928 }
929 kfree(ip, hmp->m_inodes);
930 ip = NULL;
931}
932
933/*
934 * Retrieve pseudo-fs data. NULL will never be returned.
935 *
936 * If an error occurs *errorp will be set and a default template is returned,
937 * otherwise *errorp is set to 0. Typically when an error occurs it will
938 * be ENOENT.
939 */
940hammer_pseudofs_inmem_t
941hammer_load_pseudofs(hammer_transaction_t trans,
942 u_int32_t localization, int *errorp)
943{
944 hammer_mount_t hmp = trans->hmp;
945 hammer_inode_t ip;
946 hammer_pseudofs_inmem_t pfsm;
947 struct hammer_cursor cursor;
948 int bytes;
949
950retry:
951 pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
952 if (pfsm) {
953 hammer_ref(&pfsm->lock);
954 *errorp = 0;
955 return(pfsm);
956 }
957
958 /*
959 * PFS records are stored in the root inode (not the PFS root inode,
960 * but the real root). Avoid an infinite recursion if loading
961 * the PFS for the real root.
962 */
963 if (localization) {
964 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
965 HAMMER_MAX_TID,
966 HAMMER_DEF_LOCALIZATION, 0, errorp);
967 } else {
968 ip = NULL;
969 }
970
971 pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
972 pfsm->localization = localization;
973 pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
974 pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
975
976 hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
977 cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
978 HAMMER_LOCALIZE_MISC;
979 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
980 cursor.key_beg.create_tid = 0;
981 cursor.key_beg.delete_tid = 0;
982 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
983 cursor.key_beg.obj_type = 0;
984 cursor.key_beg.key = localization;
985 cursor.asof = HAMMER_MAX_TID;
986 cursor.flags |= HAMMER_CURSOR_ASOF;
987
988 if (ip)
989 *errorp = hammer_ip_lookup(&cursor);
990 else
991 *errorp = hammer_btree_lookup(&cursor);
992 if (*errorp == 0) {
993 *errorp = hammer_ip_resolve_data(&cursor);
994 if (*errorp == 0) {
995 if (cursor.data->pfsd.mirror_flags &
996 HAMMER_PFSD_DELETED) {
997 *errorp = ENOENT;
998 } else {
999 bytes = cursor.leaf->data_len;
1000 if (bytes > sizeof(pfsm->pfsd))
1001 bytes = sizeof(pfsm->pfsd);
1002 bcopy(cursor.data, &pfsm->pfsd, bytes);
1003 }
1004 }
1005 }
1006 hammer_done_cursor(&cursor);
1007
1008 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1009 hammer_ref(&pfsm->lock);
1010 if (ip)
1011 hammer_rel_inode(ip, 0);
1012 if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
1013 kfree(pfsm, hmp->m_misc);
1014 goto retry;
1015 }
1016 return(pfsm);
1017}
1018
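/*
 * The returned pfsm is referenced; callers drop the reference with
 * hammer_rel_pseudofs() when done, roughly:
 *
 *	pfsm = hammer_load_pseudofs(trans, localization, &error);
 *	... consult pfsm->pfsd ...
 *	hammer_rel_pseudofs(trans->hmp, pfsm);
 */
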
1019/*
1020 * Store pseudo-fs data. The backend will automatically delete any prior
1021 * on-disk pseudo-fs data but we have to delete in-memory versions.
1022 */
1023int
1024hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
1025{
1026 struct hammer_cursor cursor;
1027 hammer_record_t record;
1028 hammer_inode_t ip;
1029 int error;
1030
1031 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1032 HAMMER_DEF_LOCALIZATION, 0, &error);
1033retry:
1034 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1035 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
1036 cursor.key_beg.localization = ip->obj_localization +
1037 HAMMER_LOCALIZE_MISC;
1038 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1039 cursor.key_beg.create_tid = 0;
1040 cursor.key_beg.delete_tid = 0;
1041 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1042 cursor.key_beg.obj_type = 0;
1043 cursor.key_beg.key = pfsm->localization;
1044 cursor.asof = HAMMER_MAX_TID;
1045 cursor.flags |= HAMMER_CURSOR_ASOF;
1046
1047 /*
1048 * Replace any in-memory version of the record.
1049 */
1050 error = hammer_ip_lookup(&cursor);
1051 if (error == 0 && hammer_cursor_inmem(&cursor)) {
1052 record = cursor.iprec;
1053 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1054 KKASSERT(cursor.deadlk_rec == NULL);
1055 hammer_ref(&record->lock);
1056 cursor.deadlk_rec = record;
1057 error = EDEADLK;
1058 } else {
1059 record->flags |= HAMMER_RECF_DELETED_FE;
1060 error = 0;
1061 }
1062 }
1063
1064 /*
1065 * Allocate replacement general record. The backend flush will
1066 * delete any on-disk version of the record.
1067 */
1068 if (error == 0 || error == ENOENT) {
1069 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1070 record->type = HAMMER_MEM_RECORD_GENERAL;
1071
1072 record->leaf.base.localization = ip->obj_localization +
1073 HAMMER_LOCALIZE_MISC;
1074 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1075 record->leaf.base.key = pfsm->localization;
1076 record->leaf.data_len = sizeof(pfsm->pfsd);
1077 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1078 error = hammer_ip_add_record(trans, record);
1079 }
1080 hammer_done_cursor(&cursor);
1081 if (error == EDEADLK)
1082 goto retry;
1083 hammer_rel_inode(ip, 0);
1084 return(error);
1085}
1086
1087/*
1088 * Create a root directory for a PFS if one does not already exist.
1089 *
1090 * The PFS root stands alone so we must also bump the nlinks count
1091 * to prevent it from being destroyed on release.
1092 */
1093int
1094hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1095 hammer_pseudofs_inmem_t pfsm)
1096{
1097 hammer_inode_t ip;
1098 struct vattr vap;
1099 int error;
1100
1101 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1102 pfsm->localization, 0, &error);
1103 if (ip == NULL) {
1104 vattr_null(&vap);
1105 vap.va_mode = 0755;
1106 vap.va_type = VDIR;
1107 error = hammer_create_inode(trans, &vap, cred,
1108 NULL, NULL, 0,
1109 pfsm, &ip);
1110 if (error == 0) {
1111 ++ip->ino_data.nlinks;
1112 hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
1113 }
1114 }
1115 if (ip)
1116 hammer_rel_inode(ip, 0);
1117 return(error);
1118}
1119
1120/*
1121 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1122 * if we are unable to disassociate all the inodes.
1123 */
1124static
1125int
1126hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1127{
1128 int res;
1129
1130 hammer_ref(&ip->lock);
1131 if (hammer_isactive(&ip->lock) == 2 && ip->vp)
1132 vclean_unlocked(ip->vp);
1133 if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
1134 res = 0;
1135 else
1136 res = -1; /* stop, someone is using the inode */
1137 hammer_rel_inode(ip, 0);
1138 return(res);
1139}
1140
1141int
1142hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1143{
1144 int res;
1145 int try;
1146
1147 for (try = res = 0; try < 4; ++try) {
1148 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1149 hammer_inode_pfs_cmp,
1150 hammer_unload_pseudofs_callback,
1151 &localization);
1152 if (res == 0 && try > 1)
1153 break;
1154 hammer_flusher_sync(trans->hmp);
1155 }
1156 if (res != 0)
1157 res = ENOTEMPTY;
1158 return(res);
1159}
1160
1161
1162/*
1163 * Release a reference on a PFS
1164 */
1165void
1166hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1167{
1168 hammer_rel(&pfsm->lock);
1169 if (hammer_norefs(&pfsm->lock)) {
1170 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1171 kfree(pfsm, hmp->m_misc);
1172 }
1173}
1174
1175/*
1176 * Called by hammer_sync_inode().
1177 */
1178static int
1179hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1180{
1181 hammer_transaction_t trans = cursor->trans;
1182 hammer_record_t record;
1183 int error;
1184 int redirty;
1185
1186retry:
1187 error = 0;
1188
1189 /*
1190 * If the inode has a presence on-disk then locate it and mark
1191 * it deleted, setting DELONDISK.
1192 *
1193 * The record may or may not be physically deleted, depending on
1194 * the retention policy.
1195 */
1196 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1197 HAMMER_INODE_ONDISK) {
1198 hammer_normalize_cursor(cursor);
1199 cursor->key_beg.localization = ip->obj_localization +
1200 HAMMER_LOCALIZE_INODE;
1201 cursor->key_beg.obj_id = ip->obj_id;
1202 cursor->key_beg.key = 0;
1203 cursor->key_beg.create_tid = 0;
1204 cursor->key_beg.delete_tid = 0;
1205 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1206 cursor->key_beg.obj_type = 0;
1207 cursor->asof = ip->obj_asof;
1208 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1209 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1210 cursor->flags |= HAMMER_CURSOR_BACKEND;
1211
1212 error = hammer_btree_lookup(cursor);
1213 if (hammer_debug_inode)
1214 kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1215
1216 if (error == 0) {
1217 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1218 if (hammer_debug_inode)
1219 kprintf(" error %d\n", error);
1220 if (error == 0) {
1221 ip->flags |= HAMMER_INODE_DELONDISK;
1222 }
1223 if (cursor->node)
1224 hammer_cache_node(&ip->cache[0], cursor->node);
1225 }
1226 if (error == EDEADLK) {
1227 hammer_done_cursor(cursor);
1228 error = hammer_init_cursor(trans, cursor,
1229 &ip->cache[0], ip);
1230 if (hammer_debug_inode)
1231 kprintf("IPDED %p %d\n", ip, error);
1232 if (error == 0)
1233 goto retry;
1234 }
1235 }
1236
1237 /*
1238 * Ok, write out the initial record or a new record (after deleting
1239 * the old one), unless the DELETED flag is set. This routine will
1240 * clear DELONDISK if it writes out a record.
1241 *
1242 * Update our inode statistics if this is the first application of
1243 * the inode on-disk.
1244 */
1245 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1246 /*
1247 * Generate a record and write it to the media. We clean-up
1248 * the state before releasing so we do not have to set-up
1249 * a flush_group.
1250 */
1251 record = hammer_alloc_mem_record(ip, 0);
1252 record->type = HAMMER_MEM_RECORD_INODE;
1253 record->flush_state = HAMMER_FST_FLUSH;
1254 record->leaf = ip->sync_ino_leaf;
1255 record->leaf.base.create_tid = trans->tid;
1256 record->leaf.data_len = sizeof(ip->sync_ino_data);
1257 record->leaf.create_ts = trans->time32;
1258 record->data = (void *)&ip->sync_ino_data;
1259 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1260
1261 /*
1262 * If this flag is set we cannot sync the new file size
1263 * because we haven't finished related truncations. The
1264 * inode will be flushed in another flush group to finish
1265 * the job.
1266 */
1267 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1268 ip->sync_ino_data.size != ip->ino_data.size) {
1269 redirty = 1;
1270 ip->sync_ino_data.size = ip->ino_data.size;
1271 } else {
1272 redirty = 0;
1273 }
1274
1275 for (;;) {
1276 error = hammer_ip_sync_record_cursor(cursor, record);
1277 if (hammer_debug_inode)
1278 kprintf("GENREC %p rec %08x %d\n",
1279 ip, record->flags, error);
1280 if (error != EDEADLK)
1281 break;
1282 hammer_done_cursor(cursor);
1283 error = hammer_init_cursor(trans, cursor,
1284 &ip->cache[0], ip);
1285 if (hammer_debug_inode)
1286 kprintf("GENREC reinit %d\n", error);
1287 if (error)
1288 break;
1289 }
1290
1291 /*
1292 * Note: The record was never on the inode's record tree
1293 * so just wave our hands importantly and destroy it.
1294 */
1295 record->flags |= HAMMER_RECF_COMMITTED;
1296 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1297 record->flush_state = HAMMER_FST_IDLE;
1298 ++ip->rec_generation;
1299 hammer_rel_mem_record(record);
1300
1301 /*
1302 * Finish up.
1303 */
1304 if (error == 0) {
1305 if (hammer_debug_inode)
1306 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1307 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1308 HAMMER_INODE_SDIRTY |
1309 HAMMER_INODE_ATIME |
1310 HAMMER_INODE_MTIME);
1311 ip->flags &= ~HAMMER_INODE_DELONDISK;
1312 if (redirty)
1313 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1314
1315 /*
1316 * Root volume count of inodes
1317 */
1318 hammer_sync_lock_sh(trans);
1319 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1320 hammer_modify_volume_field(trans,
1321 trans->rootvol,
1322 vol0_stat_inodes);
1323 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1324 hammer_modify_volume_done(trans->rootvol);
1325 ip->flags |= HAMMER_INODE_ONDISK;
1326 if (hammer_debug_inode)
1327 kprintf("NOWONDISK %p\n", ip);
1328 }
1329 hammer_sync_unlock(trans);
1330 }
1331 }
1332
1333 /*
1334 * If the inode has been destroyed, clean out any left-over flags
1335 * that may have been set by the frontend.
1336 */
1337 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
1338 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1339 HAMMER_INODE_SDIRTY |
1340 HAMMER_INODE_ATIME |
1341 HAMMER_INODE_MTIME);
1342 }
1343 return(error);
1344}
1345
1346/*
1347 * Update only the itimes fields.
1348 *
1349 * ATIME can be updated without generating any UNDO. MTIME is updated
1350 * with UNDO so it is guaranteed to be synchronized properly in case of
1351 * a crash.
1352 *
1353 * Neither field is included in the B-Tree leaf element's CRC, which is how
1354 * we can get away with updating ATIME the way we do.
1355 */
1356static int
1357hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1358{
1359 hammer_transaction_t trans = cursor->trans;
1360 int error;
1361
1362retry:
1363 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1364 HAMMER_INODE_ONDISK) {
1365 return(0);
1366 }
1367
1368 hammer_normalize_cursor(cursor);
1369 cursor->key_beg.localization = ip->obj_localization +
1370 HAMMER_LOCALIZE_INODE;
1371 cursor->key_beg.obj_id = ip->obj_id;
1372 cursor->key_beg.key = 0;
1373 cursor->key_beg.create_tid = 0;
1374 cursor->key_beg.delete_tid = 0;
1375 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1376 cursor->key_beg.obj_type = 0;
1377 cursor->asof = ip->obj_asof;
1378 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1379 cursor->flags |= HAMMER_CURSOR_ASOF;
1380 cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1381 cursor->flags |= HAMMER_CURSOR_GET_DATA;
1382 cursor->flags |= HAMMER_CURSOR_BACKEND;
1383
1384 error = hammer_btree_lookup(cursor);
1385 if (error == 0) {
1386 hammer_cache_node(&ip->cache[0], cursor->node);
1387 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1388 /*
1389 * Updating MTIME requires an UNDO. Just cover
1390 * both atime and mtime.
1391 */
1392 hammer_sync_lock_sh(trans);
1393 hammer_modify_buffer(trans, cursor->data_buffer,
1394 HAMMER_ITIMES_BASE(&cursor->data->inode),
1395 HAMMER_ITIMES_BYTES);
1396 cursor->data->inode.atime = ip->sync_ino_data.atime;
1397 cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1398 hammer_modify_buffer_done(cursor->data_buffer);
1399 hammer_sync_unlock(trans);
1400 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1401 /*
1402 * Updating atime only can be done in-place with
1403 * no UNDO.
1404 */
1405 hammer_sync_lock_sh(trans);
1406 hammer_modify_buffer(trans, cursor->data_buffer,
1407 NULL, 0);
1408 cursor->data->inode.atime = ip->sync_ino_data.atime;
1409 hammer_modify_buffer_done(cursor->data_buffer);
1410 hammer_sync_unlock(trans);
1411 }
1412 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1413 }
1414 if (error == EDEADLK) {
1415 hammer_done_cursor(cursor);
1416 error = hammer_init_cursor(trans, cursor,
1417 &ip->cache[0], ip);
1418 if (error == 0)
1419 goto retry;
1420 }
1421 return(error);
1422}
1423
1424/*
1425 * Release a reference on an inode, flush as requested.
1426 *
1427 * On the last reference we queue the inode to the flusher for its final
1428 * disposition.
1429 */
1430void
1431hammer_rel_inode(struct hammer_inode *ip, int flush)
1432{
1433 /*hammer_mount_t hmp = ip->hmp;*/
1434
1435 /*
1436 * Handle disposition when dropping the last ref.
1437 */
1438 for (;;) {
1439 if (hammer_oneref(&ip->lock)) {
1440 /*
1441 * Determine whether on-disk action is needed for
1442 * the inode's final disposition.
1443 */
1444 KKASSERT(ip->vp == NULL);
1445 hammer_inode_unloadable_check(ip, 0);
1446 if (ip->flags & HAMMER_INODE_MODMASK) {
1447 hammer_flush_inode(ip, 0);
1448 } else if (hammer_oneref(&ip->lock)) {
1449 hammer_unload_inode(ip);
1450 break;
1451 }
1452 } else {
1453 if (flush)
1454 hammer_flush_inode(ip, 0);
1455
1456 /*
1457 * The inode still has multiple refs, try to drop
1458 * one ref.
1459 */
1460 KKASSERT(hammer_isactive(&ip->lock) >= 1);
1461 if (hammer_isactive(&ip->lock) > 1) {
1462 hammer_rel(&ip->lock);
1463 break;
1464 }
1465 }
1466 }
1467}
1468
1469/*
1470 * Unload and destroy the specified inode. Must be called with one remaining
1471 * reference. The reference is disposed of.
1472 *
1473 * The inode must be completely clean.
1474 */
1475static int
1476hammer_unload_inode(struct hammer_inode *ip)
1477{
1478 hammer_mount_t hmp = ip->hmp;
1479
1480 KASSERT(hammer_oneref(&ip->lock),
1481 ("hammer_unload_inode: %d refs\n", hammer_isactive(&ip->lock)));
1482 KKASSERT(ip->vp == NULL);
1483 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1484 KKASSERT(ip->cursor_ip_refs == 0);
1485 KKASSERT(hammer_notlocked(&ip->lock));
1486 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1487
1488 KKASSERT(RB_EMPTY(&ip->rec_tree));
1489 KKASSERT(TAILQ_EMPTY(&ip->target_list));
1490
1491 if (ip->flags & HAMMER_INODE_RDIRTY) {
1492 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1493 ip->flags &= ~HAMMER_INODE_RDIRTY;
1494 }
1495 RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1496
1497 hammer_free_inode(ip);
1498 return(0);
1499}
1500
1501/*
1502 * Called during unmounting if a critical error occurred. The in-memory
1503 * inode and all related structures are destroyed.
1504 *
1505 * If a critical error did not occur the unmount code calls the standard
1506 * release and asserts that the inode is gone.
1507 */
1508int
1509hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1510{
1511 hammer_record_t rec;
1512
1513 /*
1514 * Get rid of the inode's in-memory records, regardless of their
1515 * state, and clear the mod-mask.
1516 */
1517 while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1518 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1519 rec->target_ip = NULL;
1520 if (rec->flush_state == HAMMER_FST_SETUP)
1521 rec->flush_state = HAMMER_FST_IDLE;
1522 }
1523 while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1524 if (rec->flush_state == HAMMER_FST_FLUSH)
1525 --rec->flush_group->refs;
1526 else
1527 hammer_ref(&rec->lock);
1528 KKASSERT(hammer_oneref(&rec->lock));
1529 rec->flush_state = HAMMER_FST_IDLE;
1530 rec->flush_group = NULL;
1531 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1532 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1533 ++ip->rec_generation;
1534 hammer_rel_mem_record(rec);
1535 }
1536 ip->flags &= ~HAMMER_INODE_MODMASK;
1537 ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1538 KKASSERT(ip->vp == NULL);
1539
1540 /*
1541 * Remove the inode from any flush group, force it idle. FLUSH
1542 * and SETUP states have an inode ref.
1543 */
1544 switch(ip->flush_state) {
1545 case HAMMER_FST_FLUSH:
1546 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1547 --ip->flush_group->refs;
1548 ip->flush_group = NULL;
1549 /* fall through */
1550 case HAMMER_FST_SETUP:
1551 hammer_rel(&ip->lock);
1552 ip->flush_state = HAMMER_FST_IDLE;
1553 /* fall through */
1554 case HAMMER_FST_IDLE:
1555 break;
1556 }
1557
1558 /*
1559 * There shouldn't be any associated vnode. The unload needs at
1560 * least one ref; if we do have a vp, steal its ip ref.
1561 */
1562 if (ip->vp) {
1563 kprintf("hammer_destroy_inode_callback: Unexpected "
1564 "vnode association ip %p vp %p\n", ip, ip->vp);
1565 ip->vp->v_data = NULL;
1566 ip->vp = NULL;
1567 } else {
1568 hammer_ref(&ip->lock);
1569 }
1570 hammer_unload_inode(ip);
1571 return(0);
1572}
1573
1574/*
1575 * Called on mount -u when switching from RW to RO or vice-versa. Adjust
1576 * the read-only flag for cached inodes.
1577 *
1578 * This routine is called from a RB_SCAN().
1579 */
1580int
1581hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1582{
1583 hammer_mount_t hmp = ip->hmp;
1584
1585 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1586 ip->flags |= HAMMER_INODE_RO;
1587 else
1588 ip->flags &= ~HAMMER_INODE_RO;
1589 return(0);
1590}
1591
1592/*
1593 * A transaction has modified an inode, requiring updates as specified by
1594 * the passed flags.
1595 *
1596 * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
1597 * and not including size changes due to write-append
1598 * (but other size changes are included).
1599 * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1600 * write-append.
1601 * HAMMER_INODE_XDIRTY: Dirty in-memory records
1602 * HAMMER_INODE_BUFS: Dirty buffer cache buffers
1603 * HAMMER_INODE_DELETED: Inode record/data must be deleted
1604 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1605 */
1606void
1607hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1608{
1609 /*
1610 * ronly of 0 or 2 does not trigger assertion.
1611 * 2 is a special error state
1612 */
1613 KKASSERT(ip->hmp->ronly != 1 ||
1614 (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1615 HAMMER_INODE_SDIRTY |
1616 HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1617 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1618 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1619 ip->flags |= HAMMER_INODE_RSV_INODES;
1620 ++ip->hmp->rsv_inodes;
1621 }
1622
1623 /*
1624 * Set the NEWINODE flag in the transaction if the inode
1625 * transitions to a dirty state. This is used to track
1626 * the load on the inode cache.
1627 */
1628 if (trans &&
1629 (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1630 (flags & HAMMER_INODE_MODMASK)) {
1631 trans->flags |= HAMMER_TRANSF_NEWINODE;
1632 }
1633
1634 ip->flags |= flags;
1635}
1636
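/*
 * A typical modification path dirties the in-memory fields first and
 * then records the fact, e.g. (as in hammer_mkroot_pseudofs() above):
 *
 *	++ip->ino_data.nlinks;
 *	hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
 */
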
1637/*
1638 * Request that an inode be flushed. This whole mess cannot block and may
1639 * recurse (if not synchronous). Once requested HAMMER will attempt to
1640 * actively flush the inode until the flush can be done.
1641 *
1642 * The inode may already be flushing, or may be in a setup state. We can
1643 * place the inode in a flushing state if it is currently idle and flag it
1644 * to reflush if it is currently flushing.
1645 *
1646 * Upon return if the inode could not be flushed due to a setup
1647 * dependency, then it will be automatically flushed when the dependency
1648 * is satisfied.
1649 */
1650void
1651hammer_flush_inode(hammer_inode_t ip, int flags)
1652{
1653 hammer_mount_t hmp;
1654 hammer_flush_group_t flg;
1655 int good;
1656
1657 /*
1658 * fill_flush_group is the first flush group we may be able to
1659 * continue filling, it may be open or closed but it will always
1660 * be past the currently flushing (running) flg.
1661 *
1662 * next_flush_group is the next open flush group.
1663 */
1664 hmp = ip->hmp;
1665 while ((flg = hmp->fill_flush_group) != NULL) {
1666 KKASSERT(flg->running == 0);
1667 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1668 flg->total_count <= hammer_autoflush) {
1669 break;
1670 }
1671 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1672 hammer_flusher_async(ip->hmp, flg);
1673 }
1674 if (flg == NULL) {
1675 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1676 flg->seq = hmp->flusher.next++;
1677 if (hmp->next_flush_group == NULL)
1678 hmp->next_flush_group = flg;
1679 if (hmp->fill_flush_group == NULL)
1680 hmp->fill_flush_group = flg;
1681 RB_INIT(&flg->flush_tree);
1682 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1683 }
1684
1685 /*
1686 * Trivial 'nothing to flush' case. If the inode is in a SETUP
1687 * state we have to put it back into an IDLE state so we can
1688 * drop the extra ref.
1689 *
1690 * If we have a parent dependency we must still fall through
1691 * so we can run it.
1692 */
1693 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1694 if (ip->flush_state == HAMMER_FST_SETUP &&
1695 TAILQ_EMPTY(&ip->target_list)) {
1696 ip->flush_state = HAMMER_FST_IDLE;
1697 hammer_rel_inode(ip, 0);
1698 }
1699 if (ip->flush_state == HAMMER_FST_IDLE)
1700 return;
1701 }
1702
1703 /*
1704 * Our flush action will depend on the current state.
1705 */
1706 switch(ip->flush_state) {
1707 case HAMMER_FST_IDLE:
1708 /*
1709 * We have no dependencies and can flush immediately. Some of
1710 * our children may not be flushable so we have to re-test
1711 * with that additional knowledge.
1712 */
1713 hammer_flush_inode_core(ip, flg, flags);
1714 break;
1715 case HAMMER_FST_SETUP:
1716 /*
1717 * Recurse upwards through dependencies via target_list
1718 * and start their flusher actions going if possible.
1719 *
1720 * 'good' is our connectivity. -1 means we have none and
1721 * can't flush, 0 means there weren't any dependencies, and
1722 * 1 means we have good connectivity.
1723 */
1724 good = hammer_setup_parent_inodes(ip, 0, flg);
1725
1726 if (good >= 0) {
1727 /*
1728 * We can continue if good >= 0. Determine how
1729 * many records under our inode can be flushed (and
1730 * mark them).
1731 */
1732 hammer_flush_inode_core(ip, flg, flags);
1733 } else {
1734 /*
1735 * Parent has no connectivity, tell it to flush
1736 * us as soon as it does.
1737 *
1738 * The REFLUSH flag is also needed to trigger
1739 * dependency wakeups.
1740 */
1741 ip->flags |= HAMMER_INODE_CONN_DOWN |
1742 HAMMER_INODE_REFLUSH;
1743 if (flags & HAMMER_FLUSH_SIGNAL) {
1744 ip->flags |= HAMMER_INODE_RESIGNAL;
1745 hammer_flusher_async(ip->hmp, flg);
1746 }
1747 }
1748 break;
1749 case HAMMER_FST_FLUSH:
1750 /*
1751 * We are already flushing, flag the inode to reflush
1752 * if needed after it completes its current flush.
1753 *
1754 * The REFLUSH flag is also needed to trigger
1755 * dependency wakeups.
1756 */
1757 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1758 ip->flags |= HAMMER_INODE_REFLUSH;
1759 if (flags & HAMMER_FLUSH_SIGNAL) {
1760 ip->flags |= HAMMER_INODE_RESIGNAL;
1761 hammer_flusher_async(ip->hmp, flg);
1762 }
1763 break;
1764 }
1765}
1766
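/*
 * Callers which want the flusher kicked immediately (rather than just
 * queueing the inode to a flush group) pass HAMMER_FLUSH_SIGNAL, e.g.:
 *
 *	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 *
 * See the HAMMER_FLUSH_SIGNAL handling above, which signals the flusher
 * asynchronously via hammer_flusher_async().
 */
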
1767/*
1768 * Scan ip->target_list, which is a list of records owned by our PARENT
1769 * inodes which reference our ip.
1770 *
1771 * XXX This is a huge mess of recursive code, but not one bit of it blocks
1772 * so for now do not ref/deref the structures. Note that if we use the
1773 * ref/rel code later, the rel CAN block.
1774 */
1775static int
1776hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1777 hammer_flush_group_t flg)
1778{
1779 hammer_record_t depend;
1780 int good;
1781 int r;
1782
1783 /*
1784 * If we hit our recursion limit and we have parent dependencies
1785 * we cannot continue. Returning < 0 will cause us to be flagged
1786 * for reflush. Returning -2 cuts off additional dependency checks
1787 * because they are likely to also hit the depth limit.
1788 *
1789 * We cannot return < 0 if there are no dependencies or there might
1790 * not be anything to wakeup (ip).
1791 */
1792 if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1793 kprintf("HAMMER Warning: depth limit reached on "
1794 "setup recursion, inode %p %016llx\n",
1795 ip, (long long)ip->obj_id);
1796 return(-2);
1797 }
1798
1799 /*
1800 * Scan dependencies
1801 */
1802 good = 0;
1803 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1804 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1805 KKASSERT(depend->target_ip == ip);
1806 if (r < 0 && good == 0)
1807 good = -1;
1808 if (r > 0)
1809 good = 1;
1810
1811 /*
1812 * If we failed due to the recursion depth limit then stop
1813 * now.
1814 */
1815 if (r == -2)
1816 break;
1817 }
1818 return(good);
1819}
1820
1821/*
1822 * This helper function takes a record representing the dependency between
1823 * the parent inode and child inode.
1824 *
1825 * record->ip = parent inode
1826 * record->target_ip = child inode
1827 *
1828 * We are asked to recurse upwards and convert the record from SETUP
1829 * to FLUSH if possible.
1830 *
1831 * Return 1 if the record gives us connectivity
1832 *
1833 * Return 0 if the record is not relevant
1834 *
1835 * Return -1 if we can't resolve the dependency and there is no connectivity.
1836 */
1837static int
1838hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1839 hammer_flush_group_t flg)
1840{
1841 hammer_mount_t hmp;
1842 hammer_inode_t pip;
1843 int good;
1844
1845 KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1846 pip = record->ip;
1847 hmp = pip->hmp;
1848
1849 /*
1850 * If the record is already flushing, is it in our flush group?
1851 *
1852 * If it is in our flush group but it is a general record or a
1853 * delete-on-disk, it does not improve our connectivity (return 0),
1854 * and if the target inode is not trying to destroy itself we can't
1855 * allow the operation yet anyway (the second return -1).
1856 */
1857 if (record->flush_state == HAMMER_FST_FLUSH) {
1858 /*
1859 * If not in our flush group ask the parent to reflush
1860 * us as soon as possible.
1861 */
1862 if (record->flush_group != flg) {
1863 pip->flags |= HAMMER_INODE_REFLUSH;
1864 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1865 return(-1);
1866 }
1867
1868 /*
1869 * If in our flush group everything is already set up,
1870 * just return whether the record will improve our
1871 * visibility or not.
1872 */
1873 if (record->type == HAMMER_MEM_RECORD_ADD)
1874 return(1);
1875 return(0);
1876 }
1877
1878 /*
1879 * It must be a setup record. Try to resolve the setup dependencies
1880 * by recursing upwards so we can place ip on the flush list.
1881 *
1882 * Limit ourselves to 20 levels of recursion to avoid blowing out
1883 * the kernel stack. If we hit the recursion limit we can't flush
1884 * until the parent flushes. The parent will flush independently
1885 * on its own and ultimately a deep recursion will be resolved.
1886 */
1887 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1888
1889 good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1890
1891 /*
1892 * If good < 0 the parent has no connectivity and we cannot safely
1893 * flush the directory entry, which also means we can't flush our
1894 * ip. Flag us for downward recursion once the parent's
1895 * connectivity is resolved. Flag the parent for [re]flush or it
1896 * may not check for downward recursions.
1897 */
1898 if (good < 0) {
1899 pip->flags |= HAMMER_INODE_REFLUSH;
1900 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1901 return(good);
1902 }
1903
1904 /*
1905 * We are go, place the parent inode in a flushing state so we can
1906 * place its record in a flushing state. Note that the parent
1907 * may already be flushing. The record must be in the same flush
1908 * group as the parent.
1909 */
1910 if (pip->flush_state != HAMMER_FST_FLUSH)
1911 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1912 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1913
1914 /*
1915 * It is possible for a rename to create a loop in the recursion
1916 * and revisit a record. This will result in the record being
1917 * placed in a flush state unexpectedly. This check deals with
1918 * the case.
1919 */
1920 if (record->flush_state == HAMMER_FST_FLUSH) {
1921 if (record->type == HAMMER_MEM_RECORD_ADD)
1922 return(1);
1923 return(0);
1924 }
1925
1926 KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1927
1928#if 0
1929 if (record->type == HAMMER_MEM_RECORD_DEL &&
1930 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1931 /*
1932 * Regardless of flushing state we cannot sync this path if the
1933 * record represents a delete-on-disk but the target inode
1934 * is not ready to sync its own deletion.
1935 *
1936 * XXX need to count effective nlinks to determine whether
1937 * the flush is ok, otherwise removing a hardlink will
1938 * just leave the DEL record to rot.
1939 */
1940 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1941 return(-1);
1942 } else
1943#endif
1944 if (pip->flush_group == flg) {
1945 /*
1946 * Because we have not calculated nlinks yet we can just
1947 * set records to the flush state if the parent is in
1948 * the same flush group as we are.
1949 */
1950 record->flush_state = HAMMER_FST_FLUSH;
1951 record->flush_group = flg;
1952 ++record->flush_group->refs;
1953 hammer_ref(&record->lock);
1954
1955 /*
1956 * A general directory-add contributes to our visibility.
1957 *
1958 * Otherwise it is probably a directory-delete or
1959 * delete-on-disk record and does not contribute to our
1960 * visibility (but we can still flush it).
1961 */
1962 if (record->type == HAMMER_MEM_RECORD_ADD)
1963 return(1);
1964 return(0);
1965 } else {
1966 /*
1967 * If the parent is not in our flush group we cannot
1968 * flush this record yet, there is no visibility.
1969 * We tell the parent to reflush and mark ourselves
1970 * so the parent knows it should flush us too.
1971 */
1972 pip->flags |= HAMMER_INODE_REFLUSH;
1973 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1974 return(-1);
1975 }
1976}
1977
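/*
 * Summary of the flush states used by the routines above and below
 * (an illustrative overview; see the individual routines for details):
 *
 *	HAMMER_FST_IDLE  - no flush-related dependencies.
 *	HAMMER_FST_SETUP - dependencies exist (e.g. directory-entry
 *			   records with a target_ip) but the element has
 *			   not been attached to a flush group.
 *	HAMMER_FST_FLUSH - attached to a flush group and queued, or
 *			   about to be queued, to the backend flusher.
 *
 * hammer_flush_inode_done() moves a flushed inode back to SETUP if
 * dependencies or in-memory records remain, or to IDLE otherwise.
 */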
1978/*
1979 * This is the core routine placing an inode into the FST_FLUSH state.
1980 */
1981static void
1982hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1983{
1984 hammer_mount_t hmp = ip->hmp;
1985 int go_count;
1986
1987 /*
1988 * Set flush state and prevent the flusher from cycling into
1989 * the next flush group. Do not place the ip on the list yet.
1990 * Inodes not in the idle state get an extra reference.
1991 */
1992 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1993 if (ip->flush_state == HAMMER_FST_IDLE)
1994 hammer_ref(&ip->lock);
1995 ip->flush_state = HAMMER_FST_FLUSH;
1996 ip->flush_group = flg;
1997 ++hmp->flusher.group_lock;
1998 ++hmp->count_iqueued;
1999 ++hammer_count_iqueued;
2000 ++flg->total_count;
2001 hammer_redo_fifo_start_flush(ip);
2002
2003#if 0
2004 /*
2005 * We need to be able to vfsync/truncate from the backend.
2006 *
2007 * XXX Any truncation from the backend will acquire the vnode
2008 * independently.
2009 */
2010 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2011 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2012 ip->flags |= HAMMER_INODE_VHELD;
2013 vref(ip->vp);
2014 }
2015#endif
2016
2017 /*
2018 * Figure out how many in-memory records we can actually flush
2019 * (not including inode meta-data, buffers, etc).
2020 */
2021 KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2022 if (flags & HAMMER_FLUSH_RECURSION) {
2023 /*
2024 * If this is an upwards recursion we do not want to
2025 * recurse down again!
2026 */
2027 go_count = 1;
2028#if 0
2029 } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2030 /*
2031 * No new records are added if we must complete a flush
2032 * from a previous cycle, but we do have to move the records
2033 * from the previous cycle to the current one.
2034 */
2035#if 0
2036 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2037 hammer_syncgrp_child_callback, NULL);
2038#endif
2039 go_count = 1;
2040#endif
2041 } else {
2042 /*
2043 * Normal flush, scan records and bring them into the flush.
2044 * Directory adds and deletes are usually skipped (they are
2045 * grouped with the related inode rather than with the
2046 * directory).
2047 *
2048 * go_count can be negative, which means the scan aborted
2049 * due to the flush group being over-full and we should
2050 * flush what we have.
2051 */
2052 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2053 hammer_setup_child_callback, NULL);
2054 }
2055
2056 /*
2057 * This is a more involved test that includes go_count. If we
2058 * can't flush, flag the inode and return. If go_count is 0 we
2059 * are unable to flush any records in our rec_tree and
2060 * must ignore the XDIRTY flag.
2061 */
2062 if (go_count == 0) {
2063 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2064 --hmp->count_iqueued;
2065 --hammer_count_iqueued;
2066
2067 --flg->total_count;
2068 ip->flush_state = HAMMER_FST_SETUP;
2069 ip->flush_group = NULL;
2070 if (flags & HAMMER_FLUSH_SIGNAL) {
2071 ip->flags |= HAMMER_INODE_REFLUSH |
2072 HAMMER_INODE_RESIGNAL;
2073 } else {
2074 ip->flags |= HAMMER_INODE_REFLUSH;
2075 }
2076#if 0
2077 if (ip->flags & HAMMER_INODE_VHELD) {
2078 ip->flags &= ~HAMMER_INODE_VHELD;
2079 vrele(ip->vp);
2080 }
2081#endif
2082
2083 /*
2084 * REFLUSH is needed to trigger dependency wakeups
2085 * when an inode is in SETUP.
2086 */
2087 ip->flags |= HAMMER_INODE_REFLUSH;
2088 if (--hmp->flusher.group_lock == 0)
2089 wakeup(&hmp->flusher.group_lock);
2090 return;
2091 }
2092 }
2093
2094 /*
2095 * Snapshot the state of the inode for the backend flusher.
2096 *
2097 * We continue to retain save_trunc_off even when all truncations
2098 * have been resolved as an optimization to determine if we can
2099 * skip the B-Tree lookup for overwrite deletions.
2100 *
2101 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2102 * and stays in ip->flags. Once set, it stays set until the
2103 * inode is destroyed.
2104 */
2105 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2106 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2107 ip->sync_trunc_off = ip->trunc_off;
2108 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2109 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2110 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2111
2112 /*
2113 * The save_trunc_off used to cache whether the B-Tree
2114 * holds any records past that point is not used until
2115 * after the truncation has succeeded, so we can safely
2116 * set it now.
2117 */
2118 if (ip->save_trunc_off > ip->sync_trunc_off)
2119 ip->save_trunc_off = ip->sync_trunc_off;
2120 }
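	/*
	 * Note that a truncation initiated by the frontend after this
	 * snapshot simply sets HAMMER_INODE_TRUNCATED and ip->trunc_off
	 * again and is picked up by a later flush; the backend works
	 * only from the sync_trunc_off snapshot taken here.
	 */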
2121 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2122 ~HAMMER_INODE_TRUNCATED);
2123 ip->sync_ino_leaf = ip->ino_leaf;
2124 ip->sync_ino_data = ip->ino_data;
2125 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2126#ifdef DEBUG_TRUNCATE
2127 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2128 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2129#endif
2130
2131 /*
2132 * The flusher list inherits our inode and reference.
2133 */
2134 KKASSERT(flg->running == 0);
2135 RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2136 if (--hmp->flusher.group_lock == 0)
2137 wakeup(&hmp->flusher.group_lock);
2138
2139 /*
2140 * Auto-flush the group if it grows too large. Make sure the
2141 * inode reclaim wait pipeline continues to work.
2142 */
2143 if (flg->total_count >= hammer_autoflush ||
2144 flg->total_count >= hammer_limit_reclaim / 4) {
2145 if (hmp->fill_flush_group == flg)
2146 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2147 hammer_flusher_async(hmp, flg);
2148 }
2149}
2150
2151/*
2152 * Callback for scan of ip->rec_tree. Try to include each record in our
2153 * flush. ip->flush_group has been set but the inode has not yet been
2154 * moved into a flushing state.
2155 *
2156 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2157 * both inodes.
2158 *
2159 * We return 1 for any record placed or found in FST_FLUSH, which prevents
2160 * the caller from shortcutting the flush.
2161 */
2162static int
2163hammer_setup_child_callback(hammer_record_t rec, void *data)
2164{
2165 hammer_flush_group_t flg;
2166 hammer_inode_t target_ip;
2167 hammer_inode_t ip;
2168 int r;
2169
2170 /*
2171 * Records deleted or committed by the backend are ignored.
2172 * Note that the flush detects deleted frontend records at
2173 * multiple points to deal with races. This is just the first
2174 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot
2175 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2176 * messes up link-count calculations.
2177 *
2178 * NOTE: Don't get confused between record deletion and, say,
2179 * directory entry deletion. The deletion of a directory entry
2180 * which is on-media has nothing to do with the record deletion
2181 * flags.
2182 */
2183 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2184 HAMMER_RECF_COMMITTED)) {
2185 if (rec->flush_state == HAMMER_FST_FLUSH) {
2186 KKASSERT(rec->flush_group == rec->ip->flush_group);
2187 r = 1;
2188 } else {
2189 r = 0;
2190 }
2191 return(r);
2192 }
2193
2194 /*
2195 * If the record is in an idle state it has no dependencies and
2196 * can be flushed.
2197 */
2198 ip = rec->ip;
2199 flg = ip->flush_group;
2200 r = 0;
2201
2202 switch(rec->flush_state) {
2203 case HAMMER_FST_IDLE:
2204 /*
2205 * The record has no setup dependency, we can flush it.
2206 */
2207 KKASSERT(rec->target_ip == NULL);
2208 rec->flush_state = HAMMER_FST_FLUSH;
2209 rec->flush_group = flg;
2210 ++flg->refs;
2211 hammer_ref(&rec->lock);
2212 r = 1;
2213 break;
2214 case HAMMER_FST_SETUP:
2215 /*
2216 * The record has a setup dependency. These are typically
2217 * directory entry adds and deletes. Such entries will be
2218 * flushed when their inodes are flushed so we do not
2219 * usually have to add them to the flush here. However,
2220 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2221 * it is asking us to flush this record (and it).
2222 */
2223 target_ip = rec->target_ip;
2224 KKASSERT(target_ip != NULL);
2225 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2226
2227 /*
2228 * If the target IP is already flushing in our group
2229 * we could associate the record, but target_ip has
2230 * already synced ino_data to sync_ino_data and we
2231 * would also have to adjust nlinks. Plus there are
2232 * ordering issues for adds and deletes.
2233 *
2234 * Reflush downward if this is an ADD, and upward if
2235 * this is a DEL.
2236 */
2237 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2238 if (rec->type == HAMMER_MEM_RECORD_ADD)
2239 ip->flags |= HAMMER_INODE_REFLUSH;
2240 else
2241 target_ip->flags |= HAMMER_INODE_REFLUSH;
2242 break;
2243 }
2244
2245 /*
2246 * Target IP is not yet flushing. This can get complex
2247 * because we have to be careful about the recursion.
2248 *
2249 * Directories create an issue for us in that if a flush
2250 * of a directory is requested the expectation is to flush
2251 * any pending directory entries, but this will cause the
2252 * related inodes to recursively flush as well. We can't
2253 * really defer the operation, so we just get as many as we
2254 * can and leave the rest for a later flush group.
2255 */
2256#if 0
2257 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2258 (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2259 /*
2260 * We aren't reclaiming and the target ip was not
2261 * previously prevented from flushing due to this
2262 * record dependency. Do not flush this record.
2263 */
2264 /*r = 0;*/
2265 } else
2266#endif
2267 if (flg->total_count + flg->refs >
2268 ip->hmp->undo_rec_limit) {
2269 /*
2270 * Our flush group is over-full and we risk blowing
2271 * out the UNDO FIFO. Stop the scan, flush what we
2272 * have, then reflush the directory.
2273 *
2274 * The directory may be forced through multiple
2275 * flush groups before it can be completely
2276 * flushed.
2277 */
2278 ip->flags |= HAMMER_INODE_RESIGNAL |
2279 HAMMER_INODE_REFLUSH;
2280 r = -1;
2281 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2282 /*
2283 * If the target IP is not flushing we can force
2284 * it to flush. Even if it is unable to write out
2285 * any of its own records, we have at least one in
2286 * hand that we CAN deal with.
2287 */
2288 rec->flush_state = HAMMER_FST_FLUSH;
2289 rec->flush_group = flg;
2290 ++flg->refs;
2291 hammer_ref(&rec->lock);
2292 hammer_flush_inode_core(target_ip, flg,
2293 HAMMER_FLUSH_RECURSION);
2294 r = 1;
2295 } else {
2296 /*
2297 * General or delete-on-disk record.
2298 *
2299 * XXX this needs help. If a delete-on-disk we could
2300 * disconnect the target. If the target has its own
2301 * dependencies they really need to be flushed.
2302 *
2303 * XXX
2304 */
2305 rec->flush_state = HAMMER_FST_FLUSH;
2306 rec->flush_group = flg;
2307 ++flg->refs;
2308 hammer_ref(&rec->lock);
2309 hammer_flush_inode_core(target_ip, flg,
2310 HAMMER_FLUSH_RECURSION);
2311 r = 1;
2312 }
2313 break;
2314 case HAMMER_FST_FLUSH:
2315 /*
2316 * The record could be part of a previous flush group if the
2317 * inode is a directory (the record being a directory entry).
2318 * Once the flush group was closed a hammer_test_inode()
2319 * function can cause a new flush group to be setup, placing
2320 * the directory inode itself in a new flush group.
2321 *
2322 * When associated with a previous flush group we count it
2323 * as if it were in our current flush group, since it will
2324 * effectively be flushed by the time we flush our current
2325 * flush group.
2326 */
2327 KKASSERT(
2328 rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2329 rec->flush_group == flg);
2330 r = 1;
2331 break;
2332 }
2333 return(r);
2334}
2335
2336#if 0
2337/*
2338 * This version just moves records already in a flush state to the new
2339 * flush group and that is it.
2340 */
2341static int
2342hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2343{
2344 hammer_inode_t ip = rec->ip;
2345
2346 switch(rec->flush_state) {
2347 case HAMMER_FST_FLUSH:
2348 KKASSERT(rec->flush_group == ip->flush_group);
2349 break;
2350 default:
2351 break;
2352 }
2353 return(0);
2354}
2355#endif
2356
2357/*
2358 * Wait for a previously queued flush to complete.
2359 *
2360 * If a critical error occurred we don't try to wait.
2361 */
2362void
2363hammer_wait_inode(hammer_inode_t ip)
2364{
2365 hammer_flush_group_t flg;
2366
2367 flg = NULL;
2368 if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2369 while (ip->flush_state != HAMMER_FST_IDLE &&
2370 (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2371 if (ip->flush_state == HAMMER_FST_SETUP)
2372 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2373
2374 /*
2375 * If the inode was already being flushed its flg
2376 * may not have been queued to the backend. We have
2377 * to make sure it gets queued or we can wind up
2378 * blocked or deadlocked (particularly if we are
2379 * the vnlru thread).
2380 */
2381 KKASSERT(ip->flush_group);
2382 if (ip->flush_group->closed == 0) {
2383 kprintf("hammer: debug: forcing async "
2384 "flush ip %016jx\n",
2385 (intmax_t)ip->obj_id);
2386 hammer_flusher_async(ip->hmp, ip->flush_group);
2387 }
2388 if (ip->flush_state != HAMMER_FST_IDLE) {
2389 ip->flags |= HAMMER_INODE_FLUSHW;
2390 tsleep(&ip->flags, 0, "hmrwin", 0);
2391 }
2392 }
2393 }
2394}
2395
2396/*
2397 * Called by the backend code when a flush has been completed.
2398 * The inode has already been removed from the flush list.
2399 *
2400 * A pipelined flush can occur, in which case we must re-enter the
2401 * inode on the list and re-copy its fields.
2402 */
2403void
2404hammer_flush_inode_done(hammer_inode_t ip, int error)
2405{
2406 hammer_mount_t hmp;
2407 int dorel;
2408
2409 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2410
2411 hmp = ip->hmp;
2412
2413 /*
2414 * Auto-reflush if the backend could not completely flush
2415 * the inode. This fixes a case where a deferred buffer flush
2416 * could cause fsync to return early.
2417 */
2418 if (ip->sync_flags & HAMMER_INODE_MODMASK)
2419 ip->flags |= HAMMER_INODE_REFLUSH;
2420
2421 /*
2422 * Merge left-over flags back into the frontend and fix the state.
2423 * Incomplete truncations are retained by the backend.
2424 */
2425 ip->error = error;
2426 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2427 ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2428
2429 /*
2430 * The backend may have adjusted nlinks, so if the adjusted nlinks
2431 * does not match the frontend, set the frontend's DDIRTY flag again.
2432 */
2433 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2434 ip->flags |= HAMMER_INODE_DDIRTY;
2435
2436 /*
2437 * Fix up the dirty buffer status.
2438 */
2439 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2440 ip->flags |= HAMMER_INODE_BUFS;
2441 }
2442 hammer_redo_fifo_end_flush(ip);
2443
2444 /*
2445 * The merge above re-sets the XDIRTY flag if some of the inode's
2446 * in-memory records could not be flushed; assert consistency.
2447 */
2448 KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2449 (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2450 (!RB_EMPTY(&ip->rec_tree) &&
2451 (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2452
2453 /*
2454 * Do not lose track of inodes which no longer have vnode
2455 * associations, otherwise they may never get flushed again.
2456 *
2457 * The reflush flag can be set superfluously, causing extra pain
2458 * for no reason. If the inode is no longer modified it no longer
2459 * needs to be flushed.
2460 */
2461 if (ip->flags & HAMMER_INODE_MODMASK) {
2462 if (ip->vp == NULL)
2463 ip->flags |= HAMMER_INODE_REFLUSH;
2464 } else {
2465 ip->flags &= ~HAMMER_INODE_REFLUSH;
2466 }
2467
2468 /*
2469 * Adjust the flush state.
2470 */
2471 if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2472 /*
2473 * We were unable to flush out all our records, leave the
2474 * inode in a flush state and in the current flush group.
2475 * The flush group will be re-run.
2476 *
2477 * This occurs if the UNDO block gets too full or there is
2478 * too much dirty meta-data and allows the flusher to
2479 * finalize the UNDO block and then re-flush.
2480 */
2481 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2482 dorel = 0;
2483 } else {
2484 /*
2485 * Remove from the flush_group
2486 */
2487 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2488 ip->flush_group = NULL;
2489
2490#if 0
2491 /*
2492 * Clean up the vnode ref and tracking counts.
2493 */
2494 if (ip->flags & HAMMER_INODE_VHELD) {
2495 ip->flags &= ~HAMMER_INODE_VHELD;
2496 vrele(ip->vp);
2497 }
2498#endif
2499 --hmp->count_iqueued;
2500 --hammer_count_iqueued;
2501
2502 /*
2503 * And adjust the state.
2504 */
2505 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2506 ip->flush_state = HAMMER_FST_IDLE;
2507 dorel = 1;
2508 } else {
2509 ip->flush_state = HAMMER_FST_SETUP;
2510 dorel = 0;
2511 }
2512
2513 /*
2514 * If the frontend is waiting for a flush to complete,
2515 * wake it up.
2516 */
2517 if (ip->flags & HAMMER_INODE_FLUSHW) {
2518 ip->flags &= ~HAMMER_INODE_FLUSHW;
2519 wakeup(&ip->flags);
2520 }
2521
2522 /*
2523 * If the frontend made more changes and requested another
2524 * flush, then try to get it running.
2525 *
2526 * Reflushes are aborted when the inode is errored out.
2527 */
2528 if (ip->flags & HAMMER_INODE_REFLUSH) {
2529 ip->flags &= ~HAMMER_INODE_REFLUSH;
2530 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2531 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2532 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2533 } else {
2534 hammer_flush_inode(ip, 0);
2535 }
2536 }
2537 }
2538
2539 /*
2540 * If we have no parent dependencies we can clear CONN_DOWN
2541 */
2542 if (TAILQ_EMPTY(&ip->target_list))
2543 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2544
2545 /*
2546 * If the inode is now clean drop the space reservation.
2547 */
2548 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2549 (ip->flags & HAMMER_INODE_RSV_INODES)) {
2550 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2551 --hmp->rsv_inodes;
2552 }
2553
2554 if (dorel)
2555 hammer_rel_inode(ip, 0);
2556}
2557
2558/*
2559 * Called from hammer_sync_inode() to synchronize in-memory records
2560 * to the media.
2561 */
2562static int
2563hammer_sync_record_callback(hammer_record_t record, void *data)
2564{
2565 hammer_cursor_t cursor = data;
2566 hammer_transaction_t trans = cursor->trans;
2567 hammer_mount_t hmp = trans->hmp;
2568 int error;
2569
2570 /*
2571 * Skip records that do not belong to the current flush.
2572 */
2573 ++hammer_stats_record_iterations;
2574 if (record->flush_state != HAMMER_FST_FLUSH)
2575 return(0);
2576
2577#if 1
2578 if (record->flush_group != record->ip->flush_group) {
2579 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2580 if (hammer_debug_critical)
2581 Debugger("blah2");
2582 return(0);
2583 }
2584#endif
2585 KKASSERT(record->flush_group == record->ip->flush_group);
2586
2587 /*
2588 * Interlock the record using the BE flag. Once BE is set the
2589 * frontend cannot change the state of FE.
2590 *
2591 * NOTE: If FE is set prior to us setting BE we still sync the
2592 * record out, but the flush completion code converts it to
2593 * a delete-on-disk record instead of destroying it.
2594 */
2595 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2596 record->flags |= HAMMER_RECF_INTERLOCK_BE;
2597
2598 /*
2599 * The backend has already disposed of the record.
2600 */
2601 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2602 error = 0;
2603 goto done;
2604 }
2605
2606 /*
2607 * If the whole inode is being deleted, all on-disk records will
2608 * be deleted very soon, we can't sync any new records to disk
2609 * because they will be deleted in the same transaction they were
2610 * created in (delete_tid == create_tid), which will assert.
2611 *
2612 * XXX There may be a case with RECORD_ADD with DELETED_FE set
2613 * that we currently panic on.
2614 */
2615 if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2616 switch(record->type) {
2617 case HAMMER_MEM_RECORD_DATA:
2618 /*
2619 * We don't have to do anything, if the record was
2620 * committed the space will have been accounted for
2621 * in the blockmap.
2622 */
2623 /* fall through */
2624 case HAMMER_MEM_RECORD_GENERAL:
2625 /*
2626 * Set deleted-by-backend flag. Do not set the
2627 * backend committed flag, because we are throwing
2628 * the record away.
2629 */
2630 record->flags |= HAMMER_RECF_DELETED_BE;
2631 ++record->ip->rec_generation;
2632 error = 0;
2633 goto done;
2634 case HAMMER_MEM_RECORD_ADD:
2635 panic("hammer_sync_record_callback: illegal add "
2636 "during inode deletion record %p", record);
2637 break; /* NOT REACHED */
2638 case HAMMER_MEM_RECORD_INODE:
2639 panic("hammer_sync_record_callback: attempt to "
2640 "sync inode record %p?", record);
2641 break; /* NOT REACHED */
2642 case HAMMER_MEM_RECORD_DEL:
2643 /*
2644 * Follow through and issue the on-disk deletion
2645 */
2646 break;
2647 }
2648 }
2649
2650 /*
2651 * If DELETED_FE is set, special handling is needed for directory
2652 * entries. Dependent pieces related to the directory entry may
2653 * have already been synced to disk. If this occurs we have to
2654 * sync the directory entry and then change the in-memory record
2655 * from an ADD to a DELETE to cover the fact that it's been
2656 * deleted by the frontend.
2657 *
2658 * A directory delete covering record (MEM_RECORD_DEL) can never
2659 * be deleted by the frontend.
2660 *
2661 * Any other record type (aka DATA) can be deleted by the frontend.
2662 * XXX At the moment the flusher must skip it because there may
2663 * be another data record in the flush group for the same block,
2664 * meaning that some frontend data changes can leak into the backend's
2665 * synchronization point.
2666 */
2667 if (record->flags & HAMMER_RECF_DELETED_FE) {
2668 if (record->type == HAMMER_MEM_RECORD_ADD) {
2669 /*
2670 * Convert a front-end deleted directory-add to
2671 * a directory-delete entry later.
2672 */
2673 record->flags |= HAMMER_RECF_CONVERT_DELETE;
2674 } else {
2675 /*
2676 * Dispose of the record (race case). Mark as
2677 * deleted by backend (and not committed).
2678 */
2679 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2680 record->flags |= HAMMER_RECF_DELETED_BE;
2681 ++record->ip->rec_generation;
2682 error = 0;
2683 goto done;
2684 }
2685 }
2686
2687 /*
2688 * Assign the create_tid for new records. Deletions already
2689 * have the record's entire key properly set up.
2690 */
2691 if (record->type != HAMMER_MEM_RECORD_DEL) {
2692 record->leaf.base.create_tid = trans->tid;
2693 record->leaf.create_ts = trans->time32;
2694 }
2695
2696 /*
2697 * This actually moves the record to the on-media B-Tree. We
2698 * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2699 * indicating that the related REDO_WRITE(s) have been committed.
2700 *
2701 * During recovery any REDO_TERM's within the nominal recovery span
2702 * are ignored since the related meta-data is being undone, causing
2703 * any matching REDO_WRITEs to execute. The REDO_TERMs outside
2704 * the nominal recovery span will match against REDO_WRITEs and
2705 * prevent them from being executed (because the meta-data has
2706 * already been synchronized).
2707 */
2708 if (record->flags & HAMMER_RECF_REDO) {
2709 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
2710 hammer_generate_redo(trans, record->ip,
2711 record->leaf.base.key -
2712 record->leaf.data_len,
2713 HAMMER_REDO_TERM_WRITE,
2714 NULL,
2715 record->leaf.data_len);
2716 }
2717 for (;;) {
2718 error = hammer_ip_sync_record_cursor(cursor, record);
2719 if (error != EDEADLK)
2720 break;
2721 hammer_done_cursor(cursor);
2722 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2723 record->ip);
2724 if (error)
2725 break;
2726 }
2727 record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2728
2729 if (error)
2730 error = -error;
2731done:
2732 hammer_flush_record_done(record, error);
2733
2734 /*
2735 * Do partial finalization if we have built up too many dirty
2736 * buffers. Otherwise a buffer cache deadlock can occur when
2737 * doing things like creating tens of thousands of tiny files.
2738 *
2739 * We must release our cursor lock to avoid a 3-way deadlock
2740 * due to the exclusive sync lock the finalizer must get.
2741 *
2742 * WARNING: See warnings in hammer_unlock_cursor() function.
2743 */
2744 if (hammer_flusher_meta_limit(hmp)) {
2745 hammer_unlock_cursor(cursor);
2746 hammer_flusher_finalize(trans, 0);
2747 hammer_lock_cursor(cursor);
2748 }
2749
2750 return(error);
2751}
2752
2753/*
2754 * Backend function called by the flusher to sync an inode to media.
2755 */
2756int
2757hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2758{
2759 struct hammer_cursor cursor;
2760 hammer_node_t tmp_node;
2761 hammer_record_t depend;
2762 hammer_record_t next;
2763 int error, tmp_error;
2764 u_int64_t nlinks;
2765
2766 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2767 return(0);
2768
2769 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2770 if (error)
2771 goto done;
2772
2773 /*
2774 * Any directory records referencing this inode which are not in
2775 * our current flush group must adjust our nlink count for the
2776 * purposes of synchronizing to disk.
2777 *
2778 * Records which are in our flush group can be unlinked from our
2779 * inode now, potentially allowing the inode to be physically
2780 * deleted.
2781 *
2782 * This cannot block.
2783 */
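	/*
	 * Illustration: creating a hard link bumps the frontend's
	 * ino_data.nlinks and queues a MEM_RECORD_ADD directory entry.
	 * If that ADD is not part of this flush group the entry does not
	 * reach the media in this pass, so the count we sync is
	 * decremented back (--nlinks below).  An unsynced MEM_RECORD_DEL
	 * is the mirror case: the media still holds the link, so the
	 * synced count is incremented (++nlinks).
	 */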
2784 nlinks = ip->ino_data.nlinks;
2785 next = TAILQ_FIRST(&ip->target_list);
2786 while ((depend = next) != NULL) {
2787 next = TAILQ_NEXT(depend, target_entry);
2788 if (depend->flush_state == HAMMER_FST_FLUSH &&
2789 depend->flush_group == ip->flush_group) {
2790 /*
2791 * If this is an ADD that was deleted by the frontend
2792 * the frontend nlinks count will have already been
2793 * decremented, but the backend is going to sync its
2794 * directory entry and must account for it. The
2795 * record will be converted to a delete-on-disk when
2796 * it gets synced.
2797 *
2798 * If the ADD was not deleted by the frontend we
2799 * can remove the dependency from our target_list.
2800 */
2801 if (depend->flags & HAMMER_RECF_DELETED_FE) {
2802 ++nlinks;
2803 } else {
2804 TAILQ_REMOVE(&ip->target_list, depend,
2805 target_entry);
2806 depend->target_ip = NULL;
2807 }
2808 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2809 /*
2810 * Not part of our flush group and not deleted by
2811 * the front-end, adjust the link count synced to
2812 * the media (undo what the frontend did when it
2813 * queued the record).
2814 */
2815 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2816 switch(depend->type) {
2817 case HAMMER_MEM_RECORD_ADD:
2818 --nlinks;
2819 break;
2820 case HAMMER_MEM_RECORD_DEL:
2821 ++nlinks;
2822 break;
2823 default:
2824 break;
2825 }
2826 }
2827 }
2828
2829 /*
2830 * Set dirty if we had to modify the link count.
2831 */
2832 if (ip->sync_ino_data.nlinks != nlinks) {
2833 KKASSERT((int64_t)nlinks >= 0);
2834 ip->sync_ino_data.nlinks = nlinks;
2835 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2836 }
2837
2838 /*
2839 * If there is a truncation queued, destroy any data past the (aligned)
2840 * truncation point. Userland will have dealt with the buffer
2841 * containing the truncation point for us.
2842 *
2843 * We don't flush pending frontend data buffers until after we've
2844 * dealt with the truncation.
2845 */
2846 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2847 /*
2848 * Interlock trunc_off. The VOP front-end may continue to
2849 * make adjustments to it while we are blocked.
2850 */
2851 off_t trunc_off;
2852 off_t aligned_trunc_off;
2853 int blkmask;
2854
2855 trunc_off = ip->sync_trunc_off;
2856 blkmask = hammer_blocksize(trunc_off) - 1;
2857 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
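		/*
		 * Example (assuming a 16KB block size at this offset):
		 * trunc_off = 20000 gives blkmask = 0x3fff and
		 * aligned_trunc_off = (20000 + 0x3fff) & ~0x3fff = 32768,
		 * i.e. we round up to the next block boundary and only
		 * destroy whole blocks from there on.
		 */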
2858
2859 /*
2860 * Delete any whole blocks on-media. The front-end has
2861 * already cleaned out any partial block and made it
2862 * pending. The front-end may have updated trunc_off
2863 * while we were blocked so we only use sync_trunc_off.
2864 *
2865 * This operation can blow out the buffer cache, EWOULDBLOCK
2866 * means we were unable to complete the deletion. The
2867 * deletion will update sync_trunc_off in that case.
2868 */
2869 error = hammer_ip_delete_range(&cursor, ip,
2870 aligned_trunc_off,
2871 0x7FFFFFFFFFFFFFFFLL, 2);
2872 if (error == EWOULDBLOCK) {
2873 ip->flags |= HAMMER_INODE_WOULDBLOCK;
2874 error = 0;
2875 goto defer_buffer_flush;
2876 }
2877
2878 if (error)
2879 goto done;
2880
2881 /*
2882 * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
2883 *
2884 * XXX we do this even if we did not previously generate
2885 * a REDO_TRUNC record. This operation may enclose the
2886 * range for multiple prior truncation entries in the REDO
2887 * log.
2888 */
2889 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
2890 (ip->flags & HAMMER_INODE_RDIRTY)) {
2891 hammer_generate_redo(trans, ip, aligned_trunc_off,
2892 HAMMER_REDO_TERM_TRUNC,
2893 NULL, 0);
2894 }
2895
2896 /*
2897 * Clear the truncation flag on the backend after we have
2898 * completed the deletions. Backend data is now good again
2899 * (including new records we are about to sync, below).
2900 *
2901 * Leave sync_trunc_off intact. As we write additional
2902 * records the backend will update sync_trunc_off. This
2903 * tells the backend whether it can skip the overwrite
2904 * test. This should work properly even when the backend
2905 * writes full blocks where the truncation point straddles
2906 * the block because the comparison is against the base
2907 * offset of the record.
2908 */
2909 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2910 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2911 } else {
2912 error = 0;
2913 }
2914
2915 /*
2916 * Now sync related records. These will typically be directory
2917 * entries, records tracking direct-writes, or delete-on-disk records.
2918 */
2919 if (error == 0) {
2920 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2921 hammer_sync_record_callback, &cursor);
2922 if (tmp_error < 0)
2923 tmp_error = -tmp_error;
2924 if (tmp_error)
2925 error = tmp_error;
2926 }
2927 hammer_cache_node(&ip->cache[1], cursor.node);
2928
2929 /*
2930 * Re-seek for inode update, assuming our cache hasn't been ripped
2931 * out from under us.
2932 */
2933 if (error == 0) {
2934 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2935 if (tmp_node) {
2936 hammer_cursor_downgrade(&cursor);
2937 hammer_lock_sh(&tmp_node->lock);
2938 if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2939 hammer_cursor_seek(&cursor, tmp_node, 0);
2940 hammer_unlock(&tmp_node->lock);
2941 hammer_rel_node(tmp_node);
2942 }
2943 error = 0;
2944 }
2945
2946 /*
2947 * If we are deleting the inode the frontend had better not have
2948 * any active references on elements making up the inode.
2949 *
2950 * The call to hammer_ip_delete_clean() cleans up auxiliary records
2951 * but not DB or DATA records. Those must have already been deleted
2952 * by the normal truncation mechanic.
2953 */
2954 if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2955 RB_EMPTY(&ip->rec_tree) &&
2956 (ip->sync_flags & HAMMER_INODE_DELETING) &&
2957 (ip->flags & HAMMER_INODE_DELETED) == 0) {
2958 int count1 = 0;
2959
2960 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2961 if (error == 0) {
2962 ip->flags |= HAMMER_INODE_DELETED;
2963 ip->sync_flags &= ~HAMMER_INODE_DELETING;
2964 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2965 KKASSERT(RB_EMPTY(&ip->rec_tree));
2966
2967 /*
2968 * Set delete_tid in both the frontend and backend
2969 * copy of the inode record. The DELETED flag handles
2970 * this, do not set DDIRTY.
2971 */
2972 ip->ino_leaf.base.delete_tid = trans->tid;
2973 ip->sync_ino_leaf.base.delete_tid = trans->tid;
2974 ip->ino_leaf.delete_ts = trans->time32;
2975 ip->sync_ino_leaf.delete_ts = trans->time32;
2976
2977
2978 /*
2979 * Adjust the inode count in the volume header
2980 */
2981 hammer_sync_lock_sh(trans);
2982 if (ip->flags & HAMMER_INODE_ONDISK) {
2983 hammer_modify_volume_field(trans,
2984 trans->rootvol,
2985 vol0_stat_inodes);
2986 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2987 hammer_modify_volume_done(trans->rootvol);
2988 }
2989 hammer_sync_unlock(trans);
2990 }
2991 }
2992
2993 if (error)
2994 goto done;
2995 ip->sync_flags &= ~HAMMER_INODE_BUFS;
2996
2997defer_buffer_flush:
2998 /*
2999 * Now update the inode's on-disk inode-data and/or on-disk record.
3000 * DELETED and ONDISK are managed only in ip->flags.
3001 *
3002 * In the case of a deferred buffer flush we still update the on-disk
3003 * inode to satisfy visibility requirements if there happen to be
3004 * directory dependencies.
3005 */
3006 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3007 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3008 /*
3009 * If deleted and on-disk, don't set any additional flags.
3010 * The delete flag takes care of things.
3011 *
3012 * Clear flags which may have been set by the frontend.
3013 */
3014 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3015 HAMMER_INODE_SDIRTY |
3016 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3017 HAMMER_INODE_DELETING);
3018 break;
3019 case HAMMER_INODE_DELETED:
3020 /*
3021 * Take care of the case where a deleted inode was never
3022 * flushed to the disk in the first place.
3023 *
3024 * Clear flags which may have been set by the frontend.
3025 */
3026 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3027 HAMMER_INODE_SDIRTY |
3028 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3029 HAMMER_INODE_DELETING);
3030 while (RB_ROOT(&ip->rec_tree)) {
3031 hammer_record_t record = RB_ROOT(&ip->rec_tree);
3032 hammer_ref(&record->lock);
3033 KKASSERT(hammer_oneref(&record->lock));
3034 record->flags |= HAMMER_RECF_DELETED_BE;
3035 ++record->ip->rec_generation;
3036 hammer_rel_mem_record(record);
3037 }
3038 break;
3039 case HAMMER_INODE_ONDISK:
3040 /*
3041 * If already on-disk, do not set any additional flags.
3042 */
3043 break;
3044 default:
3045 /*
3046 * If not on-disk and not deleted, set DDIRTY to force
3047 * an initial record to be written.
3048 *
3049 * Also set the create_tid in both the frontend and backend
3050 * copy of the inode record.
3051 */
3052 ip->ino_leaf.base.create_tid = trans->tid;
3053 ip->ino_leaf.create_ts = trans->time32;
3054 ip->sync_ino_leaf.base.create_tid = trans->tid;
3055 ip->sync_ino_leaf.create_ts = trans->time32;
3056 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3057 break;
3058 }
3059
3060 /*
3061 * If DDIRTY or SDIRTY is set, write out a new record.
3062 * If the inode is already on-disk the old record is marked as
3063 * deleted.
3064 *
3065 * If DELETED is set hammer_update_inode() will delete the existing
3066 * record without writing out a new one.
3067 *
3068 * If *ONLY* the ITIMES flag is set we can update the record in-place.
3069 */
3070 if (ip->flags & HAMMER_INODE_DELETED) {
3071 error = hammer_update_inode(&cursor, ip);
3072 } else
3073 if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3074 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3075 error = hammer_update_itimes(&cursor, ip);
3076 } else
3077 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3078 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3079 error = hammer_update_inode(&cursor, ip);
3080 }
3081done:
3082 if (error) {
3083 hammer_critical_error(ip->hmp, ip, error,
3084 "while syncing inode");
3085 }
3086 hammer_done_cursor(&cursor);
3087 return(error);
3088}
3089
3090/*
3091 * This routine is called when the OS is no longer actively referencing
3092 * the inode (but might still be keeping it cached), or when releasing
3093 * the last reference to an inode.
3094 *
3095 * At this point if the inode's nlinks count is zero we want to destroy
3096 * it, which may mean destroying it on-media too.
3097 */
3098void
3099hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3100{
3101 struct vnode *vp;
3102
3103 /*
3104 * Set the DELETING flag when the link count drops to 0 and the
3105 * OS no longer has any opens on the inode.
3106 *
3107 * The backend will clear DELETING (a mod flag) and set DELETED
3108 * (a state flag) when it is actually able to perform the
3109 * operation.
3110 *
3111 * Don't reflag the deletion if the flusher is currently syncing
3112 * one that was already flagged. A previously set DELETING flag
3113 * may bounce around flags and sync_flags until the operation is
3114 * completely done.
3115 *
3116 * Do not attempt to modify a snapshot inode (one set to read-only).
3117 */
3118 if (ip->ino_data.nlinks == 0 &&
3119 ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3120 ip->flags |= HAMMER_INODE_DELETING;
3121 ip->flags |= HAMMER_INODE_TRUNCATED;
3122 ip->trunc_off = 0;
3123 vp = NULL;
3124 if (getvp) {
3125 if (hammer_get_vnode(ip, &vp) != 0)
3126 return;
3127 }
3128
3129 /*
3130 * Final cleanup
3131 */
3132 if (ip->vp)
3133 nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0);
3134 if (getvp)
3135 vput(vp);
3136 }
3137}
3138
3139/*
3140 * After potentially resolving a dependency the inode is tested
3141 * to determine whether it needs to be reflushed.
3142 */
3143void
3144hammer_test_inode(hammer_inode_t ip)
3145{
3146 if (ip->flags & HAMMER_INODE_REFLUSH) {
3147 ip->flags &= ~HAMMER_INODE_REFLUSH;
3148 hammer_ref(&ip->lock);
3149 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3150 ip->flags &= ~HAMMER_INODE_RESIGNAL;
3151 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3152 } else {
3153 hammer_flush_inode(ip, 0);
3154 }
3155 hammer_rel_inode(ip, 0);
3156 }
3157}
3158
3159/*
3160 * Clear the RECLAIM flag on an inode. This occurs when the inode is
3161 * reassociated with a vp or just before it gets freed.
3162 *
3163 * Pipeline wakeups to threads blocked due to an excessive number of
3164 * detached inodes. This typically occurs when atime updates accumulate
3165 * while scanning a directory tree.
3166 */
3167static void
3168hammer_inode_wakereclaims(hammer_inode_t ip)
3169{
3170 struct hammer_reclaim *reclaim;
3171 hammer_mount_t hmp = ip->hmp;
3172
3173 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3174 return;
3175
3176 --hammer_count_reclaiming;
3177 --hmp->inode_reclaims;
3178 ip->flags &= ~HAMMER_INODE_RECLAIM;
3179
3180 if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3181 KKASSERT(reclaim->count > 0);
3182 if (--reclaim->count == 0) {
3183 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3184 wakeup(reclaim);
3185 }
3186 }
3187}
3188
3189/*
3190 * Setup our reclaim pipeline. We only let so many detached (and dirty)
3191 * inodes build up before we start blocking. This routine is called
3192 * if a new inode is created or an inode is loaded from media.
3193 *
3194 * When we block we don't care *which* inode has finished reclaiming,
3195 * as long as one does.
3196 *
3197 * The reclaim pipeline is primarily governed by the auto-flush which is
3198 * 1/4 hammer_limit_reclaim. We don't want to block if the count is
3199 * less than 1/2 hammer_limit_reclaim. From 1/2 to full count is
3200 * dynamically governed.
3201 */
3202void
3203hammer_inode_waitreclaims(hammer_transaction_t trans)
3204{
3205 hammer_mount_t hmp = trans->hmp;
3206 struct hammer_reclaim reclaim;
3207 int lower_limit;
3208
3209 /*
3210 * Track inode load; delay if the number of reclaiming inodes is
3211 * between 2/4 and 4/4 hammer_limit_reclaim, depending on the caller's load.
3212 */
3213 if (curthread->td_proc) {
3214 struct hammer_inostats *stats;
3215
3216 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3217 ++stats->count;
3218
3219 if (stats->count > hammer_limit_reclaim / 2)
3220 stats->count = hammer_limit_reclaim / 2;
3221 lower_limit = hammer_limit_reclaim - stats->count;
3222 if (hammer_debug_general & 0x10000) {
3223 kprintf("pid %5d limit %d\n",
3224 (int)curthread->td_proc->p_pid, lower_limit);
3225 }
3226 } else {
3227 lower_limit = hammer_limit_reclaim * 3 / 4;
3228 }
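	/*
	 * Illustrative numbers, assuming hammer_limit_reclaim is 4000:
	 * a process which recently loaded many inodes saturates
	 * stats->count at 2000 and blocks once 2000 inodes are in
	 * reclaim, while a quiet process only blocks near 4000.
	 * Threads without a process context use the fixed 3/4 point
	 * (3000).
	 */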
3229 if (hmp->inode_reclaims >= lower_limit) {
3230 reclaim.count = 1;
3231 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3232 tsleep(&reclaim, 0, "hmrrcm", hz);
3233 if (reclaim.count > 0)
3234 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3235 }
3236}
3237
3238/*
3239 * Keep track of reclaim statistics on a per-pid basis using a loose
3240 * 4-way set associative hash table. Collisions inherit the count of
3241 * the previous entry.
3242 *
3243 * NOTE: We want to be careful here to limit the chain size. If the chain
3244 * size is too large a pid will spread its stats out over too many
3245 * entries under certain types of heavy filesystem activity and
3246 * wind up not delaying long enough.
3247 */
3248static
3249struct hammer_inostats *
3250hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3251{
3252 struct hammer_inostats *stats;
3253 int delta;
3254 int chain;
3255 static volatile int iterator; /* we don't care about MP races */
3256
3257 /*
3258 * Chain up to 4 times to find our entry.
3259 */
3260 for (chain = 0; chain < 4; ++chain) {
3261 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3262 if (stats->pid == pid)
3263 break;
3264 }
3265
3266 /*
3267 * Replace one of the four chaining entries with our new entry.
3268 */
3269 if (chain == 4) {
3270 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3271 HAMMER_INOSTATS_HMASK];
3272 stats->pid = pid;
3273 }
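	/*
	 * Note that the evicted slot keeps its old count, so a new pid
	 * colliding with a busy slot starts out partially throttled
	 * rather than at zero; this is the collision-inheritance
	 * behaviour described above.
	 */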
3274
3275 /*
3276 * Decay the entry
3277 */
3278 if (stats->count && stats->ltick != ticks) {
3279 delta = ticks - stats->ltick;
3280 stats->ltick = ticks;
3281 if (delta <= 0 || delta > hz * 60)
3282 stats->count = 0;
3283 else
3284 stats->count = stats->count * hz / (hz + delta);
3285 }
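	/*
	 * e.g. with hz = 100, one second of inactivity (delta == hz)
	 * halves the count (count * hz / (hz + hz)); an entry idle for
	 * more than a minute is zeroed outright above.
	 */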
3286 if (hammer_debug_general & 0x10000)
3287 kprintf("pid %5d stats %d\n", (int)pid, stats->count);
3288 return (stats);
3289}
3290
3291#if 0
3292
3293/*
3294 * XXX not used, doesn't work very well due to the large batching nature
3295 * of flushes.
3296 *
3297 * A larger than normal backlog of inodes is sitting in the flusher,
3298 * enforce a general slowdown to let it catch up. This routine is only
3299 * called on completion of a non-flusher-related transaction which
3300 * performed B-Tree node I/O.
3301 *
3302 * It is possible for the flusher to stall in a continuous load.
3303 * blogbench -i1000 -o seems to do a good job generating this sort of load.
3304 * If the flusher is unable to catch up the inode count can bloat until
3305 * we run out of kvm.
3306 *
3307 * This is a bit of a hack.
3308 */
3309void
3310hammer_inode_waithard(hammer_mount_t hmp)
3311{
3312 /*
3313 * Hysteresis.
3314 */
3315 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3316 if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
3317 hmp->count_iqueued < hmp->count_inodes / 20) {
3318 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3319 return;
3320 }
3321 } else {
3322 if (hmp->inode_reclaims < hammer_limit_reclaim ||
3323 hmp->count_iqueued < hmp->count_inodes / 10) {
3324 return;
3325 }
3326 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3327 }
3328
3329 /*
3330 * Block for one flush cycle.
3331 */
3332 hammer_flusher_wait_next(hmp);
3333}
3334
3335#endif