sys/vfs/hammer/hammer_inode.c
1/*
2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.39 2008/04/26 08:02:17 dillon Exp $
35 */
36
37#include "hammer.h"
38#include <sys/buf.h>
39#include <sys/buf2.h>
40
41static int hammer_unload_inode(struct hammer_inode *ip);
42static void hammer_flush_inode_copysync(hammer_inode_t ip);
43static int hammer_mark_record_callback(hammer_record_t rec, void *data);
44
45/*
46 * The kernel is not actively referencing this vnode but is still holding
47 * it cached.
48 *
49 * This is called from the frontend.
50 */
51int
52hammer_vop_inactive(struct vop_inactive_args *ap)
53{
54 struct hammer_inode *ip = VTOI(ap->a_vp);
55
56 /*
57 * Degenerate case
58 */
59 if (ip == NULL) {
60 vrecycle(ap->a_vp);
61 return(0);
62 }
63
64 /*
65 * If the inode no longer has any references we recover its
66 * in-memory resources immediately.
67 *
68 * NOTE: called from frontend, use ino_rec instead of sync_ino_rec.
69 */
70 if (ip->ino_rec.ino_nlinks == 0)
71 vrecycle(ap->a_vp);
72 return(0);
73}
74
75/*
76 * Release the vnode association. This is typically (but not always)
77 * the last reference on the inode and will flush the inode to the
78 * buffer cache.
79 *
80 * XXX Currently our sync code only runs through inodes with vnode
81 * associations, so we depend on hammer_rel_inode() to sync any inode
82 * record data to the block device prior to losing the association.
83 * Otherwise transactions that the user expected to be distinct by
84 * doing a manual sync may be merged.
85 */
86int
87hammer_vop_reclaim(struct vop_reclaim_args *ap)
88{
89 struct hammer_inode *ip;
90 struct vnode *vp;
91
92 vp = ap->a_vp;
93
94 if ((ip = vp->v_data) != NULL) {
95 vp->v_data = NULL;
96 ip->vp = NULL;
97
98 /*
99 * Don't let too many dependencies build up on unreferenced
100 * inodes or we could run ourselves out of memory.
101 */
102 if (TAILQ_FIRST(&ip->depend_list)) {
103 ip->hmp->reclaim_count += ip->depend_count;
104 if (ip->hmp->reclaim_count > 256) {
105 ip->hmp->reclaim_count = 0;
106 hammer_flusher_async(ip->hmp);
107 }
108 }
109 hammer_rel_inode(ip, 1);
110 }
111 return(0);
112}
113
114/*
115 * Return a locked vnode for the specified inode. The inode must be
116 * referenced but NOT LOCKED on entry and will remain referenced on
117 * return.
118 *
119 * Called from the frontend.
120 */
121int
122hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp)
123{
124 struct vnode *vp;
125 int error = 0;
126
127 for (;;) {
128 if ((vp = ip->vp) == NULL) {
129 error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
130 if (error)
131 break;
132 hammer_lock_ex(&ip->lock);
133 if (ip->vp != NULL) {
134 hammer_unlock(&ip->lock);
135 vp->v_type = VBAD;
136 vx_put(vp);
137 continue;
138 }
139 hammer_ref(&ip->lock);
140 vp = *vpp;
141 ip->vp = vp;
142 vp->v_type = hammer_get_vnode_type(
143 ip->ino_rec.base.base.obj_type);
144
145 switch(ip->ino_rec.base.base.obj_type) {
146 case HAMMER_OBJTYPE_CDEV:
147 case HAMMER_OBJTYPE_BDEV:
148 vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
149 addaliasu(vp, ip->ino_data.rmajor,
150 ip->ino_data.rminor);
151 break;
152 case HAMMER_OBJTYPE_FIFO:
153 vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
154 break;
155 default:
156 break;
157 }
158
159 /*
160 * Only mark as the root vnode if the ip is not
161 * historical, otherwise the VFS cache will get
162 * confused. The other half of the special handling
163 * is in hammer_vop_nlookupdotdot().
164 */
165 if (ip->obj_id == HAMMER_OBJID_ROOT &&
166 ip->obj_asof == ip->hmp->asof) {
167 vp->v_flag |= VROOT;
168 }
169
170 vp->v_data = (void *)ip;
171 /* vnode locked by getnewvnode() */
172 /* make related vnode dirty if inode dirty? */
173 hammer_unlock(&ip->lock);
174 if (vp->v_type == VREG)
175 vinitvmio(vp, ip->ino_rec.ino_size);
176 break;
177 }
178
179 /*
180 * loop if the vget fails (aka races), or if the vp
181 * no longer matches ip->vp.
182 */
183 if (vget(vp, LK_EXCLUSIVE) == 0) {
184 if (vp == ip->vp)
185 break;
186 vput(vp);
187 }
188 }
189 *vpp = vp;
190 return(error);
191}
192
193/*
194 * Acquire a HAMMER inode. The returned inode is not locked. These functions
195 * do not attach or detach the related vnode (use hammer_get_vnode() for
196 * that).
197 *
198 * The flags argument is only applied for newly created inodes, and only
199 * certain flags are inherited.
200 *
201 * Called from the frontend.
202 */
203struct hammer_inode *
204hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
205 u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
206{
207 hammer_mount_t hmp = trans->hmp;
208 struct hammer_inode_info iinfo;
209 struct hammer_cursor cursor;
210 struct hammer_inode *ip;
211
212 /*
213 * Determine if we already have an inode cached. If we do then
214 * we are golden.
215 */
216 iinfo.obj_id = obj_id;
217 iinfo.obj_asof = asof;
218loop:
219 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
220 if (ip) {
221 hammer_ref(&ip->lock);
222 *errorp = 0;
223 return(ip);
224 }
225
226 ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
227 ++hammer_count_inodes;
228 ip->obj_id = obj_id;
229 ip->obj_asof = iinfo.obj_asof;
230 ip->hmp = hmp;
231 ip->flags = flags & HAMMER_INODE_RO;
232 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
233 if (hmp->ronly)
234 ip->flags |= HAMMER_INODE_RO;
235 RB_INIT(&ip->rec_tree);
236 TAILQ_INIT(&ip->bio_list);
237 TAILQ_INIT(&ip->bio_alt_list);
238 TAILQ_INIT(&ip->depend_list);
239
240 /*
241 * Locate the on-disk inode.
242 */
243retry:
244 hammer_init_cursor(trans, &cursor, cache);
245 cursor.key_beg.obj_id = ip->obj_id;
246 cursor.key_beg.key = 0;
247 cursor.key_beg.create_tid = 0;
248 cursor.key_beg.delete_tid = 0;
249 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
250 cursor.key_beg.obj_type = 0;
251 cursor.asof = iinfo.obj_asof;
252 cursor.flags = HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_GET_DATA |
253 HAMMER_CURSOR_ASOF;
254
255 *errorp = hammer_btree_lookup(&cursor);
256 if (*errorp == EDEADLK) {
257 hammer_done_cursor(&cursor);
258 goto retry;
259 }
260
261 /*
262 * On success the B-Tree lookup will hold the appropriate
263 * buffer cache buffers and provide a pointer to the requested
264 * information. Copy the information to the in-memory inode
265 * and cache the B-Tree node to improve future operations.
266 */
267 if (*errorp == 0) {
268 ip->ino_rec = cursor.record->inode;
269 ip->ino_data = cursor.data->inode;
270 hammer_cache_node(cursor.node, &ip->cache[0]);
271 if (cache)
272 hammer_cache_node(cursor.node, cache);
273 }
274
275 /*
276 * On success the record and data have been loaded, so insert the
277 * inode into the in-memory inode RB tree. It is possible to race another
278 * lookup's insertion of the same inode, so deal with that condition too.
279 *
280 * The cursor's locked node interlocks against others creating and
281 * destroying ip while we were blocked.
282 */
283 if (*errorp == 0) {
284 hammer_ref(&ip->lock);
285 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
286 hammer_uncache_node(&ip->cache[0]);
287 hammer_uncache_node(&ip->cache[1]);
288 KKASSERT(ip->lock.refs == 1);
289 --hammer_count_inodes;
290 kfree(ip, M_HAMMER);
291 hammer_done_cursor(&cursor);
292 goto loop;
293 }
294 ip->flags |= HAMMER_INODE_ONDISK;
295 } else {
296 --hammer_count_inodes;
297 kfree(ip, M_HAMMER);
298 ip = NULL;
299 }
300 hammer_done_cursor(&cursor);
301 return (ip);
302}
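
/*
 * An illustrative sketch of how a front-end caller typically pairs
 * hammer_get_inode() with hammer_get_vnode() above (hypothetical locals,
 * abbreviated error handling):
 *
 *	struct hammer_inode *ip;
 *	struct vnode *vp;
 *	int error;
 *
 *	ip = hammer_get_inode(trans, NULL, obj_id, asof, 0, &error);
 *	if (ip == NULL)
 *		return (error);
 *	error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
 *	hammer_rel_inode(ip, 0);
 *	if (error == 0) {
 *		... operate on the locked vnode ...
 *		vput(vp);
 *	}
 *
 * hammer_get_vnode() takes its own inode reference when it attaches the
 * vnode, so the lookup reference can be dropped right after the call.
 */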
303
304/*
305 * Create a new filesystem object, returning the inode in *ipp. The
306 * returned inode will be referenced and shared-locked. The caller
307 * must unlock and release it when finished.
308 *
309 * The inode is created in-memory.
310 */
311int
312hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
313 struct ucred *cred, hammer_inode_t dip,
314 struct hammer_inode **ipp)
315{
316 hammer_mount_t hmp;
317 hammer_inode_t ip;
318 uid_t xuid;
319
320 hmp = trans->hmp;
321 ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
322 ++hammer_count_inodes;
323 ip->obj_id = hammer_alloc_tid(trans);
324 KKASSERT(ip->obj_id != 0);
325 ip->obj_asof = hmp->asof;
326 ip->hmp = hmp;
327 ip->flush_state = HAMMER_FST_IDLE;
328 ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
329 HAMMER_INODE_ITIMES;
330
331 RB_INIT(&ip->rec_tree);
332 TAILQ_INIT(&ip->bio_list);
333 TAILQ_INIT(&ip->bio_alt_list);
334 TAILQ_INIT(&ip->depend_list);
335
336 ip->ino_rec.ino_atime = trans->time;
337 ip->ino_rec.ino_mtime = trans->time;
338 ip->ino_rec.ino_size = 0;
339 ip->ino_rec.ino_nlinks = 0;
340 /* XXX */
341 ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
342 ip->ino_rec.base.base.obj_id = ip->obj_id;
343 ip->ino_rec.base.base.key = 0;
344 ip->ino_rec.base.base.create_tid = 0;
345 ip->ino_rec.base.base.delete_tid = 0;
346 ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
347 ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);
348
349 ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
350 ip->ino_data.mode = vap->va_mode;
351 ip->ino_data.ctime = trans->time;
352 ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;
353
354 switch(ip->ino_rec.base.base.obj_type) {
355 case HAMMER_OBJTYPE_CDEV:
356 case HAMMER_OBJTYPE_BDEV:
357 ip->ino_data.rmajor = vap->va_rmajor;
358 ip->ino_data.rminor = vap->va_rminor;
359 break;
360 default:
361 break;
362 }
363
364 /*
365 * Calculate default uid/gid and overwrite with information from
366 * the vap.
367 */
368 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
369 ip->ino_data.gid = dip->ino_data.gid;
370 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
371 &vap->va_mode);
372 ip->ino_data.mode = vap->va_mode;
373
374 if (vap->va_vaflags & VA_UID_UUID_VALID)
375 ip->ino_data.uid = vap->va_uid_uuid;
376 else if (vap->va_uid != (uid_t)VNOVAL)
377 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
378 if (vap->va_vaflags & VA_GID_UUID_VALID)
379 ip->ino_data.gid = vap->va_gid_uuid;
380 else if (vap->va_gid != (gid_t)VNOVAL)
381 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
382
383 hammer_ref(&ip->lock);
384 hammer_lock_sh(&ip->lock);
385 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
386 hammer_unref(&ip->lock);
387 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
388 }
389 *ipp = ip;
390 return(0);
391}
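
/*
 * Sketch of the caller contract described above (illustrative only,
 * hypothetical locals): the new inode comes back referenced and
 * shared-locked, so a creating VOP ends up doing something like
 *
 *	struct hammer_inode *nip;
 *	int error;
 *
 *	error = hammer_create_inode(trans, vap, cred, dip, &nip);
 *	if (error == 0) {
 *		... add the directory entry, obtain a vnode, etc ...
 *		hammer_unlock(&nip->lock);
 *		hammer_rel_inode(nip, 0);
 *	}
 */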
392
393/*
394 * Called by hammer_sync_inode().
395 */
396static int
397hammer_update_inode(hammer_transaction_t trans, hammer_inode_t ip)
398{
399 struct hammer_cursor cursor;
400 hammer_record_t record;
401 int error;
402
403 /*
404 * Locate the record on-disk and mark it as deleted. Both the B-Tree
405 * node and the record must be marked deleted. The record may or
406 * may not be physically deleted, depending on the retention policy.
407 *
408 * If the inode has already been deleted on-disk we have nothing
409 * to do.
410 *
411 * XXX Update the inode record and data in-place if the retention
412 * policy allows it.
413 */
414retry:
415 error = 0;
416
417 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
418 HAMMER_INODE_ONDISK) {
419 hammer_init_cursor(trans, &cursor, &ip->cache[0]);
420 cursor.key_beg.obj_id = ip->obj_id;
421 cursor.key_beg.key = 0;
422 cursor.key_beg.create_tid = 0;
423 cursor.key_beg.delete_tid = 0;
424 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
425 cursor.key_beg.obj_type = 0;
426 cursor.asof = ip->obj_asof;
427 cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
428 cursor.flags |= HAMMER_CURSOR_BACKEND;
429
430 error = hammer_btree_lookup(&cursor);
431 if (error) {
432 kprintf("error %d\n", error);
433 Debugger("hammer_update_inode");
434 }
435
436
437 if (error == 0) {
438 error = hammer_ip_delete_record(&cursor, trans->tid);
439 if (error && error != EDEADLK) {
440 kprintf("error %d\n", error);
441 Debugger("hammer_update_inode2");
442 }
443 if (error == 0)
444 ip->flags |= HAMMER_INODE_DELONDISK;
445 hammer_cache_node(cursor.node, &ip->cache[0]);
446 }
447 hammer_done_cursor(&cursor);
448 if (error == EDEADLK)
449 goto retry;
450 }
451
452 /*
453 * Write out a new record if the in-memory inode is not marked
454 * as having been deleted. Update our inode statistics if this
455 * is the first time the inode is laid down on-disk.
456 *
457 * If the inode has been deleted permanently, HAMMER_INODE_DELONDISK
458 * will remain set and prevent further updates.
459 */
460 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
461 record = hammer_alloc_mem_record(ip);
462 record->state = HAMMER_FST_FLUSH;
463 record->rec.inode = ip->sync_ino_rec;
464 record->rec.inode.base.base.create_tid = trans->tid;
465 record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
466 record->data = (void *)&ip->sync_ino_data;
467 error = hammer_ip_sync_record(trans, record);
468 if (error) {
469 kprintf("error %d\n", error);
470 Debugger("hammer_update_inode3");
471 }
472 hammer_delete_mem_record(record);
473 hammer_rel_mem_record(record);
474 if (error == 0) {
475 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
476 HAMMER_INODE_DDIRTY |
477 HAMMER_INODE_ITIMES);
478 ip->flags &= ~HAMMER_INODE_DELONDISK;
479 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
480 hammer_modify_volume(trans, trans->rootvol,
481 NULL, 0);
482 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
483 hammer_modify_volume_done(trans->rootvol);
484 ip->flags |= HAMMER_INODE_ONDISK;
485 }
486 }
487 }
488 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
489 /*
490 * Clean out any left-over flags if the inode has been
491 * destroyed.
492 */
493 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
494 HAMMER_INODE_DDIRTY |
495 HAMMER_INODE_ITIMES);
496 }
497 return(error);
498}
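
/*
 * The net effect of the update path above on the media, sketched for two
 * successive syncs at tid1 < tid2 (illustrative; the old record survives
 * only as far as the retention policy allows):
 *
 *	old inode record: create_tid = tid1, delete_tid = tid2
 *	new inode record: create_tid = tid2, delete_tid = 0
 *
 * An as-of lookup with an asof earlier than tid2 still matches the old
 * record, which is what makes historical access work.
 */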
499
500/*
501 * Update only the itimes fields. This is done non-historically. The
502 * record is updated in-place on the disk.
503 */
504static int
505hammer_update_itimes(hammer_transaction_t trans, hammer_inode_t ip)
506{
507 struct hammer_cursor cursor;
508 struct hammer_inode_record *rec;
509 int error;
510
511retry:
512 error = 0;
513 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
514 HAMMER_INODE_ONDISK) {
515 hammer_init_cursor(trans, &cursor, &ip->cache[0]);
516 cursor.key_beg.obj_id = ip->obj_id;
517 cursor.key_beg.key = 0;
518 cursor.key_beg.create_tid = 0;
519 cursor.key_beg.delete_tid = 0;
520 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
521 cursor.key_beg.obj_type = 0;
522 cursor.asof = ip->obj_asof;
523 cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
524 cursor.flags |= HAMMER_CURSOR_BACKEND;
525
526 error = hammer_btree_lookup(&cursor);
527 if (error) {
528 kprintf("error %d\n", error);
529 Debugger("hammer_update_itimes1");
530 }
531 if (error == 0) {
532 /*
533 * Do not generate UNDO records for atime/mtime
534 * updates.
535 */
536 rec = &cursor.record->inode;
537 hammer_modify_buffer(cursor.trans, cursor.record_buffer,
538 NULL, 0);
539 rec->ino_atime = ip->sync_ino_rec.ino_atime;
540 rec->ino_mtime = ip->sync_ino_rec.ino_mtime;
541 hammer_modify_buffer_done(cursor.record_buffer);
542 ip->sync_flags &= ~HAMMER_INODE_ITIMES;
543 /* XXX recalculate crc */
544 hammer_cache_node(cursor.node, &ip->cache[0]);
545 }
546 hammer_done_cursor(&cursor);
547 if (error == EDEADLK)
548 goto retry;
549 }
550 return(error);
551}
552
553/*
554 * Release a reference on an inode. If asked to flush, the last
555 * release will flush the inode.
556 *
557 * On the last reference we queue the inode to the flusher for its final
558 * disposition.
559 */
560void
561hammer_rel_inode(struct hammer_inode *ip, int flush)
562{
563 /*
564 * Handle disposition when dropping the last ref.
565 */
566 while (ip->lock.refs == 1) {
567#if 0
568 /*
569 * XXX this can create a deep stack recursion
570 */
571 if (curthread == ip->hmp->flusher_td) {
572 /*
573 * We are the flusher, do any required flushes
574 * before unloading the inode.
575 */
576 int error = 0;
577
578 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
579 while (error == 0 &&
580 (ip->flags & HAMMER_INODE_MODMASK)) {
581 hammer_ref(&ip->lock);
582 hammer_flush_inode_copysync(ip);
583 error = hammer_sync_inode(ip, 1);
584 hammer_flush_inode_done(ip);
585 }
586 if (error)
587 kprintf("hammer_sync_inode failed error %d\n",
588 error);
589 if (ip->lock.refs > 1)
590 continue;
591 hammer_unload_inode(ip);
592 return;
593 }
594#endif
595 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
596 hammer_unload_inode(ip);
597 return;
598 }
599
600 /*
601 * Hand the inode over to the flusher, which will
602 * add another ref to it.
603 */
604 if (++ip->hmp->reclaim_count > 256) {
605 ip->hmp->reclaim_count = 0;
606 hammer_flush_inode(ip, HAMMER_FLUSH_FORCE |
607 HAMMER_FLUSH_SIGNAL);
608 } else {
609 hammer_flush_inode(ip, HAMMER_FLUSH_FORCE);
610 }
611 /* retry */
612 }
613
614 /*
615 * The inode still has multiple refs; drop one ref. If a flush was
616 * requested make sure the flusher sees it.
617 */
618 if (flush && ip->flush_state == HAMMER_FST_IDLE)
619 hammer_flush_inode(ip, HAMMER_FLUSH_RELEASE);
620 else
621 hammer_unref(&ip->lock);
622}
623
624/*
625 * Unload and destroy the specified inode. Must be called with one remaining
626 * reference. The reference is disposed of.
627 *
628 * This can only be called in the context of the flusher.
629 */
630static int
631hammer_unload_inode(struct hammer_inode *ip)
632{
633
634 KASSERT(ip->lock.refs == 1,
635 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
636 KKASSERT(ip->vp == NULL);
637 KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
638 KKASSERT(ip->cursor_ip_refs == 0);
639 KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
640
641 KKASSERT(RB_EMPTY(&ip->rec_tree));
642 KKASSERT(TAILQ_EMPTY(&ip->bio_list));
643 KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
644
645 RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);
646
647 hammer_uncache_node(&ip->cache[0]);
648 hammer_uncache_node(&ip->cache[1]);
649 --hammer_count_inodes;
650 kfree(ip, M_HAMMER);
651
652 return(0);
653}
654
655/*
656 * A transaction has modified an inode, requiring updates as specified by
657 * the passed flags.
658 *
659 * HAMMER_INODE_RDIRTY: Inode record has been updated
660 * HAMMER_INODE_DDIRTY: Inode data has been updated
661 * HAMMER_INODE_XDIRTY: Dirty frontend buffer cache buffer strategized
662 * HAMMER_INODE_DELETED: Inode record/data must be deleted
663 * HAMMER_INODE_ITIMES: mtime/atime has been updated
664 */
665void
666hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
667{
668 KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 ||
669 (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
670 HAMMER_INODE_XDIRTY|
671 HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);
672
673 ip->flags |= flags;
674}
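
/*
 * Illustrative use of the flags listed above (hypothetical example, not
 * taken from any particular caller): a front-end operation that changes
 * the inode's size and mtime would do roughly
 *
 *	ip->ino_rec.ino_size = new_size;
 *	ip->ino_rec.ino_mtime = trans->time;
 *	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY |
 *					HAMMER_INODE_ITIMES);
 *
 * so the flusher knows a new inode record must be written out.
 */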
675
676/*
677 * Flush an inode. If the inode is already being flushed it is flagged
678 * for another flush once the current one completes. The interlock is
679 * against front-end transactions; the backend flusher does not hold the lock.
680 *
681 * The flusher must distinguish between the records that are part of the
682 * flush and any new records created in parallel with the flush. The
683 * inode data and truncation fields are also copied. BIOs are a bit more
684 * troublesome because some dirty buffers may not have been queued yet.
685 */
686void
687hammer_flush_inode(hammer_inode_t ip, int flags)
688{
689 if (ip->flush_state != HAMMER_FST_IDLE &&
690 (ip->flags & HAMMER_INODE_MODMASK)) {
691 ip->flags |= HAMMER_INODE_REFLUSH;
692 if (flags & HAMMER_FLUSH_RELEASE) {
693 hammer_unref(&ip->lock);
694 KKASSERT(ip->lock.refs > 0);
695 }
696 return;
697 }
698 if (ip->flush_state == HAMMER_FST_IDLE) {
699 if ((ip->flags & HAMMER_INODE_MODMASK) ||
700 (flags & HAMMER_FLUSH_FORCE)) {
701 /*
702 * Add a reference to represent the inode being queued
703 * to the flusher. If the caller wants us to
704 * release a reference the two cancel each other out.
705 */
706 if ((flags & HAMMER_FLUSH_RELEASE) == 0)
707 hammer_ref(&ip->lock);
708
709 hammer_flush_inode_copysync(ip);
710 /*
711 * Move the inode to the flush list and add a ref to
712 * it representing it on the list.
713 */
714 TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
715 if (flags & HAMMER_FLUSH_SIGNAL)
716 hammer_flusher_async(ip->hmp);
717 }
718 }
719}
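
/*
 * Informal summary of the flush state transitions driven by the routines
 * above and below:
 *
 *	HAMMER_FST_IDLE  -> HAMMER_FST_SETUP   hammer_flush_inode_copysync() starts
 *	HAMMER_FST_SETUP -> HAMMER_FST_FLUSH   state snapshotted, new BIOs diverted
 *	HAMMER_FST_FLUSH -> HAMMER_FST_IDLE    hammer_flush_inode_done()
 *
 * When a flush is already in progress HAMMER_INODE_REFLUSH is set instead
 * of re-entering the state machine.
 */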
720
721/*
722 * Helper routine to copy the frontend synchronization state to the backend.
723 * This routine may be called by either the frontend or the backend.
724 */
725static void
726hammer_flush_inode_copysync(hammer_inode_t ip)
727{
728 int error;
729 int count;
730
731 /*
732 * Prevent anyone else from trying to do the same thing.
733 */
734 ip->flush_state = HAMMER_FST_SETUP;
735
736 /*
737 * Sync the buffer cache. This will queue the BIOs. If called
738 * from the context of the flusher the BIOs are thrown into bio_list
739 * regardless of ip->flush_state.
740 */
741 if (ip->vp != NULL)
742 error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
743 else
744 error = 0;
745
746 /*
747 * This freezes strategy writes; any further BIOs will be queued
748 * to bio_alt_list (unless we are the flusher itself, see above).
749 */
750 ip->flush_state = HAMMER_FST_FLUSH;
751
752 /*
753 * Snapshot the state of the inode for the backend flusher.
754 *
755 * The truncation must be retained in the frontend until after
756 * we've actually performed the record deletion.
757 */
758 ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
759 ip->sync_trunc_off = ip->trunc_off;
760 ip->sync_ino_rec = ip->ino_rec;
761 ip->sync_ino_data = ip->ino_data;
762 ip->flags &= ~HAMMER_INODE_MODMASK |
763 HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS;
764
765 /*
766 * Fix up the dirty buffer status.
767 */
768 if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL)
769 ip->flags &= ~HAMMER_INODE_BUFS;
770 if (TAILQ_FIRST(&ip->bio_list))
771 ip->sync_flags |= HAMMER_INODE_BUFS;
772 else
773 ip->sync_flags &= ~HAMMER_INODE_BUFS;
774
775 /*
776 * Set the state for the inode's in-memory records. If some records
777 * could not be marked for backend flush (i.e. deleted records),
778 * re-set the XDIRTY flag.
779 */
780 count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
781 hammer_mark_record_callback, NULL);
782 if (count)
783 ip->flags |= HAMMER_INODE_XDIRTY;
784}
785
786/*
787 * Mark records for backend flush, accumulate a count of the number of
788 * records which could not be marked.
789 */
790static int
791hammer_mark_record_callback(hammer_record_t rec, void *data)
792{
793 if (rec->state == HAMMER_FST_FLUSH) {
794 return(0);
795 } else if ((rec->flags & HAMMER_RECF_DELETED_FE) == 0) {
796 rec->state = HAMMER_FST_FLUSH;
797 hammer_ref(&rec->lock);
798 return(0);
799 } else {
800 return(1);
801 }
802}
803
804
805
806/*
807 * Wait for a previously queued flush to complete
808 */
809void
810hammer_wait_inode(hammer_inode_t ip)
811{
812 while (ip->flush_state == HAMMER_FST_FLUSH) {
813 ip->flags |= HAMMER_INODE_FLUSHW;
814 tsleep(&ip->flags, 0, "hmrwin", 0);
815 }
816}
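
/*
 * A synchronous flush, e.g. for an fsync-style operation, can be built
 * from the two routines above (illustrative sketch only):
 *
 *	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 *	hammer_wait_inode(ip);
 */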
817
818/*
819 * Called by the backend code when a flush has been completed.
820 * The inode has already been removed from the flush list.
821 *
822 * A pipelined flush can occur, in which case we must re-enter the
823 * inode on the list and re-copy its fields.
824 */
825void
826hammer_flush_inode_done(hammer_inode_t ip)
827{
828 struct bio *bio;
829
830 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
831
832 if (ip->sync_flags)
833 kprintf("ip %p leftover sync_flags %08x\n", ip, ip->sync_flags);
834 ip->flags |= ip->sync_flags;
835 ip->flush_state = HAMMER_FST_IDLE;
836
837 /*
838 * Reflush any BIOs that wound up in the alt list. Our inode will
839 * also wind up at the end of the flusher's list.
840 */
841 while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
842 TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
843 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
844 ip->flags |= HAMMER_INODE_XDIRTY;
845 ip->flags |= HAMMER_INODE_REFLUSH;
846 kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip, bio->bio_offset, bio->bio_buf->b_bufsize);
847 }
848
849 /*
850 * If the frontend made more changes and requested another flush,
851 * do it.
852 */
853 if (ip->flags & HAMMER_INODE_REFLUSH) {
854 ip->flags &= ~HAMMER_INODE_REFLUSH;
855 hammer_flush_inode(ip, 0);
856 } else {
857 if (ip->flags & HAMMER_INODE_FLUSHW) {
858 ip->flags &= ~HAMMER_INODE_FLUSHW;
859 wakeup(&ip->flags);
860 }
861 }
862 hammer_rel_inode(ip, 0);
863}
864
865/*
866 * Called from hammer_sync_inode() to synchronize in-memory records
867 * to the media.
868 */
869static int
870hammer_sync_record_callback(hammer_record_t record, void *data)
871{
872 hammer_transaction_t trans = data;
873 int error;
874
875 /*
876 * Skip records that do not belong to the current flush. Records
877 * belonging to the flush will have been referenced for us.
878 *
879 * Skip records that were deleted by the backend itself. Records
880 * deleted by the frontend after their state has changed to FLUSH
881 * are not considered to be deleted by the backend.
882 *
883 * XXX special delete-on-disk records can be deleted by the backend
884 * prior to the sync due to a truncation operation. This is kinda
885 * a hack to deal with it.
886 */
887 if (record->state != HAMMER_FST_FLUSH)
888 return(0);
889 if (record->flags & HAMMER_RECF_DELETED_BE) {
890 hammer_flush_record_done(record);
891 return(0);
892 }
893
894 /*
895 * Assign the create_tid for new records. Deletions already
896 * have the record's entire key properly set up.
897 */
898 if ((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0)
899 record->rec.inode.base.base.create_tid = trans->tid;
900 error = hammer_ip_sync_record(trans, record);
901
902 if (error) {
903 error = -error;
904 if (error != -ENOSPC) {
905 kprintf("hammer_sync_record_callback: sync failed rec "
906 "%p, error %d\n", record, error);
907 Debugger("sync failed rec");
908 }
909 }
910 hammer_flush_record_done(record);
911 return(error);
912}
913
914/*
915 * XXX error handling
916 */
917int
918hammer_sync_inode(hammer_inode_t ip, int handle_delete)
919{
920 struct hammer_transaction trans;
921 struct bio *bio;
922 hammer_depend_t depend;
923 int error, tmp_error;
924
925 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0 &&
926 handle_delete == 0) {
927 return(0);
928 }
929
930
931 hammer_lock_ex(&ip->lock);
932
933 hammer_start_transaction_fls(&trans, ip->hmp);
934
935 /*
936 * Any (directory) records this inode depends on must also be
937 * synchronized. The directory itself only needs to be flushed
938 * if its inode is not already on-disk.
939 */
940 while ((depend = TAILQ_FIRST(&ip->depend_list)) != NULL) {
941 hammer_record_t record;
942
943 record = depend->record;
944 TAILQ_REMOVE(&depend->record->depend_list, depend, rec_entry);
945 TAILQ_REMOVE(&ip->depend_list, depend, ip_entry);
946 --ip->depend_count;
947 if (record->state != HAMMER_FST_FLUSH) {
948 record->state = HAMMER_FST_FLUSH;
949 /* add ref (steal ref from dependency) */
950 } else {
951 /* remove ref related to dependency */
952 /* record still has at least one ref from state */
953 hammer_unref(&record->lock);
954 KKASSERT(record->lock.refs > 0);
955 }
956 if (record->ip->flags & HAMMER_INODE_ONDISK) {
957 kprintf("I");
958 hammer_sync_record_callback(record, &trans);
959 } else {
960 kprintf("J");
961 hammer_flush_inode(record->ip, 0);
962 }
963 hammer_unref(&ip->lock);
964 KKASSERT(ip->lock.refs > 0);
965 kfree(depend, M_HAMMER);
966 }
967
968
969 /*
970 * Sync inode deletions and truncations.
971 */
972 if (ip->sync_ino_rec.ino_nlinks == 0 && handle_delete &&
973 (ip->flags & HAMMER_INODE_GONE) == 0) {
974 /*
975 * Handle the case where the inode has been completely deleted
976 * and is no longer referenceable from the filesystem
977 * namespace.
978 *
979 * NOTE: We do not set the RDIRTY flag when updating the
980 * delete_tid, setting HAMMER_INODE_DELETED takes care of it.
981 */
982
983 ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED;
984 ip->flags &= ~HAMMER_INODE_TRUNCATED;
985 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
986 if (ip->vp)
987 vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
988 error = hammer_ip_delete_range_all(&trans, ip);
989 if (error)
990 Debugger("hammer_ip_delete_range_all errored");
991
992 /*
993 * Sanity check. The only records that remain should be
994 * marked for back-end deletion.
995 */
996 {
997 hammer_record_t rec;
998
999 RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) {
1000 KKASSERT(rec->flags & HAMMER_RECF_DELETED_BE);
1001 }
1002 }
1003
1004 /*
1005 * Set delete_tid in both the frontend and backend
1006 * copy of the inode record.
1007 */
1008 ip->ino_rec.base.base.delete_tid = trans.tid;
1009 ip->sync_ino_rec.base.base.delete_tid = trans.tid;
1010
1011 /*
1012 * Indicate that the inode has/is-being deleted.
1013 */
1014 ip->flags |= HAMMER_INODE_DELETED;
1015 hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY);
1016 hammer_modify_volume(&trans, trans.rootvol, NULL, 0);
1017 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1018 hammer_modify_volume_done(trans.rootvol);
1019 } else if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1020 /*
1021 * Interlock trunc_off. The VOP front-end may continue to
1022 * make adjustments to it while we are blocked.
1023 */
1024 off_t trunc_off;
1025 off_t aligned_trunc_off;
1026
1027 trunc_off = ip->sync_trunc_off;
1028 aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
1029 ~HAMMER_BUFMASK64;
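		/*
		 * Worked example (assuming the usual 16K HAMMER_BUFSIZE):
		 * a truncation to offset 10000 yields an aligned_trunc_off
		 * of 16384.  The partial buffer below that point was already
		 * handled by the front-end; only whole buffers at and beyond
		 * aligned_trunc_off are deleted below.
		 */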
1030
1031 /*
1032 * Delete any whole blocks on-media. The front-end has
1033 * already cleaned out any partial block and made it
1034 * pending. The front-end may have updated trunc_off
1035 * while we were blocked so do not just unconditionally
1036 * set it to the maximum offset.
1037 */
1038 kprintf("sync truncation range @ %016llx\n", aligned_trunc_off);
1039 error = hammer_ip_delete_range(&trans, ip,
1040 aligned_trunc_off,
1041 0x7FFFFFFFFFFFFFFFLL);
1042 if (error)
1043 Debugger("hammer_ip_delete_range errored");
1044 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1045 if (ip->trunc_off >= trunc_off) {
1046 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1047 ip->flags &= ~HAMMER_INODE_TRUNCATED;
1048 }
1049 }
1050
1051 error = 0; /* XXX vfsync used to be here */
1052
1053 /*
1054 * Flush any queued BIOs.
1055 */
1056 while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
1057 TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
1058#if 0
1059 kprintf("dowrite %016llx ip %p bio %p @ %016llx\n", trans.tid, ip, bio, bio->bio_offset);
1060#endif
1061 tmp_error = hammer_dowrite(&trans, ip, bio);
1062 if (tmp_error)
1063 error = tmp_error;
1064 }
1065 ip->sync_flags &= ~HAMMER_INODE_BUFS;
1066
1067 /*
1068 * Now sync related records.
1069 */
1070 for (;;) {
1071 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1072 hammer_sync_record_callback, &trans);
1073 KKASSERT(tmp_error <= 0);
1074 if (tmp_error < 0)
1075 tmp_error = -tmp_error;
1076 if (tmp_error)
1077 error = tmp_error;
1078 break;
1079 }
1080
1081 /*
1082 * XDIRTY represents rec_tree and bio_list. However, rec_tree may
1083 * contain new front-end records so short of scanning it we can't
1084 * just test whether it is empty or not.
1085 *
1086 * If no error occurred assume we succeeded.
1087 */
1088 if (error == 0)
1089 ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
1090
1091 if (error)
1092 Debugger("RB_SCAN errored");
1093
1094 /*
1095 * Now update the inode's on-disk inode-data and/or on-disk record.
1096 * DELETED and ONDISK are managed only in ip->flags.
1097 */
1098 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
1099 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
1100 /*
1101 * If deleted and on-disk, don't set any additional flags.
1102 * The delete flag takes care of things.
1103 */
1104 break;
1105 case HAMMER_INODE_DELETED:
1106 /*
1107 * Take care of the case where a deleted inode was never
1108 * flushed to the disk in the first place.
1109 */
1110 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
1111 HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES);
1112 while (RB_ROOT(&ip->rec_tree)) {
1113 hammer_record_t rec = RB_ROOT(&ip->rec_tree);
1114 hammer_ref(&rec->lock);
1115 KKASSERT(rec->lock.refs == 1);
1116 hammer_delete_mem_record(rec);
1117 rec->flags |= HAMMER_RECF_DELETED_BE;
1118 hammer_rel_mem_record(rec);
1119 }
1120 break;
1121 case HAMMER_INODE_ONDISK:
1122 /*
1123 * If already on-disk, do not set any additional flags.
1124 */
1125 break;
1126 default:
1127 /*
1128 * If not on-disk and not deleted, set both dirty flags
1129 * to force an initial record to be written. Also set
1130 * the create_tid for the inode.
1131 *
1132 * Set create_tid in both the frontend and backend
1133 * copy of the inode record.
1134 */
1135 ip->ino_rec.base.base.create_tid = trans.tid;
1136 ip->sync_ino_rec.base.base.create_tid = trans.tid;
1137 ip->sync_flags |= HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY;
1138 break;
1139 }
1140
1141 /*
1142 * If RDIRTY or DDIRTY is set, write out a new record. If the inode
1143 * is already on-disk the old record is marked as deleted.
1144 *
1145 * If DELETED is set hammer_update_inode() will delete the existing
1146 * record without writing out a new one.
1147 *
1148 * If *ONLY* the ITIMES flag is set we can update the record in-place.
1149 */
1150 if (ip->flags & HAMMER_INODE_DELETED) {
1151 error = hammer_update_inode(&trans, ip);
1152 } else
1153 if ((ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
1154 HAMMER_INODE_ITIMES)) == HAMMER_INODE_ITIMES) {
1155 error = hammer_update_itimes(&trans, ip);
1156 } else
1157 if (ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
1158 HAMMER_INODE_ITIMES)) {
1159 error = hammer_update_inode(&trans, ip);
1160 }
1161 if (error)
1162 Debugger("hammer_update_itimes/inode errored");
1163
1164 /*
1165 * Save the TID we used to sync the inode with to make sure we
1166 * do not improperly reuse it.
1167 */
1168 hammer_unlock(&ip->lock);
1169 hammer_done_transaction(&trans);
1170 return(error);
1171}
1172