gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.42 2008/08/06 15:38:58 dillon Exp $
	35	*/
	36
	37	/*
	38	* HAMMER B-Tree index - cursor support routines
	39	*/
	40	#include "hammer.h"
	41
	42	static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);
	43
	44	/*
	45	* Initialize a fresh cursor using the B-Tree node cache. If the cache
	46	* is not available initialize a fresh cursor at the root of the filesystem.
	47	*/
	48	int
	49	hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
	50	hammer_node_cache_t cache, hammer_inode_t ip)
	51	{
	52	hammer_volume_t volume;
	53	hammer_node_t node;
	54	int error;
	55
	56	bzero(cursor, sizeof(*cursor));
	57
	58	cursor->trans = trans;
	59
	60	/*
	61	* If the cursor operation is on behalf of an inode, lock
	62	* the inode.
	63	*/
	64	if ((cursor->ip = ip) != NULL) {
	65	++ip->cursor_ip_refs;
	66	if (trans->type == HAMMER_TRANS_FLS)
	67	hammer_lock_ex(&ip->lock);
	68	else
	69	hammer_lock_sh(&ip->lock);
	70	}
	71
	72	/*
	73	* Step 1 - acquire a locked node from the cache if possible
	74	*/
	75	if (cache && cache->node) {
	76	node = hammer_ref_node_safe(trans, cache, &error);
	77	if (error == 0) {
	78	hammer_lock_sh(&node->lock);
	79	if (node->flags & HAMMER_NODE_DELETED) {
	80	hammer_unlock(&node->lock);
	81	hammer_rel_node(node);
	82	node = NULL;
	83	}
	84	}
	85	if (node == NULL)
	86	++hammer_stats_btree_root_iterations;
	87	} else {
	88	node = NULL;
	89	++hammer_stats_btree_root_iterations;
	90	}
	91
	92	/*
	93	* Step 2 - If we couldn't get a node from the cache, get
	94	* the one from the root of the filesystem.
	95	*/
	96	while (node == NULL) {
	97	volume = hammer_get_root_volume(trans->hmp, &error);
	98	if (error)
	99	break;
	100	node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
	101	0, &error);
	102	hammer_rel_volume(volume, 0);
	103	if (error)
	104	break;
	105	hammer_lock_sh(&node->lock);
	106
	107	/*
	108	* If someone got in before we could lock the node, retry.
	109	*/
	110	if (node->flags & HAMMER_NODE_DELETED) {
	111	hammer_unlock(&node->lock);
	112	hammer_rel_node(node);
	113	node = NULL;
	114	continue;
	115	}
	116	if (volume->ondisk->vol0_btree_root != node->node_offset) {
	117	hammer_unlock(&node->lock);
	118	hammer_rel_node(node);
	119	node = NULL;
	120	continue;
	121	}
	122	}
	123
	124	/*
	125	* Step 3 - finish initializing the cursor by acquiring the parent
	126	*/
	127	cursor->node = node;
	128	if (error == 0)
	129	error = hammer_load_cursor_parent(cursor, 0);
	130	KKASSERT(error == 0);
	131	/* if (error) hammer_done_cursor(cursor); */
	132	return(error);
	133	}
	134
	135	/*
	136	* Normalize a cursor. Sometimes cursors can be left in a state
	137	* where node is NULL. If the cursor is in this state, cursor up.
	138	*/
	139	void
	140	hammer_normalize_cursor(hammer_cursor_t cursor)
	141	{
	142	if (cursor->node == NULL) {
	143	KKASSERT(cursor->parent != NULL);
	144	hammer_cursor_up(cursor);
	145	}
	146	}
	147
	148
	149	/*
	150	* We are finished with a cursor. We NULL out various fields as sanity
	151	* check, in case the structure is inappropriately used afterwords.
	152	*/
	153	void
	154	hammer_done_cursor(hammer_cursor_t cursor)
	155	{
	156	hammer_inode_t ip;
	157
	158	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	159	if (cursor->parent) {
	160	hammer_unlock(&cursor->parent->lock);
	161	hammer_rel_node(cursor->parent);
	162	cursor->parent = NULL;
	163	}
	164	if (cursor->node) {
	165	hammer_unlock(&cursor->node->lock);
	166	hammer_rel_node(cursor->node);
	167	cursor->node = NULL;
	168	}
	169	if (cursor->data_buffer) {
	170	hammer_rel_buffer(cursor->data_buffer, 0);
	171	cursor->data_buffer = NULL;
	172	}
	173	if ((ip = cursor->ip) != NULL) {
	174	KKASSERT(ip->cursor_ip_refs > 0);
	175	--ip->cursor_ip_refs;
	176	hammer_unlock(&ip->lock);
	177	cursor->ip = NULL;
	178	}
	179	if (cursor->iprec) {
	180	hammer_rel_mem_record(cursor->iprec);
	181	cursor->iprec = NULL;
	182	}
	183
	184	/*
	185	* If we deadlocked this node will be referenced. Do a quick
	186	* lock/unlock to wait for the deadlock condition to clear.
	187	*/
	188	if (cursor->deadlk_node) {
	189	hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
	190	hammer_unlock(&cursor->deadlk_node->lock);
	191	hammer_rel_node(cursor->deadlk_node);
	192	cursor->deadlk_node = NULL;
	193	}
	194	if (cursor->deadlk_rec) {
	195	hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
	196	hammer_rel_mem_record(cursor->deadlk_rec);
	197	cursor->deadlk_rec = NULL;
	198	}
	199
	200	cursor->data = NULL;
	201	cursor->leaf = NULL;
	202	cursor->left_bound = NULL;
	203	cursor->right_bound = NULL;
	204	cursor->trans = NULL;
	205	}
	206
	207	/*
	208	* Upgrade cursor->node and cursor->parent to exclusive locks. This
	209	* function can return EDEADLK.
	210	*
	211	* The lock must already be either held shared or already held exclusively
	212	* by us.
	213	*
	214	* If we fail to upgrade the lock and cursor->deadlk_node is NULL,
	215	* we add another reference to the node that failed and set
	216	* cursor->deadlk_node so hammer_done_cursor() can block on it.
	217	*/
	218	int
	219	hammer_cursor_upgrade(hammer_cursor_t cursor)
	220	{
	221	int error;
	222
	223	error = hammer_lock_upgrade(&cursor->node->lock);
	224	if (error && cursor->deadlk_node == NULL) {
	225	cursor->deadlk_node = cursor->node;
	226	hammer_ref_node(cursor->deadlk_node);
	227	} else if (error == 0 && cursor->parent) {
	228	error = hammer_lock_upgrade(&cursor->parent->lock);
	229	if (error && cursor->deadlk_node == NULL) {
	230	cursor->deadlk_node = cursor->parent;
	231	hammer_ref_node(cursor->deadlk_node);
	232	}
	233	}
	234	return(error);
	235	}
	236
	237	int
	238	hammer_cursor_upgrade_node(hammer_cursor_t cursor)
	239	{
	240	int error;
	241
	242	error = hammer_lock_upgrade(&cursor->node->lock);
	243	if (error && cursor->deadlk_node == NULL) {
	244	cursor->deadlk_node = cursor->node;
	245	hammer_ref_node(cursor->deadlk_node);
	246	}
	247	return(error);
	248	}
	249
	250	/*
	251	* Downgrade cursor->node and cursor->parent to shared locks. This
	252	* function can return EDEADLK.
	253	*/
	254	void
	255	hammer_cursor_downgrade(hammer_cursor_t cursor)
	256	{
	257	if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
	258	hammer_lock_downgrade(&cursor->node->lock);
	259	if (cursor->parent &&
	260	hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
	261	hammer_lock_downgrade(&cursor->parent->lock);
	262	}
	263	}
	264
	265	/*
	266	* Seek the cursor to the specified node and index.
	267	*
	268	* The caller must ref the node prior to calling this routine and release
	269	* it after it returns. If the seek succeeds the cursor will gain its own
	270	* ref on the node.
	271	*/
	272	int
	273	hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
	274	{
	275	int error;
	276
	277	hammer_cursor_downgrade(cursor);
	278	error = 0;
	279
	280	if (cursor->node != node) {
	281	hammer_unlock(&cursor->node->lock);
	282	hammer_rel_node(cursor->node);
	283	cursor->node = node;
	284	hammer_ref_node(node);
	285	hammer_lock_sh(&node->lock);
	286	KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
	287
	288	if (cursor->parent) {
	289	hammer_unlock(&cursor->parent->lock);
	290	hammer_rel_node(cursor->parent);
	291	cursor->parent = NULL;
	292	cursor->parent_index = 0;
	293	}
	294	error = hammer_load_cursor_parent(cursor, 0);
	295	}
	296	cursor->index = index;
	297	return (error);
	298	}
	299
	300	/*
	301	* Load the parent of cursor->node into cursor->parent.
	302	*/
	303	static
	304	int
	305	hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
	306	{
	307	hammer_mount_t hmp;
	308	hammer_node_t parent;
	309	hammer_node_t node;
	310	hammer_btree_elm_t elm;
	311	int error;
	312	int parent_index;
	313
	314	hmp = cursor->trans->hmp;
	315
	316	if (cursor->node->ondisk->parent) {
	317	node = cursor->node;
	318	parent = hammer_btree_get_parent(cursor->trans, node,
	319	&parent_index,
	320	&error, try_exclusive);
	321	if (error == 0) {
	322	elm = &parent->ondisk->elms[parent_index];
	323	cursor->parent = parent;
	324	cursor->parent_index = parent_index;
	325	cursor->left_bound = &elm[0].internal.base;
	326	cursor->right_bound = &elm[1].internal.base;
	327	}
	328	} else {
	329	cursor->parent = NULL;
	330	cursor->parent_index = 0;
	331	cursor->left_bound = &hmp->root_btree_beg;
	332	cursor->right_bound = &hmp->root_btree_end;
	333	error = 0;
	334	}
	335	return(error);
	336	}
	337
	338	/*
	339	* Cursor up to our parent node. Return ENOENT if we are at the root of
	340	* the filesystem.
	341	*/
	342	int
	343	hammer_cursor_up(hammer_cursor_t cursor)
	344	{
	345	int error;
	346
	347	hammer_cursor_downgrade(cursor);
	348
	349	/*
	350	* If the parent is NULL we are at the root of the B-Tree and
	351	* return ENOENT.
	352	*/
	353	if (cursor->parent == NULL)
	354	return (ENOENT);
	355
	356	/*
	357	* Set the node to its parent.
	358	*/
	359	hammer_unlock(&cursor->node->lock);
	360	hammer_rel_node(cursor->node);
	361	cursor->node = cursor->parent;
	362	cursor->index = cursor->parent_index;
	363	cursor->parent = NULL;
	364	cursor->parent_index = 0;
	365
	366	error = hammer_load_cursor_parent(cursor, 0);
	367	return(error);
	368	}
	369
	370	/*
	371	* Special cursor up given a locked cursor. The orignal node is not
	372	* unlocked or released and the cursor is not downgraded.
	373	*
	374	* This function can fail with EDEADLK.
	375	*
	376	* This function is only run when recursively deleting parent nodes
	377	* to get rid of an empty leaf.
	378	*/
	379	int
	380	hammer_cursor_up_locked(hammer_cursor_t cursor)
	381	{
	382	hammer_node_t save;
	383	int error;
	384	int save_index;
	385
	386	/*
	387	* If the parent is NULL we are at the root of the B-Tree and
	388	* return ENOENT.
	389	*/
	390	if (cursor->parent == NULL)
	391	return (ENOENT);
	392
	393	save = cursor->node;
	394	save_index = cursor->index;
	395
	396	/*
	397	* Set the node to its parent.
	398	*/
	399	cursor->node = cursor->parent;
	400	cursor->index = cursor->parent_index;
	401	cursor->parent = NULL;
	402	cursor->parent_index = 0;
	403
	404	/*
	405	* load the new parent, attempt to exclusively lock it. Note that
	406	* we are still holding the old parent (now cursor->node) exclusively
	407	* locked.
	408	*
	409	* This can return EDEADLK. Undo the operation on any error. These
	410	* up sequences can occur during iterations so be sure to restore
	411	* the index.
	412	*/
	413	error = hammer_load_cursor_parent(cursor, 1);
	414	if (error) {
	415	cursor->parent = cursor->node;
	416	cursor->parent_index = cursor->index;
	417	cursor->node = save;
	418	cursor->index = save_index;
	419	}
	420	return(error);
	421	}
	422
	423
	424	/*
	425	* Cursor down through the current node, which must be an internal node.
	426	*
	427	* This routine adjusts the cursor and sets index to 0.
	428	*/
	429	int
	430	hammer_cursor_down(hammer_cursor_t cursor)
	431	{
	432	hammer_node_t node;
	433	hammer_btree_elm_t elm;
	434	int error;
	435
	436	/*
	437	* The current node becomes the current parent
	438	*/
	439	hammer_cursor_downgrade(cursor);
	440	node = cursor->node;
	441	KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
	442	if (cursor->parent) {
	443	hammer_unlock(&cursor->parent->lock);
	444	hammer_rel_node(cursor->parent);
	445	}
	446	cursor->parent = node;
	447	cursor->parent_index = cursor->index;
	448	cursor->node = NULL;
	449	cursor->index = 0;
	450
	451	/*
	452	* Extract element to push into at (node,index), set bounds.
	453	*/
	454	elm = &node->ondisk->elms[cursor->parent_index];
	455
	456	/*
	457	* Ok, push down into elm. If elm specifies an internal or leaf
	458	* node the current node must be an internal node. If elm specifies
	459	* a spike then the current node must be a leaf node.
	460	*/
	461	switch(elm->base.btype) {
	462	case HAMMER_BTREE_TYPE_INTERNAL:
	463	case HAMMER_BTREE_TYPE_LEAF:
	464	KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
	465	KKASSERT(elm->internal.subtree_offset != 0);
	466	cursor->left_bound = &elm[0].internal.base;
	467	cursor->right_bound = &elm[1].internal.base;
	468	node = hammer_get_node(cursor->trans,
	469	elm->internal.subtree_offset, 0, &error);
	470	if (error == 0) {
	471	KASSERT(elm->base.btype == node->ondisk->type, ("BTYPE MISMATCH %c %c NODE %p\n", elm->base.btype, node->ondisk->type, node));
	472	if (node->ondisk->parent != cursor->parent->node_offset)
	473	panic("node %p %016llx vs %016llx\n", node, (long long)node->ondisk->parent, (long long)cursor->parent->node_offset);
	474	KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
	475	}
	476	break;
	477	default:
	478	panic("hammer_cursor_down: illegal btype %02x (%c)\n",
	479	elm->base.btype,
	480	(elm->base.btype ? elm->base.btype : '?'));
	481	break;
	482	}
	483	if (error == 0) {
	484	hammer_lock_sh(&node->lock);
	485	KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
	486	cursor->node = node;
	487	cursor->index = 0;
	488	}
	489	return(error);
	490	}
	491
	492	/************************************************************************
	493	* DEADLOCK RECOVERY *
	494	************************************************************************
	495	*
	496	* These are the new deadlock recovery functions. Currently they are only
	497	* used for the mirror propagation and physical node removal cases but
	498	* ultimately the intention is to use them for all deadlock recovery
	499	* operations.
	500	*
	501	* WARNING! The contents of the cursor may be modified while unlocked.
	502	* Passive modifications include adjusting the node, parent,
	503	* indexes, and leaf pointer.
	504	*
	505	* An outright removal of the element the cursor was pointing at
	506	* will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
	507	* which chains to causing the HAMMER_CURSOR_RETEST to be set
	508	* when the cursor is locked again.
	509	*/
	510	void
	511	hammer_unlock_cursor(hammer_cursor_t cursor)
	512	{
	513	hammer_node_t node;
	514
	515	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	516	KKASSERT(cursor->node);
	517
	518	/*
	519	* Release the cursor's locks and track B-Tree operations on node.
	520	* While being tracked our cursor can be modified by other threads
	521	* and the node may be replaced.
	522	*/
	523	if (cursor->parent) {
	524	hammer_unlock(&cursor->parent->lock);
	525	hammer_rel_node(cursor->parent);
	526	cursor->parent = NULL;
	527	}
	528	node = cursor->node;
	529	cursor->flags \|= HAMMER_CURSOR_TRACKED;
	530	TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
	531	hammer_unlock(&node->lock);
	532	}
	533
	534	/*
	535	* Get the cursor heated up again. The cursor's node may have
	536	* changed and we might have to locate the new parent.
	537	*
	538	* If the exact element we were on got deleted RIPOUT will be
	539	* set and we must clear ATEDISK so an iteration does not skip
	540	* the element after it.
	541	*/
	542	int
	543	hammer_lock_cursor(hammer_cursor_t cursor)
	544	{
	545	hammer_node_t node;
	546	int error;
	547
	548	KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);
	549
	550	/*
	551	* Relock the node
	552	*/
	553	for (;;) {
	554	node = cursor->node;
	555	hammer_ref_node(node);
	556	hammer_lock_sh(&node->lock);
	557	if (cursor->node == node) {
	558	hammer_rel_node(node);
	559	break;
	560	}
	561	hammer_unlock(&node->lock);
	562	hammer_rel_node(node);
	563	}
	564
	565	/*
	566	* Untrack the cursor, clean up, and re-establish the parent node.
	567	*/
	568	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	569	cursor->flags &= ~HAMMER_CURSOR_TRACKED;
	570
	571	/*
	572	* If a ripout has occured iterations must re-test the (new)
	573	* current element. Clearing ATEDISK prevents the element from
	574	* being skipped and RETEST causes it to be re-tested.
	575	*/
	576	if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
	577	cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
	578	cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
	579	cursor->flags \|= HAMMER_CURSOR_RETEST;
	580	}
	581	error = hammer_load_cursor_parent(cursor, 0);
	582	return(error);
	583	}
	584
	585	/*
	586	* Recover from a deadlocked cursor, tracking any node removals or
	587	* replacements. If the cursor's current node is removed by another
	588	* thread (via btree_remove()) the cursor will be seeked upwards.
	589	*
	590	* The caller is working a modifying operation and must be holding the
	591	* sync lock (shared). We do not release the sync lock because this
	592	* would break atomicy.
	593	*/
	594	int
	595	hammer_recover_cursor(hammer_cursor_t cursor)
	596	{
	597	int error;
	598
	599	hammer_unlock_cursor(cursor);
	600	KKASSERT(cursor->trans->sync_lock_refs > 0);
	601
	602	/*
	603	* Wait for the deadlock to clear
	604	*/
	605	if (cursor->deadlk_node) {
	606	hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
	607	hammer_unlock(&cursor->deadlk_node->lock);
	608	hammer_rel_node(cursor->deadlk_node);
	609	cursor->deadlk_node = NULL;
	610	}
	611	if (cursor->deadlk_rec) {
	612	hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
	613	hammer_rel_mem_record(cursor->deadlk_rec);
	614	cursor->deadlk_rec = NULL;
	615	}
	616	error = hammer_lock_cursor(cursor);
	617	return(error);
	618	}
	619
	620	/*
	621	* Dup ocursor to ncursor. ncursor inherits ocursor's locks and ocursor
	622	* is effectively unlocked and becomes tracked. If ocursor was not locked
	623	* then ncursor also inherits the tracking.
	624	*
	625	* After the caller finishes working with ncursor it must be cleaned up
	626	* with hammer_done_cursor(), and the caller must re-lock ocursor.
	627	*/
	628	hammer_cursor_t
	629	hammer_push_cursor(hammer_cursor_t ocursor)
	630	{
	631	hammer_cursor_t ncursor;
	632	hammer_inode_t ip;
	633	hammer_node_t node;
	634	hammer_mount_t hmp;
	635
	636	hmp = ocursor->trans->hmp;
	637	ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK \| M_ZERO);
	638	bcopy(ocursor, ncursor, sizeof(*ocursor));
	639
	640	node = ocursor->node;
	641	hammer_ref_node(node);
	642	if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
	643	ocursor->flags \|= HAMMER_CURSOR_TRACKED;
	644	TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
	645	}
	646	if (ncursor->parent)
	647	ocursor->parent = NULL;
	648	ocursor->data_buffer = NULL;
	649	ocursor->leaf = NULL;
	650	ocursor->data = NULL;
	651	if (ncursor->flags & HAMMER_CURSOR_TRACKED)
	652	TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
	653	if ((ip = ncursor->ip) != NULL) {
	654	++ip->cursor_ip_refs;
	655	}
	656	if (ncursor->iprec)
	657	hammer_ref(&ncursor->iprec->lock);
	658	return(ncursor);
	659	}
	660
	661	/*
	662	* Destroy ncursor and restore ocursor
	663	*
	664	* This is a temporary hack for the release. We can't afford to lose
	665	* the IP lock until the IP object scan code is able to deal with it,
	666	* so have ocursor inherit it back.
	667	*/
	668	void
	669	hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
	670	{
	671	hammer_mount_t hmp;
	672	hammer_inode_t ip;
	673
	674	hmp = ncursor->trans->hmp;
	675	ip = ncursor->ip;
	676	ncursor->ip = NULL;
	677	if (ip)
	678	--ip->cursor_ip_refs;
	679	hammer_done_cursor(ncursor);
	680	kfree(ncursor, hmp->m_misc);
	681	KKASSERT(ocursor->ip == ip);
	682	hammer_lock_cursor(ocursor);
	683	}
	684
	685	/*
	686	* onode is being replaced by nnode by the reblocking code.
	687	*/
	688	void
	689	hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
	690	{
	691	hammer_cursor_t cursor;
	692	hammer_node_ondisk_t ondisk;
	693	hammer_node_ondisk_t nndisk;
	694
	695	ondisk = onode->ondisk;
	696	nndisk = nnode->ondisk;
	697
	698	while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
	699	TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
	700	TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
	701	KKASSERT(cursor->node == onode);
	702	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	703	cursor->leaf = &nndisk->elms[cursor->index].leaf;
	704	cursor->node = nnode;
	705	hammer_ref_node(nnode);
	706	hammer_rel_node(onode);
	707	}
	708	}
	709
	710	/*
	711	* We have removed <node> from the parent and collapsed the parent.
	712	*
	713	* Cursors in deadlock recovery are seeked upward to the parent so the
	714	* btree_remove() recursion works properly even though we have marked
	715	* the cursor as requiring a reseek.
	716	*
	717	* This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
	718	* meaning the cursor is no longer definitively pointing at an element
	719	* within its iteration (if the cursor is being used to iterate). The
	720	* iteration code will take this into account instead of asserting if the
	721	* cursor is outside the iteration range.
	722	*/
	723	void
	724	hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
	725	{
	726	hammer_cursor_t cursor;
	727	hammer_node_ondisk_t ondisk;
	728
	729	KKASSERT(parent != NULL);
	730	ondisk = node->ondisk;
	731
	732	while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
	733	KKASSERT(cursor->node == node);
	734	KKASSERT(cursor->index == 0);
	735	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	736	TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
	737	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	738	cursor->leaf = NULL;
	739	cursor->flags \|= HAMMER_CURSOR_TRACKED_RIPOUT;
	740	cursor->flags \|= HAMMER_CURSOR_ITERATE_CHECK;
	741	cursor->node = parent;
	742	cursor->index = index;
	743	hammer_ref_node(parent);
	744	hammer_rel_node(node);
	745	}
	746	}
	747
	748	/*
	749	* node was split at (onode, index) with elements >= index moved to nnode.
	750	*/
	751	void
	752	hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
	753	{
	754	hammer_cursor_t cursor;
	755	hammer_node_ondisk_t ondisk;
	756	hammer_node_ondisk_t nndisk;
	757
	758	ondisk = onode->ondisk;
	759	nndisk = nnode->ondisk;
	760
	761	again:
	762	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
	763	KKASSERT(cursor->node == onode);
	764	if (cursor->index < index)
	765	continue;
	766	TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
	767	TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
	768	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	769	cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
	770	cursor->node = nnode;
	771	cursor->index -= index;
	772	hammer_ref_node(nnode);
	773	hammer_rel_node(onode);
	774	goto again;
	775	}
	776	}
	777
	778	/*
	779	* An element was moved from one node to another or within a node. The
	780	* index may also represent the end of the node (index == numelements).
	781	*
	782	* {oparent,pindex} is the parent node's pointer to onode/oindex.
	783	*
	784	* This is used by the rebalancing code. This is not an insertion or
	785	* deletion and any additional elements, including the degenerate case at
	786	* the end of the node, will be dealt with by additional distinct calls.
	787	*/
	788	void
	789	hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
	790	hammer_node_t onode, int oindex,
	791	hammer_node_t nnode, int nindex)
	792	{
	793	hammer_cursor_t cursor;
	794	hammer_node_ondisk_t ondisk;
	795	hammer_node_ondisk_t nndisk;
	796
	797	/*
	798	* Adjust any cursors pointing at the element
	799	*/
	800	ondisk = onode->ondisk;
	801	nndisk = nnode->ondisk;
	802	again1:
	803	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
	804	KKASSERT(cursor->node == onode);
	805	if (cursor->index != oindex)
	806	continue;
	807	TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
	808	TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
	809	if (cursor->leaf == &ondisk->elms[oindex].leaf)
	810	cursor->leaf = &nndisk->elms[nindex].leaf;
	811	cursor->node = nnode;
	812	cursor->index = nindex;
	813	hammer_ref_node(nnode);
	814	hammer_rel_node(onode);
	815	goto again1;
	816	}
	817
	818	/*
	819	* When moving the first element of onode to a different node any
	820	* cursor which is pointing at (oparent,pindex) must be repointed
	821	* to nnode and ATEDISK must be cleared.
	822	*
	823	* This prevents cursors from losing track due to insertions.
	824	* Insertions temporarily release the cursor in order to update
	825	* the mirror_tids. It primarily effects the mirror_write code.
	826	* The other code paths generally only do a single insertion and
	827	* then relookup or drop the cursor.
	828	*/
	829	if (onode == nnode \|\| oindex)
	830	return;
	831	ondisk = oparent->ondisk;
	832	again2:
	833	TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
	834	KKASSERT(cursor->node == oparent);
	835	if (cursor->index != pindex)
	836	continue;
	837	kprintf("HAMMER debug: shifted cursor pointing at parent\n"
	838	"parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
	839	(intmax_t)oparent->node_offset, pindex,
	840	(intmax_t)onode->node_offset, oindex,
	841	(intmax_t)nnode->node_offset, nindex);
	842	TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
	843	TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
	844	if (cursor->leaf == &ondisk->elms[oindex].leaf)
	845	cursor->leaf = &nndisk->elms[nindex].leaf;
	846	cursor->node = nnode;
	847	cursor->index = nindex;
	848	cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
	849	hammer_ref_node(nnode);
	850	hammer_rel_node(oparent);
	851	goto again2;
	852	}
	853	}
	854
	855	/*
	856	* The B-Tree element pointing to the specified node was moved from (oparent)
	857	* to (nparent, nindex). We must locate any tracked cursors pointing at
	858	* node and adjust their parent accordingly.
	859	*
	860	* This is used by the rebalancing code when packing elements causes an
	861	* element to shift from one node to another.
	862	*/
	863	void
	864	hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent,
	865	hammer_node_t nparent, int nindex)
	866	{
	867	hammer_cursor_t cursor;
	868
	869	again:
	870	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
	871	KKASSERT(cursor->node == node);
	872	if (cursor->parent == oparent) {
	873	cursor->parent = nparent;
	874	cursor->parent_index = nindex;
	875	hammer_ref_node(nparent);
	876	hammer_rel_node(oparent);
	877	goto again;
	878	}
	879	}
	880	}
	881
	882	/*
	883	* Deleted element at (node, index)
	884	*
	885	* Shift indexes >= index
	886	*/
	887	void
	888	hammer_cursor_deleted_element(hammer_node_t node, int index)
	889	{
	890	hammer_cursor_t cursor;
	891	hammer_node_ondisk_t ondisk;
	892
	893	ondisk = node->ondisk;
	894
	895	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
	896	KKASSERT(cursor->node == node);
	897	if (cursor->index == index) {
	898	cursor->flags \|= HAMMER_CURSOR_TRACKED_RIPOUT;
	899	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	900	cursor->leaf = NULL;
	901	} else if (cursor->index > index) {
	902	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	903	cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
	904	--cursor->index;
	905	}
	906	}
	907	}
	908
	909	/*
	910	* Inserted element at (node, index)
	911	*
	912	* Shift indexes >= index
	913	*/
	914	void
	915	hammer_cursor_inserted_element(hammer_node_t node, int index)
	916	{
	917	hammer_cursor_t cursor;
	918	hammer_node_ondisk_t ondisk;
	919
	920	ondisk = node->ondisk;
	921
	922	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
	923	KKASSERT(cursor->node == node);
	924	if (cursor->index >= index) {
	925	if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
	926	cursor->leaf = &ondisk->elms[cursor->index + 1].leaf;
	927	++cursor->index;
	928	}
	929	}
	930	}
	931
	932	/*
	933	* Invalidate the cached data buffer associated with a cursor.
	934	*
	935	* This needs to be done when the underlying block is being freed or
	936	* the referenced buffer can prevent the related buffer cache buffer
	937	* from being properly invalidated.
	938	*/
	939	void
	940	hammer_cursor_invalidate_cache(hammer_cursor_t cursor)
	941	{
	942	if (cursor->data_buffer) {
	943	hammer_rel_buffer(cursor->data_buffer, 0);
	944	cursor->data_buffer = NULL;
	945	cursor->data = NULL;
	946	}
	947	}
	948