From eaeff70d7ec4b8944e1f94deb49549af6377482c Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 21 Jan 2008 00:00:19 +0000
Subject: [PATCH] HAMMER 22/many: Recovery and B-Tree work.

* More work on the recovery code.  Interlock cluster header loading
  with the recovery operation and fix numerous bugs.

* Move some of the complexity of an AS-OF B-Tree lookup out of
  btree_search() and into btree_lookup().  Now btree_search() can just
  fail normally and btree_lookup() checks for the edge case.

  The situation that occurs is this:

	10	15	20
	 |	 |
	 LEAF1	 LEAF2
	 (12)	 (18)

  If the boundary only differs by the delete_tid element, and we are
  doing a lookup AS-OF timestamp 14, we will traverse into LEAF1 which
  contains no visible nodes (element @ timestamp 12 has been deleted
  as-of 14).  btree_lookup() now checks for the edge case and iterates
  the search to locate the visible element (18) in LEAF2.

  It's a bit more complex than that, but that's the basic issue.

* Fix a leaf-splitting case due to the new spike topology.

* Change the spike's ending element to be range-exclusive instead of
  range-inclusive, matching all other end-range keys in HAMMER.
  Adjust the B-Tree code to handle the case.

* Normalize debugging output into the volume:cluster:node_offset form
  that the 'hammer show' utility uses.
---
 sys/vfs/hammer/hammer.h             |   16 +-
 sys/vfs/hammer/hammer_alist.c       |    6 +-
 sys/vfs/hammer/hammer_alist.h       |    6 +-
 sys/vfs/hammer/hammer_btree.c       |  621 ++++++++++++++--------------
 sys/vfs/hammer/hammer_btree.h       |    6 +-
 sys/vfs/hammer/hammer_cursor.h      |    3 +-
 sys/vfs/hammer/hammer_disk.h        |    4 +-
 sys/vfs/hammer/hammer_inode.c       |    4 +-
 sys/vfs/hammer/hammer_object.c      |   29 +-
 sys/vfs/hammer/hammer_ondisk.c      |  129 ++++--
 sys/vfs/hammer/hammer_recover.c     |  139 ++++---
 sys/vfs/hammer/hammer_spike.c       |   23 +-
 sys/vfs/hammer/hammer_transaction.c |   13 +-
 13 files changed, 558 insertions(+), 441 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 0a380da80b..5b1d98b521 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.26 2008/01/18 07:02:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.27 2008/01/21 00:00:19 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -274,6 +274,7 @@ struct hammer_io {
 	u_int		released : 1;	/* bp released (w/ B_LOCKED set) */
 	u_int		running : 1;	/* bp write IO in progress */
 	u_int		waiting : 1;	/* someone is waiting on us */
+	u_int		loading : 1;	/* ondisk is loading */
 };
 
 typedef struct hammer_io *hammer_io_t;
@@ -339,6 +340,12 @@ struct hammer_cluster {
 
 typedef struct hammer_cluster *hammer_cluster_t;
 
+/*
+ * Passed to hammer_get_cluster()
+ */
+#define GET_CLUSTER_NEW		0x0001
+#define GET_CLUSTER_NORECOVER	0x0002
+
 /*
  * In-memory buffer (other then volume, super-cluster, or cluster),
  * representing an on-disk buffer.
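[Editorial sketch, not part of the patch: how a caller is expected to use
the new GET_CLUSTER_* flags.  The hammer_get_cluster() signature comes
from the prototype hunk below; the NORECOVER use case and the error
handling are illustrative assumptions only.]

	int error;
	hammer_cluster_t cluster;

	/*
	 * Hypothetical caller fragment: fetch a cluster header without
	 * triggering the automatic hammer_recover() pass, e.g. from
	 * within the recovery code itself.  A brand new cluster would
	 * pass GET_CLUSTER_NEW, which replaces the old
	 * HAMMER_ASTATE_FREE argument.
	 */
	cluster = hammer_get_cluster(volume, clu_no, &error,
				     GET_CLUSTER_NORECOVER);
	if (cluster == NULL)
		return(error);
	/* ... examine cluster->ondisk ... */
	hammer_rel_cluster(cluster, 0);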
@@ -477,7 +484,6 @@ int hammer_unload_inode(hammer_inode_t ip, void *data); int hammer_unload_volume(hammer_volume_t volume, void *data __unused); int hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused); int hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused); -void hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid); int hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused); int hammer_install_volume(hammer_mount_t hmp, const char *volname); @@ -519,7 +525,7 @@ hammer_tid_t hammer_timespec_to_transid(struct timespec *ts); hammer_tid_t hammer_alloc_tid(hammer_transaction_t trans); hammer_tid_t hammer_now_tid(void); hammer_tid_t hammer_str_to_tid(const char *str); -hammer_tid_t hammer_alloc_recid(hammer_transaction_t trans); +u_int64_t hammer_alloc_recid(hammer_cluster_t cluster); enum vtype hammer_get_vnode_type(u_int8_t obj_type); int hammer_get_dtype(u_int8_t obj_type); @@ -542,7 +548,6 @@ int hammer_btree_insert_cluster(hammer_cursor_t cursor, int hammer_btree_delete(hammer_cursor_t cursor); int hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2); int hammer_btree_chkts(hammer_tid_t ts, hammer_base_elm_t key); -void hammer_make_base_inclusive(hammer_base_elm_t key); void hammer_print_btree_node(hammer_node_ondisk_t ondisk); void hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i); @@ -559,7 +564,7 @@ hammer_volume_t hammer_get_volume(hammer_mount_t hmp, hammer_supercl_t hammer_get_supercl(hammer_volume_t volume, int32_t scl_no, int *errorp, hammer_alloc_state_t isnew); hammer_cluster_t hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, - int *errorp, hammer_alloc_state_t isnew); + int *errorp, int getflags); hammer_buffer_t hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no, u_int64_t buf_type, int *errorp); @@ -655,6 +660,7 @@ int hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t rec, void hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep); int hammer_spike(struct hammer_cursor **spikep); int hammer_recover(struct hammer_cluster *cluster); +int buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count); void hammer_io_init(hammer_io_t io, enum hammer_io_type type); int hammer_io_read(struct vnode *devvp, struct hammer_io *io); diff --git a/sys/vfs/hammer/hammer_alist.c b/sys/vfs/hammer/hammer_alist.c index 0fdd1f01f8..c103f17e46 100644 --- a/sys/vfs/hammer/hammer_alist.c +++ b/sys/vfs/hammer/hammer_alist.c @@ -38,7 +38,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.9 2008/01/15 06:02:57 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.10 2008/01/21 00:00:19 dillon Exp $ */ /* * This module implements a generic allocator through the use of a hinted @@ -1642,6 +1642,10 @@ hammer_alst_radix_recover(hammer_alist_recover_t info, hammer_almeta_t scan, } else { /* * Stacked meta node, recurse. + * + * XXX need a more sophisticated return + * code to control how the bitmap is set + * in the parent. */ n = live->config->bl_radix_recover( live->info, diff --git a/sys/vfs/hammer/hammer_alist.h b/sys/vfs/hammer/hammer_alist.h index 923836e3f5..0d6dbc1869 100644 --- a/sys/vfs/hammer/hammer_alist.h +++ b/sys/vfs/hammer/hammer_alist.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.h,v 1.5 2008/01/15 06:02:57 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.h,v 1.6 2008/01/21 00:00:19 dillon Exp $ */ /* @@ -149,8 +149,8 @@ void hammer_alist_template(hammer_alist_config_t bl, int32_t blocks, int32_t base_radix, int32_t maxmeta); void hammer_alist_init(hammer_alist_t live, int32_t start, int32_t count, hammer_alloc_state_t state); -int32_t hammer_alist_recover(hammer_alist_t live, int32_t blk, int32_t start, - int32_t count); +int32_t hammer_alist_recover(hammer_alist_t live, int32_t blk, + int32_t start, int32_t count); int32_t hammer_alist_alloc(hammer_alist_t live, int32_t count); int32_t hammer_alist_alloc_fwd(hammer_alist_t live, int32_t count, int32_t atblk); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 1378025866..e3d40de65b 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.21 2008/01/18 07:02:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.22 2008/01/21 00:00:19 dillon Exp $ */ /* @@ -67,9 +67,11 @@ * * B-Trees also make the stacking of trees fairly straightforward. * - * SPIKES: Two leaf elements denoting an inclusive sub-range of keys - * may represent a spike, or a recursion into another cluster. Most - * standard B-Tree searches traverse spikes. + * SPIKES: Two leaf elements denoting a sub-range of keys may represent + * a spike, or a recursion into another cluster. Most standard B-Tree + * searches traverse spikes. The ending spike element is range-exclusive + * and operates like a right-bound. Thus it may exactly match a record + * element directly following it. * * INSERTIONS: A search performed with the intention of doing * an insert will guarantee that the terminal leaf node is not full by @@ -93,11 +95,7 @@ static int btree_split_leaf(hammer_cursor_t cursor); static int btree_remove(hammer_cursor_t cursor, int depth); static int btree_remove_deleted_element(hammer_cursor_t cursor); static int btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm); -#if 0 -static int btree_rebalance(hammer_cursor_t cursor); -static int btree_collapse(hammer_cursor_t cursor); static int btree_node_is_almost_full(hammer_node_ondisk_t node); -#endif static int btree_node_is_full(hammer_node_ondisk_t node); static void hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, hammer_base_elm_t dest); @@ -111,8 +109,8 @@ static void hammer_make_separator(hammer_base_elm_t key1, * The iteration is inclusive of key_beg and can be inclusive or exclusive * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. * - * When doing an as-of search (cursor->asof != 0), key_beg.delete_tid - * may be modified by B-Tree functions. + * When doing an as-of search (cursor->asof != 0), key_beg.create_tid + * and key_beg.delete_tid may be modified by B-Tree functions. * * cursor->key_beg may or may not be modified by this function during * the iteration. 
XXX future - in case of an inverted lock we may have @@ -186,15 +184,21 @@ hammer_btree_iterate(hammer_cursor_t cursor) r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); if (hammer_debug_btree) { - kprintf("BRACKETL %p:%d %016llx %02x %016llx %d\n", - cursor->node, cursor->index, + kprintf("BRACKETL %d:%d:%08x[%d] %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index, elm[0].internal.base.obj_id, elm[0].internal.base.rec_type, elm[0].internal.base.key, r ); - kprintf("BRACKETR %p:%d %016llx %02x %016llx %d\n", - cursor->node, cursor->index + 1, + kprintf("BRACKETR %d:%d:%08x[%d] %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index + 1, elm[1].internal.base.obj_id, elm[1].internal.base.rec_type, elm[1].internal.base.key, @@ -232,8 +236,13 @@ hammer_btree_iterate(hammer_cursor_t cursor) elm = &node->elms[cursor->index]; r = hammer_btree_cmp(&cursor->key_end, &elm->base); if (hammer_debug_btree) { - kprintf("ELEMENT %p:%d %016llx %02x %016llx %d\n", - cursor->node, cursor->index, + kprintf("ELEMENT %d:%d:%08x:%d %c %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index, + (elm[0].leaf.base.btype ? + elm[0].leaf.base.btype : '?'), elm[0].leaf.base.obj_id, elm[0].leaf.base.rec_type, elm[0].leaf.base.key, @@ -244,13 +253,18 @@ hammer_btree_iterate(hammer_cursor_t cursor) error = ENOENT; break; } - if (r == 0 && (cursor->flags & - HAMMER_CURSOR_END_INCLUSIVE) == 0) { - error = ENOENT; - break; - } switch(elm->leaf.base.btype) { case HAMMER_BTREE_TYPE_RECORD: + /* + * We support both end-inclusive and + * end-exclusive searches. + */ + if (r == 0 && + (cursor->flags & + HAMMER_CURSOR_END_INCLUSIVE) == 0) { + error = ENOENT; + break; + } if ((cursor->flags & HAMMER_CURSOR_ASOF) && hammer_btree_chkts(cursor->asof, &elm->base)) { ++cursor->index; @@ -258,15 +272,52 @@ hammer_btree_iterate(hammer_cursor_t cursor) } break; case HAMMER_BTREE_TYPE_SPIKE_BEG: + /* + * We support both end-inclusive and + * end-exclusive searches. + * + * NOTE: This code assumes that the spike + * ending element immediately follows the + * spike beginning element. + */ + if (r == 0 && + (cursor->flags & + HAMMER_CURSOR_END_INCLUSIVE) == 0) { + error = ENOENT; + break; + } /* * We must cursor-down via the SPIKE_END * element, otherwise cursor->parent will * not be set correctly for deletions. + * + * fall-through to avoid an improper + * termination from the conditional above. */ KKASSERT(cursor->index + 1 < node->count); + ++elm; + KKASSERT(elm->leaf.base.btype == + HAMMER_BTREE_TYPE_SPIKE_END); ++cursor->index; /* fall through */ case HAMMER_BTREE_TYPE_SPIKE_END: + /* + * The SPIKE_END element is non-inclusive, + * just like a boundary, so if we match it + * there's no point pushing down. It is + * possible for an exactly matching element + * to follow it, however. Either stop + * or continue the iteration. + */ + if (r == 0) { + if ((cursor->flags & + HAMMER_CURSOR_END_INCLUSIVE) == 0) { + error = ENOENT; + break; + } + continue; + } + if (cursor->flags & HAMMER_CURSOR_INCLUSTER) break; error = hammer_cursor_down(cursor); @@ -305,13 +356,30 @@ hammer_btree_iterate(hammer_cursor_t cursor) * Lookup cursor->key_beg. 
0 is returned on success, ENOENT if the entry
 * could not be found, EDEADLK if inserting and a retry is needed, and a
 * fatal error otherwise.  When retrying, the caller must terminate the
- * cursor and reinitialize it.
+ * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
 *
 * The cursor is suitably positioned for a deletion on success, and suitably
- * positioned for an insertion on ENOENT.
+ * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
+ * specified.
 *
 * The cursor may begin anywhere, the search will traverse clusters in
 * either direction to locate the requested element.
+ *
+ * Most of the logic implementing historical searches is handled here.  We
+ * do an initial lookup with delete_tid set to the asof TID.  Due to the
+ * way records are laid out, a forwards iteration may be required if
+ * ENOENT is returned to locate the historical record.  Here's the
+ * problem:
+ *
+ * delete_tid:    10      15      20
+ *
+ *                  LEAF1   LEAF2
+ * records:         (11)    (18)
+ *
+ * Let's say we want to do a lookup AS-OF timestamp 12.  We will traverse
+ * LEAF1 but the only record in LEAF1 has a termination (delete_tid) of 11,
+ * thus causing ENOENT to be returned.  We really need to check record 18
+ * in LEAF2.  If it also fails then the search fails (e.g. it might represent
+ * the range 14-18 and thus still not match our AS-OF timestamp of 12).
 */
int
hammer_btree_lookup(hammer_cursor_t cursor)
@@ -319,10 +387,43 @@
	int error;

	if (cursor->flags & HAMMER_CURSOR_ASOF) {
+		KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
+
		cursor->key_beg.delete_tid = cursor->asof;
-		do {
+		for (;;) {
			error = btree_search(cursor, 0);
-		} while (error == EAGAIN);
+			if (error != ENOENT)
+				break;
+
+			/*
+			 * ENOENT needs special handling, we may have gone
+			 * down the tree to the left of a matching record
+			 * because any record with (delete_tid > asof) can
+			 * potentially match.
+			 *
+			 * If we are AT a non-matching record we can stop,
+			 * but if we are at a right-edge we have to loop.
+			 * Since deletions don't always succeed completely
+			 * we can wind up going through several empty nodes.
+			 *
+			 * NOTE: we can be at a leaf OR an internal node
+			 * here, because we aren't inserting.
+			 */
+			if (cursor->index != cursor->node->ondisk->count)
+				break;
+			if (cursor->key_beg.obj_id !=
+			    cursor->right_bound->obj_id ||
+			    cursor->key_beg.rec_type !=
+			    cursor->right_bound->rec_type ||
+			    cursor->key_beg.key !=
+			    cursor->right_bound->key
+			) {
+				break;
+			}
+			cursor->key_beg.delete_tid =
+				cursor->right_bound->delete_tid;
+			/* loop */
+		}
	} else {
		error = btree_search(cursor, 0);
	}
@@ -498,58 +599,67 @@ hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
	node->elms[i] = *elm;
	++node->count;

+	/*
+	 * Debugging sanity checks.  Note that the element to the left
+	 * can match the element we are inserting if it is a SPIKE_END,
+	 * because spike-ends represent a non-inclusive end to a range.
+	 */
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
-	if (i)
-		KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->leaf.base) < 0);
+	if (i) {
+		if (node->elms[i-1].base.btype == HAMMER_BTREE_TYPE_SPIKE_END){
+			KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->leaf.base) <= 0);
+		} else {
+			KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->leaf.base) < 0);
+		}
+	}
	if (i != node->count - 1)
		KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->leaf.base) > 0);

	return(0);
}

-#if 0
-
/*
- * Insert a cluster push into the B-Tree at the current cursor position.
- * The cursor is positioned at a leaf after a failed btree_lookup.
+ * Insert a cluster spike into the B-Tree at the current cursor position.
+ * The caller pre-positions the insertion cursor at ncluster's
+ * left bound in the originating cluster.  Both the originating cluster
+ * and the target cluster must be serialized, EDEADLK is fatal.
 *
- * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
- * flag set and that call must return ENOENT before this function can be
- * called.
+ * Basically we have to lay down the two spike elements and assert that
+ * the leaf's right bound does not bisect the ending element.  The ending
+ * spike element is non-inclusive, just like a boundary.
 *
- * This routine is used ONLY during a recovery pass while the originating
- * cluster is serialized.  The leaf is broken up into up to three pieces,
- * causing up to an additional internal elements to be added to the parent.
- *
- * ENOSPC is returned if there is no room to insert a new record.
+ * NOTE: Serialization is usually accomplished by virtue of being the
+ *	 initial accessor of a cluster.
 */
int
hammer_btree_insert_cluster(hammer_cursor_t cursor, hammer_cluster_t ncluster,
			    int32_t rec_offset)
{
-	hammer_cluster_t ocluster;
-	hammer_node_ondisk_t parent;
	hammer_node_ondisk_t node;
-	hammer_node_ondisk_t xnode;	/* additional leaf node */
-	hammer_node_t new_node;
	hammer_btree_elm_t elm;
	const int esize = sizeof(*elm);
-	u_int8_t save;
	int error;
-	int pi, i;
+	int i;

	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		return(error);
	hammer_modify_node(cursor->node);
+	hammer_modify_cluster(ncluster);
	node = cursor->node->ondisk;
	i = cursor->index;
+
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
-	KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
+	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS - 2);
+	KKASSERT(i >= 0 && i <= node->count);

	/*
	 * Make sure the spike is legal or the B-Tree code will get really
	 * confused.
+	 *
+	 * XXX the right bound may bisect the two spike elements.  We
+	 * need code here to 'fix' the right bound going up the tree
+	 * instead of an assertion.
	 */
	KKASSERT(hammer_btree_cmp(&ncluster->ondisk->clu_btree_beg,
				  cursor->left_bound) >= 0);
@@ -560,174 +670,25 @@ hammer_btree_insert_cluster(hammer_cursor_t cursor, hammer_cluster_t ncluster,
			&node->elms[i].leaf.base) <= 0);
	}

-	/*
-	 * If we are at the local root of the cluster a new root node
-	 * must be created, because we need an internal node.  The
-	 * caller has already marked the source cluster as undergoing
-	 * modification.
- */ - ocluster = cursor->node->cluster; - if (cursor->parent == NULL) { - cursor->parent = hammer_alloc_btree(ocluster, &error); - if (error) - return(error); - hammer_lock_ex(&cursor->parent->lock); - hammer_modify_node(cursor->parent); - parent = cursor->parent->ondisk; - parent->count = 1; - parent->parent = 0; - parent->type = HAMMER_BTREE_TYPE_INTERNAL; - parent->elms[0].base = ocluster->clu_btree_beg; - parent->elms[0].base.subtree_type = node->type; - parent->elms[0].internal.subtree_offset = cursor->node->node_offset; - parent->elms[1].base = ocluster->clu_btree_end; - cursor->parent_index = 0; - cursor->left_bound = &parent->elms[0].base; - cursor->right_bound = &parent->elms[1].base; - node->parent = cursor->parent->node_offset; - ocluster->ondisk->clu_btree_root = cursor->parent->node_offset; - kprintf("no parent\n"); - } else { - kprintf("has parent\n"); - if (error) - return(error); - } - + ncluster->ondisk->clu_btree_parent_offset = cursor->node->node_offset; - KKASSERT(cursor->parent->ondisk->count <= HAMMER_BTREE_INT_ELMS - 2); + elm = &node->elms[i]; + bcopy(elm, elm + 2, (node->count - i) * esize); + bzero(elm, 2 * esize); + node->count += 2; - hammer_modify_node(cursor->parent); - parent = cursor->parent->ondisk; - pi = cursor->parent_index; - - kprintf("%d node %d/%d (%c) offset=%d parent=%d\n", - cursor->node->cluster->clu_no, - i, node->count, node->type, cursor->node->node_offset, node->parent); - - /* - * If the insertion point bisects the node we will need to allocate - * a second leaf node to copy the right hand side into. - */ - if (i != 0 && i != node->count) { - new_node = hammer_alloc_btree(cursor->node->cluster, &error); - if (error) - return(error); - xnode = new_node->ondisk; - bcopy(&node->elms[i], &xnode->elms[0], - (node->count - i) * esize); - xnode->count = node->count - i; - xnode->parent = cursor->parent->node_offset; - xnode->type = HAMMER_BTREE_TYPE_LEAF; - node->count = i; - } else { - new_node = NULL; - xnode = NULL; - } - - /* - * Adjust the parent and set pi to point at the internal element - * which we intended to hold the spike. - */ - if (new_node) { - /* - * Insert spike after parent index. Spike is at pi + 1. - * Also include room after the spike for new_node - */ - ++pi; - bcopy(&parent->elms[pi], &parent->elms[pi+2], - (parent->count - pi + 1) * esize); - parent->count += 2; - } else if (i == 0) { - /* - * Insert spike before parent index. Spike is at pi. - * - * cursor->node's index in the parent (cursor->parent_index) - * has now shifted over by one. - */ - bcopy(&parent->elms[pi], &parent->elms[pi+1], - (parent->count - pi + 1) * esize); - ++parent->count; - ++cursor->parent_index; - } else { - /* - * Insert spike after parent index. Spike is at pi + 1. - */ - ++pi; - bcopy(&parent->elms[pi], &parent->elms[pi+1], - (parent->count - pi + 1) * esize); - ++parent->count; - } - - /* - * Load the spike into the parent at (pi). - * - * WARNING: subtree_type is actually overloaded within base. - * WARNING: subtree_clu_no is overloaded on subtree_offset - */ - elm = &parent->elms[pi]; - elm[0].internal.base = ncluster->ondisk->clu_btree_beg; - elm[0].internal.base.subtree_type = HAMMER_BTREE_TYPE_CLUSTER; - elm[0].internal.rec_offset = rec_offset; - elm[0].internal.subtree_clu_no = ncluster->clu_no; - elm[0].internal.subtree_vol_no = ncluster->volume->vol_no; - - /* - * Load the new node into parent at (pi+1) if non-NULL, and also - * set the right-hand boundary for the spike. 
- * - * Because new_node is a leaf its elements do not point to any - * nodes so we don't have to scan it to adjust parent pointers. - * - * WARNING: subtree_type is actually overloaded within base. - * WARNING: subtree_clu_no is overloaded on subtree_offset - * - * XXX right-boundary may not match clu_btree_end if spike is - * at the end of the internal node. For now the cursor search - * insertion code will deal with it. - */ - if (new_node) { - elm[1].internal.base = ncluster->ondisk->clu_btree_end; - elm[1].internal.base.subtree_type = HAMMER_BTREE_TYPE_LEAF; - elm[1].internal.subtree_offset = new_node->node_offset; - elm[1].internal.subtree_vol_no = -1; - elm[1].internal.rec_offset = 0; - } else { - /* - * The right boundary is only the base part of elm[1]. - * The rest belongs to elm[1]'s recursion. Note however - * that subtree_type is overloaded within base so we - * have to retain it as well. - */ - save = elm[1].internal.base.subtree_type; - elm[1].internal.base = ncluster->ondisk->clu_btree_end; - elm[1].internal.base.subtree_type = save; - } - - /* - * The boundaries stored in the cursor for node are probably all - * messed up now, fix them. - */ - cursor->left_bound = &parent->elms[cursor->parent_index].base; - cursor->right_bound = &parent->elms[cursor->parent_index+1].base; - - KKASSERT(hammer_btree_cmp(&ncluster->ondisk->clu_btree_end, - &elm[1].internal.base) <= 0); - - - /* - * Adjust the target cluster's parent offset - */ - hammer_modify_cluster(ncluster); - ncluster->ondisk->clu_btree_parent_offset = cursor->parent->node_offset; - - if (new_node) - hammer_rel_node(new_node); + elm[0].leaf.base = ncluster->ondisk->clu_btree_beg; + elm[0].leaf.base.btype = HAMMER_BTREE_TYPE_SPIKE_BEG; + elm[0].leaf.spike_clu_no = ncluster->clu_no; + elm[0].leaf.spike_vol_no = ncluster->volume->vol_no; + elm[1].leaf.base = ncluster->ondisk->clu_btree_end; + elm[1].leaf.base.btype = HAMMER_BTREE_TYPE_SPIKE_END; + elm[1].leaf.spike_clu_no = ncluster->clu_no; + elm[1].leaf.spike_vol_no = ncluster->volume->vol_no; return(0); } -#endif - /* * Delete a record from the B-Tree at the current cursor position. * The cursor is positioned such that the current element is the one @@ -805,7 +766,8 @@ hammer_btree_delete(hammer_cursor_t cursor) } else { error = 0; } - KKASSERT(cursor->parent == NULL || cursor->parent_index < cursor->parent->ondisk->count); + KKASSERT(cursor->parent == NULL || + cursor->parent_index < cursor->parent->ondisk->count); return(error); } @@ -823,18 +785,39 @@ hammer_btree_delete(hammer_cursor_t cursor) * of space the search continues to the leaf (to position the cursor for * the spike), but ENOSPC is returned. * - * XXX this isn't optimal - we really need to just locate the end point and - * insert space going up, and if we get a deadlock just release and retry - * the operation. Or something like that. The insertion code can transit - * multiple clusters and run splits in unnecessary clusters. - * - * DELETIONS: The search will rebalance the tree on its way down. XXX - * * The search is only guarenteed to end up on a leaf if an error code of 0 * is returned, or if inserting and an error code of ENOENT is returned. * Otherwise it can stop at an internal node. On success a search returns * a leaf node unless INCLUSTER is set and the search located a cluster push * node (which is an internal node). + * + * COMPLEXITY WARNING! This is the core B-Tree search code for the entire + * filesystem, and it is not simple code. 
Please note the following facts: + * + * - Internal node recursions have a boundary on the left AND right. The + * right boundary is non-inclusive. The delete_tid is a generic part + * of the key for internal nodes. + * + * - Leaf nodes contain terminal elements AND spikes. A spike recurses into + * another cluster and contains two leaf elements.. a beginning and an + * ending element. The SPIKE_END element is RANGE-EXCLUSIVE, just like a + * boundary. This means that it is possible to have two elements + * (a spike ending element and a record) side by side with the same key. + * + * - Because the SPIKE_END element is range exclusive, it can match the + * right boundary of the parent node. SPIKE_BEG and SPIKE_END elements + * always come in pairs, and always exist side by side in the same leaf. + * + * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a + * historical search. This is true for current lookups too. Raw B-Tree + * operations such as insertions and recovery scans typically do NOT + * set this flag. When the flag is not set, create_tid and delete_tid + * are considered to be a generic part of the key for internal nodes. + * + * When the flag IS set the search code must check whether the record + * is actually visible to the search or not. btree_lookup() will also + * use failed results to poke around if necessary to locate the correct + * record. */ static int @@ -851,8 +834,11 @@ btree_search(hammer_cursor_t cursor, int flags) flags |= cursor->flags; if (hammer_debug_btree) { - kprintf("SEARCH %p:%d %016llx %02x key=%016llx did=%016llx\n", - cursor->node, cursor->index, + kprintf("SEARCH %d:%d:%08x[%d] %016llx %02x key=%016llx did=%016llx\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index, cursor->key_beg.obj_id, cursor->key_beg.rec_type, cursor->key_beg.key, @@ -923,8 +909,13 @@ btree_search(hammer_cursor_t cursor, int flags) * and stop at the root of the current cluster. */ while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { - if (btree_node_is_full(cursor->node->ondisk) == 0) - break; + if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { + if (btree_node_is_full(cursor->node->ondisk) == 0) + break; + } else { + if (btree_node_is_almost_full(cursor->node->ondisk) ==0) + break; + } if (cursor->parent == NULL) break; if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) @@ -950,12 +941,13 @@ new_cluster: * We must proactively remove deleted elements which may * have been left over from a deadlocked btree_remove(). * - * The left and right boundaries are included in the loop h + * The left and right boundaries are included in the loop * in order to detect edge cases. * * If the separator only differs by delete_tid (r == -1) - * we may end up going down a branch to the left of the - * one containing the desired key. Flag it. + * and we are doing an as-of search, we may end up going + * down a branch to the left of the one containing the + * desired key. This requires numerous special cases. 
*/ for (i = 0; i <= node->count; ++i) { elm = &node->elms[i]; @@ -963,6 +955,13 @@ new_cluster: if (r < 0) break; } + if (hammer_debug_btree) { + kprintf("SEARCH-I %d:%d:%08x pre-i %d/%d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + i, node->count); + } /* * These cases occur when the parent's idea of the boundary @@ -979,19 +978,23 @@ new_cluster: */ u_int8_t save; + elm = &node->elms[0]; + + /* + * If we aren't inserting we can stop here. + */ if ((flags & HAMMER_CURSOR_INSERT) == 0) { cursor->index = 0; return(ENOENT); } - elm = &node->elms[0]; /* * Correct a left-hand boundary mismatch. * - * This is done without an exclusive lock XXX. We - * have to do this or the search will not terminate - * at a leaf. + * We can only do this if we can upgrade the lock. */ + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); hammer_modify_node(cursor->node); save = node->elms[0].base.btype; node->elms[0].base = *cursor->left_bound; @@ -1000,7 +1003,8 @@ new_cluster: /* * If i == node->count + 1 the search terminated to * the RIGHT of the right boundary but to the LEFT - * of the parent's right boundary. + * of the parent's right boundary. If we aren't + * inserting we can stop here. * * Note that the last element in this case is * elms[i-2] prior to adjustments to 'i'. @@ -1008,7 +1012,7 @@ new_cluster: --i; if ((flags & HAMMER_CURSOR_INSERT) == 0) { cursor->index = i; - return(ENOENT); + return (ENOENT); } /* @@ -1016,10 +1020,10 @@ new_cluster: * (actual push-down record is i-2 prior to * adjustments to i). * - * This is done without an exclusive lock XXX. We - * have to do this or the search will not terminate - * at a leaf. + * We can only do this if we can upgrade the lock. */ + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); elm = &node->elms[i]; hammer_modify_node(cursor->node); elm->base = *cursor->right_bound; @@ -1036,8 +1040,11 @@ new_cluster: elm = &node->elms[i]; if (hammer_debug_btree) { - kprintf("SEARCH-I %p:%d %016llx %02x key=%016llx did=%016llx\n", - cursor->node, i, + kprintf("SEARCH-I %d:%d:%08x[%d] %016llx %02x key=%016llx did=%016llx\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + i, elm->internal.base.obj_id, elm->internal.base.rec_type, elm->internal.base.key, @@ -1058,9 +1065,11 @@ new_cluster: error = btree_remove_deleted_element(cursor); if (error == 0) goto new_cluster; - if (flags & HAMMER_CURSOR_INSERT) - return(EDEADLK); - return(ENOENT); + if (error == EDEADLK && + (flags & HAMMER_CURSOR_INSERT) == 0) { + error = ENOENT; + } + return(error); } @@ -1112,7 +1121,8 @@ new_cluster: * We are at a leaf, do a linear search of the key array. * * If we encounter a spike element type within the necessary - * range we push into it. + * range we push into it. Note that SPIKE_END is non-inclusive + * of the spike range. * * On success the index is set to the matching element and 0 * is returned. @@ -1145,26 +1155,31 @@ new_cluster: * the last element, however, then push into the * spike. * - * A Spike demark on a delete_tid boundary must be - * pushed into. An as-of search failure will force - * an iteration. + * If doing an as-of search a Spike demark on a + * delete_tid boundary must be pushed into and an + * iteration will be forced if it turned out to be + * the wrong choice. + * + * If not doing an as-of search exact comparisons + * must be used. * * enospc must be reset because we have crossed a * cluster boundary. 
	 */
-			if (r < -1)
+			if (r < 0)
				goto failed;
			if (i != node->count - 1)
				continue;
			panic("btree_search: illegal spike, no SPIKE_END "
			      "in leaf node! %p\n", cursor->node);
+#if 0
			/*
			 * XXX This is not currently legal, you can only
			 * cursor_down() from a SPIKE_END element, otherwise
			 * the cursor parent is pointing at the wrong element
			 * for deletions.
			 */
-			if (cursor->flags & HAMMER_CURSOR_INCLUSTER)
+			if (flags & HAMMER_CURSOR_INCLUSTER)
				goto success;
			cursor->index = i;
			error = hammer_cursor_down(cursor);

			if (error)
				goto done;
			goto new_cluster;
+#endif
		}
		if (elm->leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_END) {
			/*
			 * SPIKE_END.  We can only hit this case if we are
			 * greater or equal to SPIKE_BEG.
			 *
-			 * If we are less then or equal to the SPIKE_END
-			 * we must push into it, otherwise continue the
-			 * search.
+			 * If we are less than SPIKE_END we must push into
+			 * it, otherwise continue the search.  The SPIKE_END
+			 * element is range-exclusive and because of that
+			 * it is possible for its key to match the next
+			 * record on the right.
			 *
			 * enospc must be reset because we have crossed a
			 * cluster boundary.
			 */
-			if (r > 0)
+			if (r >= 0)
				continue;
-			if (cursor->flags & HAMMER_CURSOR_INCLUSTER)
+			if (flags & HAMMER_CURSOR_INCLUSTER)
				goto success;
			cursor->index = i;
			error = hammer_cursor_down(cursor);
@@ -1209,58 +1227,45 @@
			continue;

		/*
-		 * Check our as-of timestamp against the element.
+		 * Check our as-of timestamp against the element.  A
+		 * delete_tid that matches exactly in an as-of search
+		 * is actually a no-match (the record was deleted as-of
+		 * our timestamp so it isn't visible).
		 */
-		if (r == -1) {
-			if ((cursor->flags & HAMMER_CURSOR_ASOF) == 0)
-				goto failed;
+		if (flags & HAMMER_CURSOR_ASOF) {
			if (hammer_btree_chkts(cursor->asof,
					       &node->elms[i].base) != 0) {
				continue;
			}
+			/* success */
+		} else {
+			if (r < 0)	/* can only be -1 */
+				goto failed;
+			/* success */
		}
success:
		cursor->index = i;
		error = 0;
-		if (hammer_debug_btree)
-			kprintf("SEARCH-L %p:%d (SUCCESS)\n", cursor->node, i);
+		if (hammer_debug_btree) {
+			kprintf("SEARCH-L %d:%d:%08x[%d] (SUCCESS)\n",
+				cursor->node->cluster->volume->vol_no,
+				cursor->node->cluster->clu_no,
+				cursor->node->node_offset,
+				i);
+		}
		goto done;
	}

	/*
-	 * The search failed but due the way we handle delete_tid we may
-	 * have to iterate.  Here is why: If a center separator differs
-	 * only by its delete_tid as shown below and we are looking for, say,
-	 * a record with an as-of TID of 12, we will traverse LEAF1.  LEAF1
-	 * might contain element 11 and thus not match, and LEAF2 might
-	 * contain element 17 which we DO want to match (i.e. that record
-	 * will be visible to us).
-	 *
-	 * delete_tid:    10      15       20
-	 *                    L1      L2
-	 *
-	 *
-	 * Its easiest to adjust delete_tid and to tell the caller to
-	 * retry, because this may be an insertion search and require
-	 * more then just a simple iteration.
+	 * The search of the leaf node failed.  i is the insertion point.
	 */
-	if ((flags & (HAMMER_CURSOR_INSERT|HAMMER_CURSOR_ASOF)) ==
-	    HAMMER_CURSOR_ASOF &&
-	    cursor->key_beg.obj_id == cursor->right_bound->obj_id &&
-	    cursor->key_beg.rec_type == cursor->right_bound->rec_type &&
-	    cursor->key_beg.key == cursor->right_bound->key &&
-	    (cursor->right_bound->delete_tid == 0 ||
-	     cursor->key_beg.delete_tid < cursor->right_bound->delete_tid)
-	) {
-		kprintf("MUST ITERATE\n");
-		cursor->key_beg.delete_tid = cursor->right_bound->delete_tid;
-		return(EAGAIN);
-	}
-
failed:
	if (hammer_debug_btree) {
-		kprintf("SEARCH-L %p:%d (FAILED)\n",
-			cursor->node, i);
+		kprintf("SEARCH-L %d:%d:%08x[%d] (FAILED)\n",
+			cursor->node->cluster->volume->vol_no,
+			cursor->node->cluster->clu_no,
+			cursor->node->node_offset,
+			i);
	}

	/*
@@ -1269,15 +1274,19 @@ failed:
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
+	 *
+	 * For now the leaf must have at least 2 free elements to accommodate
+	 * the insertion of a spike during recovery.  See the
+	 * hammer_btree_insert_cluster() function.
	 */
	cursor->index = i;
-	if ((flags & HAMMER_CURSOR_INSERT) && btree_node_is_full(node)) {
+	if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
+	    btree_node_is_almost_full(node)) {
		error = btree_split_leaf(cursor);
		if (error) {
			if (error != ENOSPC)
				goto done;
			enospc = 1;
-			flags &= ~HAMMER_CURSOR_INSERT;
		}
		/*
		 * reload stale pointers
@@ -1624,7 +1633,7 @@ btree_split_leaf(hammer_cursor_t cursor)
	hammer_lock_ex(&new_leaf->lock);

	/*
-	 * Create the new node.  P become the left-hand boundary in the
+	 * Create the new node.  P (elm) becomes the left-hand boundary in the
	 * new node.  Copy the right-hand boundary as well.
	 */
	hammer_modify_node(leaf);
@@ -1659,7 +1668,19 @@ btree_split_leaf(hammer_cursor_t cursor)
	parent_elm = &ondisk->elms[parent_index+1];
	bcopy(parent_elm, parent_elm + 1,
	      (ondisk->count - parent_index) * esize);
-	hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
+
+	/*
+	 * Create the separator.  XXX At the moment use exactly the
+	 * right-hand element if this is a recovery operation in order
+	 * to guarantee that it does not bisect the spike elements in a
+	 * later call to hammer_btree_insert_cluster().
+	 */
+	if (cursor->flags & HAMMER_CURSOR_RECOVER) {
+		parent_elm->base = elm[0].base;
+	} else {
+		hammer_make_separator(&elm[-1].base, &elm[0].base,
+				      &parent_elm->base);
+	}
	parent_elm->internal.base.btype = new_leaf->ondisk->type;
	parent_elm->internal.subtree_offset = new_leaf->node_offset;
	mid_boundary = &parent_elm->base;
@@ -1723,10 +1744,18 @@ btree_split_leaf(hammer_cursor_t cursor)
	parent_elm = &parent->ondisk->elms[cursor->parent_index];
	cursor->left_bound = &parent_elm[0].internal.base;
	cursor->right_bound = &parent_elm[1].internal.base;
+
+	/*
+	 * Note: The right assertion is typically > 0, but if the last element
+	 * is a SPIKE_END it can be == 0 because the spike-end is non-inclusive
+	 * of the range being spiked.
+	 *
+	 * This may seem a bit odd but it works.
+	 */
	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
-		 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
+		 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) >= 0);

done:
	hammer_cursor_downgrade(cursor);
@@ -2116,18 +2145,6 @@ hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
	}
}

-/*
- * This adjusts a right-hand key from being exclusive to being inclusive.
- * - * A delete_key of 0 represents infinity. Decrementing it results in - * (u_int64_t)-1 which is the largest value possible prior to infinity. - */ -void -hammer_make_base_inclusive(hammer_base_elm_t key) -{ - --key->delete_tid; -} - #undef MAKE_SEPARATOR /* @@ -2151,7 +2168,6 @@ btree_node_is_full(hammer_node_ondisk_t node) return(0); } -#if 0 /* * Return whether a generic internal or leaf node is almost full. This * routine is used as a helper for search insertions to guarentee at @@ -2175,7 +2191,6 @@ btree_node_is_almost_full(hammer_node_ondisk_t node) } return(0); } -#endif #if 0 static int diff --git a/sys/vfs/hammer/hammer_btree.h b/sys/vfs/hammer/hammer_btree.h index 1f200f8922..c99c9e2bb6 100644 --- a/sys/vfs/hammer/hammer_btree.h +++ b/sys/vfs/hammer/hammer_btree.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.9 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.10 2008/01/21 00:00:19 dillon Exp $ */ /* @@ -173,6 +173,10 @@ typedef union hammer_btree_elm *hammer_btree_elm_t; * Instead, each element specifies the subtype (elm->base.subtype). * This allows us to maintain an unbalanced B-Tree and to easily identify * special inter-cluster link elements. + * + * NOTE: FUTURE EXPANSION: The reserved fields in hammer_node_ondisk are + * reserved for left/right leaf linkage fields, flags, and other future + * features. */ #define HAMMER_BTREE_LEAF_ELMS 14 #define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h index 24e1aae66a..c60ca8b3cc 100644 --- a/sys/vfs/hammer/hammer_cursor.h +++ b/sys/vfs/hammer/hammer_cursor.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.8 2008/01/18 07:02:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.9 2008/01/21 00:00:19 dillon Exp $ */ /* @@ -112,6 +112,7 @@ typedef struct hammer_cursor *hammer_cursor_t; #define HAMMER_CURSOR_DELETE 0x0010 /* adjust for delete */ #define HAMMER_CURSOR_END_INCLUSIVE 0x0020 /* key_end is inclusive */ #define HAMMER_CURSOR_END_EXCLUSIVE 0x0040 /* key_end is exclusive (def) */ +#define HAMMER_CURSOR_RECOVER 0x0080 /* recovery insertion case */ #define HAMMER_CURSOR_ATEDISK 0x0100 #define HAMMER_CURSOR_ATEMEM 0x0200 diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 9fb1fd27c5..5947dcb376 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.17 2008/01/16 01:15:36 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.18 2008/01/21 00:00:19 dillon Exp $ */ #ifndef _SYS_UUID_H_ @@ -381,7 +381,7 @@ struct hammer_cluster_ondisk { /* * The synchronized record id is used for recovery purposes. */ - u_int64_t synchronized_tid; + u_int64_t synchronized_rec_id; u_int32_t reserved16[510]; struct hammer_almeta clu_master_meta[HAMMER_CLU_MASTER_METAELMS]; diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 3b6833bded..1b8972c3a7 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.22 2008/01/18 07:02:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.23 2008/01/21 00:00:19 dillon Exp $ */ #include "hammer.h" @@ -323,8 +323,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->ino_rec.ino_size = 0; ip->ino_rec.ino_nlinks = 0; /* XXX */ - ip->ino_rec.base.rec_id = hammer_alloc_recid(trans); - KKASSERT(ip->ino_rec.base.rec_id != 0); ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD; ip->ino_rec.base.base.obj_id = ip->obj_id; ip->ino_rec.base.base.key = 0; diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index ce571f1783..469f095fd7 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.21 2008/01/18 07:02:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.22 2008/01/21 00:00:19 dillon Exp $ */ #include "hammer.h" @@ -428,6 +428,8 @@ hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) * Sync data from a buffer cache buffer (typically) to the filesystem. This * is called via the strategy called from a cached data source. This code * is responsible for actually writing a data record out to the disk. + * + * This can only occur non-historically (i.e. 'current' data only). */ int hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, @@ -450,7 +452,7 @@ retry: cursor.key_beg.delete_tid = 0; cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; cursor.asof = trans->tid; - cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_ASOF; + cursor.flags |= HAMMER_CURSOR_INSERT; /* * Issue a lookup to position the cursor and locate the cluster @@ -490,7 +492,7 @@ retry: rec->base.base.delete_tid = 0; rec->base.base.rec_type = HAMMER_RECTYPE_DATA; rec->base.data_crc = crc32(data, bytes); - rec->base.rec_id = 0; /* XXX */ + rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster); rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata); rec->base.data_len = bytes; @@ -511,10 +513,8 @@ retry: ip->flags |= HAMMER_INODE_DONDISK; error = hammer_btree_insert(&cursor, &elm); - if (error == 0) { - hammer_update_syncid(cursor.record_buffer->cluster, trans->tid); + if (error == 0) goto done; - } hammer_free_record_ptr(cursor.record_buffer, rec); fail1: @@ -624,8 +624,6 @@ retry: /* * Fill everything in and insert our B-Tree node. - * - * XXX assign rec_id here */ hammer_modify_buffer(cursor.record_buffer); *rec = record->rec; @@ -651,7 +649,7 @@ retry: bcopy(record->data, bdata, rec->base.data_len); } } - rec->base.rec_id = 0; /* XXX */ + rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster); elm.leaf.base = record->rec.base.base; elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec); @@ -667,8 +665,6 @@ retry: if (error == 0) { record->flags |= HAMMER_RECF_DELETED; record->flags &= ~HAMMER_RECF_SYNCING; - hammer_update_syncid(cursor.record_buffer->cluster, - record->rec.base.base.create_tid); goto done; } @@ -755,8 +751,6 @@ hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, /* * Fill everything in and insert our B-Tree node. 
- * - * XXX assign rec_id here */ hammer_modify_buffer(cursor->record_buffer); *nrec = *orec; @@ -781,7 +775,7 @@ hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, bcopy(data, bdata, nrec->base.data_len); } } - nrec->base.rec_id = 0; /* XXX */ + nrec->base.rec_id = hammer_alloc_recid(cursor->node->cluster); elm.leaf.base = nrec->base.base; elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec); @@ -790,11 +784,8 @@ hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, elm.leaf.data_crc = nrec->base.data_crc; error = hammer_btree_insert(cursor, &elm); - if (error == 0) { - hammer_update_syncid(cursor->record_buffer->cluster, - nrec->base.base.create_tid); + if (error == 0) goto done; - } hammer_free_record_ptr(cursor->record_buffer, nrec); fail1: @@ -1322,8 +1313,6 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) hammer_modify_node(cursor->node); elm = &cursor->node->ondisk->elms[cursor->index]; elm->leaf.base.delete_tid = tid; - hammer_update_syncid(cursor->record_buffer->cluster, - tid); } } diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index a46463fd12..39c825fb36 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.22 2008/01/18 07:02:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.23 2008/01/21 00:00:19 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -49,8 +49,7 @@ static void hammer_free_volume(hammer_volume_t volume); static int hammer_load_volume(hammer_volume_t volume); static int hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew); -static int hammer_load_cluster(hammer_cluster_t cluster, - hammer_alloc_state_t isnew); +static int hammer_load_cluster(hammer_cluster_t cluster, int getflags); static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type); static int hammer_load_node(hammer_node_t node); static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, @@ -414,6 +413,10 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) /* * Deal with on-disk info */ + while (volume->io.loading) { + hammer_lock_ex(&volume->io.lock); + hammer_unlock(&volume->io.lock); + } if (volume->ondisk == NULL) { *errorp = hammer_load_volume(volume); if (*errorp) { @@ -436,6 +439,10 @@ hammer_ref_volume(hammer_volume_t volume) /* * Deal with on-disk info */ + while (volume->io.loading) { + hammer_lock_ex(&volume->io.lock); + hammer_unlock(&volume->io.lock); + } if (volume->ondisk == NULL) { error = hammer_load_volume(volume); if (error) @@ -458,6 +465,10 @@ hammer_get_root_volume(struct hammer_mount *hmp, int *errorp) /* * Deal with on-disk info */ + while (volume->io.loading) { + hammer_lock_ex(&volume->io.lock); + hammer_unlock(&volume->io.lock); + } if (volume->ondisk == NULL) { *errorp = hammer_load_volume(volume); if (*errorp) { @@ -482,9 +493,11 @@ hammer_load_volume(hammer_volume_t volume) int error; hammer_lock_ex(&volume->io.lock); + volume->io.loading = 1; if (volume->ondisk == NULL) { error = hammer_io_read(volume->devvp, &volume->io); if (error) { + volume->io.loading = 0; hammer_unlock(&volume->io.lock); return (error); } @@ -506,6 +519,7 @@ hammer_load_volume(hammer_volume_t volume) } else { error = 0; } + volume->io.loading = 0; hammer_unlock(&volume->io.lock); 
return(0); } @@ -588,6 +602,10 @@ again: /* * Deal with on-disk info */ + while (supercl->io.loading) { + hammer_lock_ex(&supercl->io.lock); + hammer_unlock(&supercl->io.lock); + } if (supercl->ondisk == NULL || isnew) { *errorp = hammer_load_supercl(supercl, isnew); if (*errorp) { @@ -609,12 +627,14 @@ hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew) int64_t nclusters; hammer_lock_ex(&supercl->io.lock); + supercl->io.loading = 1; if (supercl->ondisk == NULL) { if (isnew) error = hammer_io_new(volume->devvp, &supercl->io); else error = hammer_io_read(volume->devvp, &supercl->io); if (error) { + supercl->io.loading = 0; hammer_unlock(&supercl->io.lock); return (error); } @@ -652,6 +672,7 @@ hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew) hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters, isnew); } + supercl->io.loading = 0; hammer_unlock(&supercl->io.lock); return (error); } @@ -711,7 +732,7 @@ hammer_rel_supercl(hammer_supercl_t supercl, int flush) */ hammer_cluster_t hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, - int *errorp, hammer_alloc_state_t isnew) + int *errorp, int getflags) { hammer_cluster_t cluster; @@ -746,8 +767,12 @@ again: /* * Deal with on-disk info */ - if (cluster->ondisk == NULL || isnew) { - *errorp = hammer_load_cluster(cluster, isnew); + while (cluster->io.loading) { + hammer_lock_ex(&cluster->io.lock); + hammer_unlock(&cluster->io.lock); + } + if (cluster->ondisk == NULL || getflags) { + *errorp = hammer_load_cluster(cluster, getflags); if (*errorp) { hammer_rel_cluster(cluster, 1); cluster = NULL; @@ -770,6 +795,10 @@ hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp) /* * Deal with on-disk info */ + while (cluster->io.loading) { + hammer_lock_ex(&cluster->io.lock); + hammer_unlock(&cluster->io.lock); + } if (cluster->ondisk == NULL) { *errorp = hammer_load_cluster(cluster, 0); if (*errorp) { @@ -784,7 +813,7 @@ hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp) static int -hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) +hammer_load_cluster(hammer_cluster_t cluster, int getflags) { hammer_volume_t volume = cluster->volume; struct hammer_cluster_ondisk *ondisk; @@ -794,12 +823,15 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) * Load the cluster's on-disk info */ hammer_lock_ex(&cluster->io.lock); + cluster->io.loading = 1; if (cluster->ondisk == NULL) { - if (isnew) + KKASSERT(TAILQ_EMPTY(&cluster->io.deplist)); + if (getflags & GET_CLUSTER_NEW) error = hammer_io_new(volume->devvp, &cluster->io); else error = hammer_io_read(volume->devvp, &cluster->io); if (error) { + cluster->io.loading = 0; hammer_unlock(&cluster->io.lock); return (error); } @@ -817,7 +849,7 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) cluster->alist_mdata.meta = ondisk->clu_mdata_meta; cluster->alist_mdata.info = cluster; - if (isnew == 0) { + if ((getflags & GET_CLUSTER_NEW) == 0) { /* * Load cluster range info for easy access */ @@ -830,14 +862,15 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) * chunk of time. 
*/ /*if (ondisk->clu_flags & HAMMER_CLUF_OPEN)*/ + if ((getflags & GET_CLUSTER_NORECOVER) == 0) hammer_recover(cluster); } - } else if (isnew) { + } else if (getflags & GET_CLUSTER_NEW) { error = hammer_io_new(volume->devvp, &cluster->io); } else { error = 0; } - if (error == 0 && isnew) { + if (error == 0 && (getflags & GET_CLUSTER_NEW)) { /* * If this is a new cluster we have to initialize * various ondisk structural elements. The caller is @@ -864,6 +897,7 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) ondisk->clu_no = cluster->clu_no; ondisk->clu_flags = 0; ondisk->clu_start = HAMMER_BUFSIZE; + ondisk->synchronized_rec_id = 1; /* XXX timestamp */ KKASSERT(voldisk->vol_clo_end > cluster->io.offset); if (voldisk->vol_clo_end - cluster->io.offset > voldisk->vol_clsize) { @@ -873,7 +907,6 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) cluster->io.offset); } nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE; - KKASSERT(isnew == HAMMER_ASTATE_FREE); hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1, HAMMER_ASTATE_FREE); hammer_alist_init(&cluster->alist_btree, @@ -914,6 +947,7 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) hammer_rel_node(croot); } } + cluster->io.loading = 0; hammer_unlock(&cluster->io.lock); return (error); } @@ -938,6 +972,17 @@ hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused) * recovery. NOTE: The cluster header is not written out until all related * records have been written out. */ +u_int64_t +hammer_alloc_recid(hammer_cluster_t cluster) +{ + u_int64_t recid; + + hammer_modify_cluster(cluster); + recid = cluster->ondisk->synchronized_rec_id++; + return(recid); +} + +#if 0 void hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid) { @@ -945,6 +990,7 @@ hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid) if (cluster->ondisk->synchronized_tid < tid) cluster->ondisk->synchronized_tid = tid; } +#endif /* * Reference a cluster that is either already referenced or via a specially @@ -961,6 +1007,10 @@ hammer_ref_cluster(hammer_cluster_t cluster) /* * Deal with on-disk info */ + while (cluster->io.loading) { + hammer_lock_ex(&cluster->io.lock); + hammer_unlock(&cluster->io.lock); + } if (cluster->ondisk == NULL) { error = hammer_load_cluster(cluster, 0); if (error) @@ -1077,6 +1127,10 @@ again: /* * Deal with on-disk info */ + while (buffer->io.loading) { + hammer_lock_ex(&buffer->io.lock); + hammer_unlock(&buffer->io.lock); + } if (buffer->ondisk == NULL || buf_type) { *errorp = hammer_load_buffer(buffer, buf_type); if (*errorp) { @@ -1101,6 +1155,7 @@ hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) */ volume = buffer->volume; hammer_lock_ex(&buffer->io.lock); + buffer->io.loading = 1; if (buffer->ondisk == NULL) { if (buf_type) { error = hammer_io_new(volume->devvp, &buffer->io); @@ -1108,6 +1163,7 @@ hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) error = hammer_io_read(volume->devvp, &buffer->io); } if (error) { + buffer->io.loading = 0; hammer_unlock(&buffer->io.lock); return (error); } @@ -1126,6 +1182,7 @@ hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type); buffer->buf_type = ondisk->head.buf_type; } + buffer->io.loading = 0; hammer_unlock(&buffer->io.lock); return (error); } @@ -1153,6 +1210,10 @@ hammer_ref_buffer(hammer_buffer_t buffer) int error; hammer_ref(&buffer->io.lock); + while (buffer->io.loading) { + 
hammer_lock_ex(&buffer->io.lock); + hammer_unlock(&buffer->io.lock); + } if (buffer->ondisk == NULL) { error = hammer_load_buffer(buffer, 0); if (error) { @@ -1596,7 +1657,7 @@ hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint, if (clu_no != HAMMER_ALIST_BLOCK_NONE) { kprintf("ALLOC CLUSTER %d:%d\n", volume->vol_no, clu_no); cluster = hammer_get_cluster(volume, clu_no, errorp, - HAMMER_ASTATE_FREE); + GET_CLUSTER_NEW); volume->clu_iterator = clu_no; hammer_rel_volume(volume, 0); } else { @@ -2296,27 +2357,6 @@ buffer_alist_init(void *info, int32_t blk, int32_t radix, return(0); } -static int -buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count) -{ - hammer_cluster_t cluster = info; - hammer_buffer_t buffer; - int32_t buf_no; - int error = 0; - - buf_no = blk / HAMMER_FSBUF_MAXBLKS; - buffer = hammer_get_buffer(cluster, buf_no, 0, &error); - if (buffer) { - hammer_modify_buffer(buffer); - error = hammer_alist_recover(&buffer->alist, blk, 0, count); - /* free block count is returned if >= 0 */ - hammer_rel_buffer(buffer, 0); - } else { - error = -error; - } - return (error); -} - /* * Note: This routine is only called when freeing the last elements of * an initialized buffer. Freeing all elements of the buffer when the @@ -2414,6 +2454,26 @@ buffer_alist_free(void *info, int32_t blk, int32_t radix, } } +static int32_t +buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk) +{ + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; + int32_t buf_no; + int error = 0; + + buf_no = blk / HAMMER_FSBUF_MAXBLKS; + buffer = hammer_get_buffer(cluster, buf_no, 0, &error); + if (buffer) { + KKASSERT(buffer->ondisk->head.buf_type != 0); + blk = hammer_alist_find(&buffer->alist, atblk - blk, radix); + hammer_rel_buffer(buffer, 0); + } else { + blk = HAMMER_ALIST_BLOCK_NONE; + } + return(blk); +} + static void buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab) { @@ -2624,6 +2684,7 @@ hammer_init_alist_config(void) config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd; config->bl_radix_alloc_rev = buffer_alist_alloc_rev; config->bl_radix_free = buffer_alist_free; + config->bl_radix_find = buffer_alist_find; config->bl_radix_print = buffer_alist_print; } diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index d3cab3638a..63871983a1 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -31,16 +31,14 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 *
- * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.4 2008/01/18 07:02:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.5 2008/01/21 00:00:19 dillon Exp $
 */
 
 #include "hammer.h"
 
-static void hammer_recover_buffer_stage1(hammer_cluster_t cluster,
+static int hammer_recover_buffer_stage2(hammer_cluster_t cluster,
 					int32_t buf_no);
-static void hammer_recover_buffer_stage2(hammer_cluster_t cluster,
-					int32_t buf_no);
-static int hammer_recover_record(hammer_cluster_t cluster,
+static int hammer_recover_record(hammer_cluster_t cluster,
 			hammer_buffer_t buffer, int32_t rec_offset,
 			hammer_record_ondisk_t rec);
 static int hammer_recover_btree(hammer_cluster_t cluster,
@@ -57,20 +55,22 @@ hammer_recover(hammer_cluster_t cluster)
 {
 	int buf_no;
 	int nbuffers;
-	int32_t r;
+	int buffer_count;
+	int record_count;
 	u_int32_t bitmap;
 
-	return(0);	/* XXX */
+	return(0);
 	kprintf("HAMMER_RECOVER %d:%d\n",
 		cluster->volume->vol_no, cluster->clu_no);
-	KKASSERT(cluster->ondisk->synchronized_tid);
+	KKASSERT(cluster->ondisk->synchronized_rec_id);
 	nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
 	hammer_modify_cluster(cluster);
 
 	/*
-	 * Re-initialize the A-lists.
+	 * Re-initialize the master, B-Tree, and mdata A-lists, and
+	 * recover the record A-list.
 	 */
 	hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
 			  HAMMER_ASTATE_FREE);
@@ -82,22 +82,15 @@ hammer_recover(hammer_cluster_t cluster)
 			  HAMMER_FSBUF_MAXBLKS,
 			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
 			  HAMMER_ASTATE_ALLOC);
-	hammer_alist_init(&cluster->alist_record,
+	hammer_alist_recover(&cluster->alist_record,
+			  0,
 			  HAMMER_FSBUF_MAXBLKS,
-			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
-			  HAMMER_ASTATE_ALLOC);
+			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS);
 
+#if 0
 	/*
 	 * Scan the cluster's clu_record_buf_bitmap, reserve record buffers
-	 * from the master bitmap before we try to recover their data.  Free
-	 * the block of records to alist_record.
-	 *
-	 * We need to mark the blocks as free in alist_record so future
-	 * allocations will dive into the buffer A-list's, but we don't
-	 * want to destroy the underlying buffer A-list's.  Because the
-	 * meta data in cluster->alist_record indicates state 00 (all-allocated
-	 * but not initialized), it will not dive down into the buffer when
-	 * freeing the entire buffer.
+	 * from the master bitmap.
 	 */
 	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
 		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
@@ -109,15 +102,12 @@ hammer_recover(hammer_cluster_t cluster)
 			continue;
 		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
 		KKASSERT(r == buf_no);
-		hammer_alist_free(&cluster->alist_record,
-				  buf_no * HAMMER_FSBUF_MAXBLKS,
-				  HAMMER_FSBUF_MAXBLKS);
 	}
 
 	/*
 	 * Scan the cluster's clu_record_buf_bitmap, reassign buffers
 	 * from alist_master to alist_record, and reallocate individual
-	 * records and any related data reference if they meet the critera.
+	 * records and any related data reference if they meet the criteria.
 	 */
 	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
 		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
@@ -129,6 +119,7 @@ hammer_recover(hammer_cluster_t cluster)
 			continue;
 		hammer_recover_buffer_stage1(cluster, buf_no);
 	}
+#endif
 
 	/*
 	 * The cluster is now in good enough shape that general allocations
@@ -147,14 +138,16 @@ hammer_recover(hammer_cluster_t cluster)
 			cluster->ondisk->clu_btree_root = croot->node_offset;
 			hammer_rel_node(croot);
 		}
+		KKASSERT(error == 0);
 	}
 
 	/*
 	 * Scan the cluster's clu_record_buf_bitmap again and regenerate
 	 * the B-Tree.
-	 *
-	 * XXX B-tree record for cluster-push
 	 */
+	buffer_count = 0;
+	record_count = 0;
+
 	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
 		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
 		if (bitmap == 0) {
@@ -163,10 +156,12 @@
 		}
 		if ((bitmap & (1 << (buf_no & 31))) == 0)
 			continue;
-		hammer_recover_buffer_stage2(cluster, buf_no);
+		++buffer_count;
+		record_count += hammer_recover_buffer_stage2(cluster, buf_no);
 	}
-	kprintf("HAMMER_RECOVER DONE %d:%d\n",
-		cluster->volume->vol_no, cluster->clu_no);
+	kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
+		cluster->volume->vol_no, cluster->clu_no,
+		buffer_count, record_count);
 
 	/*
 	 * Validate the parent cluster pointer. XXX
@@ -177,16 +172,34 @@
 
 /*
  * Reassign buf_no as a record buffer and recover any related data
  * references.
+ *
+ * This is used in the alist callback and must return a negative error
+ * code or a positive free block count.
 */
-static void
-hammer_recover_buffer_stage1(hammer_cluster_t cluster, int32_t buf_no)
+int
+buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
 {
+	hammer_cluster_t cluster;
 	hammer_record_ondisk_t rec;
 	hammer_buffer_t buffer;
+	int32_t buf_no;
 	int32_t rec_no;
 	int32_t rec_offset;
+	int32_t r;
 	int error;
 
+	/*
+	 * Extract cluster and buffer number to recover
+	 */
+	cluster = info;
+	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
+
+	/*
+	 * Mark the buffer as allocated in the cluster's master A-list.
+	 */
+	r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
+	KKASSERT(r == buf_no);
+
 	kprintf("recover buffer1 %d:%d:%d\n",
 		cluster->volume->vol_no,
 		cluster->clu_no, buf_no);
@@ -199,15 +212,20 @@
 		kprintf("hammer_recover_buffer_stage1: error "
 			"recovering %d:%d:%d\n",
 			cluster->volume->vol_no, cluster->clu_no, buf_no);
-		return;
+		return(-error);
 	}
 
 	/*
 	 * Recover the buffer, scan and validate allocated records.  Records
 	 * which cannot be recovered are freed.
+	 *
+	 * The parent a-list must be properly adjusted, so don't just call
+	 * hammer_alist_recover() on the underlying buffer.  Go through the
+	 * parent.
 	 */
 	hammer_modify_buffer(buffer);
-	hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
+	count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
+	kprintf("hammer_recover_buffer count1 %d/%d\n", count, HAMMER_RECORD_NODES);
 	rec_no = -1;
 	for (;;) {
 		rec_no = hammer_alist_find(&buffer->alist, rec_no + 1,
@@ -228,9 +246,12 @@
 			kprintf("hammer_recover_record: failed %d:%d@%d\n",
 				cluster->clu_no, buffer->buf_no, rec_offset);
 			hammer_alist_free(&buffer->alist, rec_no, 1);
+			++count;	/* free count */
 		}
 	}
+	kprintf("hammer_recover_buffer count2 %d/%d\n", count, HAMMER_RECORD_NODES);
 	hammer_rel_buffer(buffer, 0);
+	return(count);
 }
 
 /*
@@ -243,7 +264,7 @@ hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
 		      int32_t rec_offset, hammer_record_ondisk_t rec)
 {
 	hammer_buffer_t dbuf;
-	hammer_tid_t syncid = cluster->ondisk->synchronized_tid;
+	u_int64_t syncid = cluster->ondisk->synchronized_rec_id;
 	int32_t data_offset;
 	int32_t data_len;
 	int32_t nblks;
@@ -254,17 +275,22 @@
 	int error = 0;
 
 	/*
-	 * Discard records created after the synchronization point and
-	 * undo any deletions made after the synchronization point.
+	 * We have to discard any records with rec_ids greater than the
+	 * last sync of the cluster header (which guaranteed all related
+	 * buffers had been synced).  Otherwise the record may reference
+	 * information that was never synced to disk.
 	 */
-	if (rec->base.base.create_tid > syncid) {
+	if (rec->base.rec_id >= syncid) {
 		kprintf("recover record: syncid too large %016llx/%016llx\n",
-			rec->base.base.create_tid, syncid);
+			rec->base.rec_id, syncid);
 		return(EINVAL);
 	}
+#if 0
+	/* XXX undo incomplete deletions */
 	if (rec->base.base.delete_tid > syncid)
 		rec->base.base.delete_tid = 0;
+#endif
 
 	/*
 	 * Validate the record's B-Tree key
@@ -450,14 +476,17 @@ done:
 
 /*
  * Rebuild the B-Tree for the records residing in the specified buffer.
+ *
+ * Return the number of records recovered.
 */
-static void
+static int
 hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
 {
 	hammer_record_ondisk_t rec;
 	hammer_buffer_t buffer;
 	int32_t rec_no;
 	int32_t rec_offset;
+	int record_count = 0;
 	int error;
 
 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
@@ -469,7 +498,7 @@ hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
 		kprintf("hammer_recover_buffer_stage2: error "
 			"recovering %d:%d:%d\n",
 			cluster->volume->vol_no, cluster->clu_no, buf_no);
-		return;
+		return(0);
 	}
 
 	/*
@@ -492,11 +521,17 @@ hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
 				cluster->clu_no, buffer->buf_no, rec_offset);
 			/* XXX free the record and its data? */
 			/*hammer_alist_free(&buffer->alist, rec_no, 1);*/
+		} else {
+			++record_count;
 		}
 	}
 	hammer_rel_buffer(buffer, 0);
+	return(record_count);
 }
 
+/*
+ * Enter a single record into the B-Tree.
+ */
 static int
 hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 			int32_t rec_offset, hammer_record_ondisk_t rec)
@@ -507,10 +542,11 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 	int error = 0;
 
 	/*
-	 * Check for a spike record.
+	 * Check for a spike record.  When spiking into a new cluster do
+	 * NOT allow a recursive recovery to occur.  We use a lot of
+	 * stack and the only thing we actually modify in the target
+	 * cluster is its parent pointer.
 	 */
-	kprintf("hammer_recover_btree cluster %p (%d) type %d\n",
-		cluster, cluster->clu_no, rec->base.base.rec_type);
 	if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) {
 		hammer_volume_t ovolume = cluster->volume;
 		hammer_volume_t nvolume;
@@ -520,7 +556,7 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 		if (error)
 			return(error);
 		ncluster = hammer_get_cluster(nvolume, rec->spike.clu_no,
-					      &error, 0);
+					      &error, GET_CLUSTER_NORECOVER);
 		hammer_rel_volume(nvolume, 0);
 		if (error)
 			return(error);
@@ -547,9 +583,11 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 	/*
 	 * Locate the insertion point.  Note that we are using the cluster-
 	 * localized cursor init so parent will start out NULL.
+	 *
+	 * The keys used for spikes are bounds and differ from the
+	 * key embedded in the spike record.  A special B-Tree insertion
+	 * call is made to deal with spikes.
 	 */
-	kprintf("hammer_recover_btree init_cursor_cluster cluster %p (%d) type %d ncluster %p\n",
-		cluster, cluster->clu_no, rec->base.base.rec_type, ncluster);
 	error = hammer_init_cursor_cluster(&cursor, cluster);
 	if (error)
 		goto failed;
@@ -558,13 +596,14 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 		cursor.key_beg = ncluster->ondisk->clu_btree_beg;
 	else
 		cursor.key_beg = rec->base.base;
-	cursor.flags |= HAMMER_CURSOR_INSERT;
+	cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_RECOVER;
 
 	error = hammer_btree_lookup(&cursor);
 	KKASSERT(error != EDEADLK);
 	KKASSERT(cursor.node);
 	if (error == 0) {
-		kprintf("hammer_recover_btree: Duplicate record\n");
+		kprintf("hammer_recover_btree: Duplicate record cursor %p rec %p ncluster %p\n",
+			&cursor, rec, ncluster);
 		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
 				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
 		Debugger("duplicate record");
 	}
@@ -589,8 +628,10 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
 	/*
 	 * Normal record
 	 */
+#if 0
 	kprintf("recover recrd clu %d %016llx\n",
 		cluster->clu_no, rec->base.base.obj_id);
+#endif
 	elm.leaf.base = rec->base.base;
 	elm.leaf.rec_offset = rec_offset;
 	elm.leaf.data_offset = rec->base.data_offset;
diff --git a/sys/vfs/hammer/hammer_spike.c b/sys/vfs/hammer/hammer_spike.c
index 7b7a15ac2f..e3efb45a32 100644
--- a/sys/vfs/hammer/hammer_spike.c
+++ b/sys/vfs/hammer/hammer_spike.c
@@ -31,7 +31,7 @@
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.9 2008/01/18 07:02:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.10 2008/01/21 00:00:19 dillon Exp $
 */
 
 #include "hammer.h"
@@ -56,6 +56,7 @@ hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep)
 	spike->index = cursor->index;
 	spike->left_bound = cursor->left_bound;
 	spike->right_bound = cursor->right_bound;
+	spike->key_beg = cursor->key_beg;
 
 	if (spike->parent) {
 		hammer_ref_node(spike->parent);
@@ -92,6 +93,7 @@ hammer_spike(struct hammer_cursor **spikep)
 	hammer_node_t onode;
 	hammer_record_ondisk_t rec;
 	int error;
+	int b, e;
 
 	kprintf("hammer_spike: ENOSPC in cluster, spiking\n");
 	/*Debugger("ENOSPC");*/
@@ -135,19 +137,22 @@ hammer_spike(struct hammer_cursor **spikep)
 	if (error)
 		goto failed2;
 
+	/*
+	 * Calculate the range within the leaf to spike and expand it to
+	 * either side to try to open up more space in the current leaf.
+	 */
+
+
 	/*
 	 * Copy the elements in the leaf node.
 	 */
 	for (spike->index = 0; spike->index < onode->ondisk->count;
 	     ++spike->index) {
+		KKASSERT(spike->node->ondisk->elms[spike->index].leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
 		error = hammer_btree_extract(spike, HAMMER_CURSOR_GET_RECORD |
 						    HAMMER_CURSOR_GET_DATA);
 		if (error)
 			goto failed1;
-		kprintf("EXTRACT %04x %016llx %016llx\n",
-			spike->record->base.base.rec_type,
-			spike->record->base.base.obj_id,
-			spike->record->base.base.key);
 		KKASSERT(spike->record->base.base.rec_type !=
 			 HAMMER_RECTYPE_CLUSTER);
 		error = hammer_write_record(&ncursor, spike->record,
@@ -164,6 +169,9 @@ hammer_spike(struct hammer_cursor **spikep)
 	/*
 	 * Delete the records and data associated with the old leaf node,
 	 * then free the old leaf node (nothing references it any more).
+	 *
+	 * XXX I/O ordering issue, we're destroying these records too
+	 * early, but we need one for the spike allocation.  What to do?
 	 */
 	for (spike->index = 0; spike->index < onode->ondisk->count;
 	     ++spike->index) {
@@ -188,6 +196,7 @@
 	 */
 	rec = hammer_alloc_record(ocluster, &error, &spike->record_buffer);
 	KKASSERT(error == 0);
+	rec->spike.base.rec_id = hammer_alloc_recid(ocluster);
 	rec->spike.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
 	rec->spike.base.base.rec_type = HAMMER_RECTYPE_CLUSTER;
 	rec->spike.clu_no = ncluster->clu_no;
@@ -195,7 +204,8 @@
 	rec->spike.clu_id = 0;
 
 	/*
-	 * Construct the spike elements
+	 * Construct the spike elements.  Note that the right boundary
+	 * is range-exclusive.
 	 */
 	hammer_modify_node(onode);
 	ondisk = onode->ondisk;
@@ -210,7 +220,6 @@
 	ondisk->elms[0].leaf.spike_unused01 = 0;
 
 	ondisk->elms[1].leaf.base = *spike->right_bound;
-	hammer_make_base_inclusive(&ondisk->elms[1].leaf.base);
 	ondisk->elms[1].leaf.base.btype = HAMMER_BTREE_TYPE_SPIKE_END;
 	ondisk->elms[1].leaf.rec_offset = ondisk->elms[0].leaf.rec_offset;
 	ondisk->elms[1].leaf.spike_clu_no = ncluster->clu_no;
diff --git a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c
index a2e3433ada..6055972ab1 100644
--- a/sys/vfs/hammer/hammer_transaction.c
+++ b/sys/vfs/hammer/hammer_transaction.c
@@ -31,7 +31,7 @@
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.7 2008/01/10 07:41:03 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.8 2008/01/21 00:00:19 dillon Exp $
 */
 
 #include "hammer.h"
 
@@ -101,14 +101,3 @@ hammer_alloc_tid(hammer_transaction_t trans)
 	return(tid);
 }
 
-hammer_tid_t
-hammer_alloc_recid(hammer_transaction_t trans)
-{
-	hammer_volume_ondisk_t ondisk;
-	hammer_tid_t recid;
-
-	hammer_modify_volume(trans->rootvol);
-	ondisk = trans->rootvol->ondisk;
-	recid = ++ondisk->vol0_recid;
-	return(recid);
-}
-- 
2.41.0
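
A note for readers following the io.loading interlock above: hammer_load_cluster()
and hammer_load_buffer() keep io.loading set for the entire read-plus-initialize
(and possibly recover) sequence, and hammer_ref_cluster()/hammer_ref_buffer()
ride io.lock until the flag clears. The sketch below models that handshake with
pthreads so it can be compiled and reasoned about outside the kernel; the demo_*
names and the pthread mutex are invented stand-ins, not HAMMER API.

#include <pthread.h>
#include <stddef.h>

/*
 * Compilable model of the io.loading interlock.  Only the field names
 * mirror struct hammer_io; everything else is illustrative.
 */
struct demo_io {
	pthread_mutex_t lock;		/* stands in for io.lock */
	int		loading;	/* stands in for io.loading */
	void	       *ondisk;		/* non-NULL once the header is in */
};

static char demo_header[64];		/* placeholder for the disk image */

/* Placeholder for the read + recover sequence done under the lock. */
static void *
demo_read_and_recover(struct demo_io *io)
{
	(void)io;
	return (demo_header);
}

/*
 * Loader: 'loading' stays set across the whole read-plus-recover
 * sequence, so 'ondisk' becoming non-NULL partway through never
 * exposes a half-initialized structure to other referencers.
 */
static void *
demo_load(struct demo_io *io)
{
	pthread_mutex_lock(&io->lock);
	io->loading = 1;
	if (io->ondisk == NULL)
		io->ondisk = demo_read_and_recover(io);
	io->loading = 0;
	pthread_mutex_unlock(&io->lock);
	return (io->ondisk);
}

/*
 * Referencer: wait out any in-flight load by riding the lock.  The
 * kernel version samples the flag before taking the lock, which is
 * safe in its execution environment; under plain pthreads the flag
 * must be sampled while the mutex is held, as done here.
 */
static void *
demo_ref(struct demo_io *io)
{
	int busy;

	do {
		pthread_mutex_lock(&io->lock);
		busy = io->loading;
		pthread_mutex_unlock(&io->lock);
	} while (busy);
	return (io->ondisk);
}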
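The record-id scheme is small enough to model in isolation: hammer_alloc_recid()
now hands out ids from a per-cluster counter kept in the cluster header (seeded
to 1 for a new cluster), and hammer_recover_record() discards any record whose
rec_id is >= the header's synchronized_rec_id. A minimal sketch, with demo_*
scaffolding invented for illustration:

#include <stdint.h>

/* Illustrative stand-ins for the relevant on-disk fields. */
struct demo_cluster_ondisk {
	uint64_t synchronized_rec_id;	/* next rec_id; seeded to 1 */
};

struct demo_record {
	uint64_t rec_id;
};

/*
 * Per-cluster record id allocation in the style of the new
 * hammer_alloc_recid(): post-increment, so every id handed out before
 * a header sync is strictly less than the synchronized value the
 * header carries afterwards.
 */
static uint64_t
demo_alloc_recid(struct demo_cluster_ondisk *ondisk)
{
	return (ondisk->synchronized_rec_id++);
}

/*
 * Recovery-time filter in the style of hammer_recover_record(): a
 * record with rec_id >= the last synchronized id may reference buffers
 * that never made it to disk, so it must be discarded.
 */
static int
demo_record_is_recoverable(const struct demo_cluster_ondisk *ondisk,
			   const struct demo_record *rec)
{
	return (rec->rec_id < ondisk->synchronized_rec_id);
}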
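buffer_alist_recover()'s contract with the A-list recovery pass is its
overloaded return value: a negative errno on failure, otherwise a non-negative
count of free blocks found, which the parent layer folds into its own bitmap
accounting. A hedged sketch of a callback honoring that contract; the
demo_scan_buffer() helper is hypothetical and stands in for the per-buffer
record validation pass:

#include <stdint.h>

/* Hypothetical per-buffer scan standing in for record validation. */
static int
demo_scan_buffer(void *info, int32_t blk, int32_t *free_countp)
{
	(void)info;
	(void)blk;
	*free_countp = 0;	/* pretend every block is still allocated */
	return (0);
}

/*
 * Callback honoring the recovery contract: negative errno on failure,
 * otherwise a non-negative free block count for the parent layer.
 */
static int
demo_alist_recover_cb(void *info, int32_t blk, int32_t radix, int32_t count)
{
	int32_t free_count;
	int error;

	(void)radix;
	(void)count;
	error = demo_scan_buffer(info, blk, &free_count);
	if (error)
		return (-error);	/* failure: negative errno */
	return (free_count);		/* success: free block count */
}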
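Finally, on the range-exclusive spike boundary: with the SPIKE_END element now
taken directly from *spike->right_bound (no hammer_make_base_inclusive()
adjustment), a key belongs to the spiked cluster when it is >= the beginning
bound and strictly < the ending bound. A one-line illustration of the exclusive
check; the demo name and the flat int64_t key (rather than a full base element
compared with hammer_btree_cmp()) are simplifications:

#include <stdint.h>

/* k falls inside the half-open range [beg, end). */
static int
demo_key_in_spike_range(int64_t k, int64_t beg, int64_t end)
{
	return (beg <= k && k < end);
}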