From 32c90105bec38cf3ca15d5f66aa481edd792a49c Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 5 Feb 2008 07:58:43 +0000
Subject: [PATCH] HAMMER 25/many: Pruning code

* Add b_tid to struct buf so dirty buffer cache buffers can be tagged with
  a transaction id to try to retain consistency when doing as-of queries on
  files that change size (so the data records have a TID <= the inode
  record).  This is also an issue when a file is created and immediately
  written to.  This may be temporary; a more sophisticated solution is
  needed.

* Fix a bug in the special handling of create_tid for as-of queries in
  btree_search().  An assignment was off by one, causing historical queries
  to be unable to find bits of data here and there.

* Freeze the transaction id for newly created inodes until the initial
  inode record is laid down on disk, so the transaction id matches the
  transaction id of the related directory entry.

* Major work on the pruning code.  When pruning the tree to a particular
  granularity the create_tid and delete_tid of related records must be
  aligned to that granularity in order to avoid creating 'holes' at various
  time points.  This requires some serious B-Tree manipulation because the
  right-hand boundary may need to be shifted when the create_tid of an
  existing record is forward aligned.  This work is still in progress but
  it works in basic testing.

  Prune the tree in the reverse direction instead of in the forward
  direction.  This keeps the B-Tree consistent when we have to adjust the
  right-hand boundary to accommodate the realignment of create_tid.
---
 sbin/hammer/cmd_prune.c        |  10 +-
 sbin/hammer/misc.c             |  17 +-
 sys/sys/buf.h                  |   3 +-
 sys/vfs/hammer/hammer.h        |  10 +-
 sys/vfs/hammer/hammer_btree.c  | 543 ++++++++++++++++++++++++++++++++-
 sys/vfs/hammer/hammer_cursor.c |  31 +-
 sys/vfs/hammer/hammer_inode.c  |   9 +-
 sys/vfs/hammer/hammer_ioctl.c  | 195 ++++++++++--
 sys/vfs/hammer/hammer_ioctl.h  |   9 +-
 sys/vfs/hammer/hammer_object.c |  14 +-
 sys/vfs/hammer/hammer_vnops.c  |  21 +-
 11 files changed, 798 insertions(+), 64 deletions(-)

diff --git a/sbin/hammer/cmd_prune.c b/sbin/hammer/cmd_prune.c
index d529fc98e5..92f6042b82 100644
--- a/sbin/hammer/cmd_prune.c
+++ b/sbin/hammer/cmd_prune.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sbin/hammer/Attic/cmd_prune.c,v 1.1 2008/02/04 08:34:22 dillon Exp $
+ * $DragonFly: src/sbin/hammer/Attic/cmd_prune.c,v 1.2 2008/02/05 07:58:40 dillon Exp $
  */
 
 #include "hammer.h"
@@ -60,10 +60,10 @@ hammer_cmd_prune(char **av, int ac)
 
 	bzero(&prune, sizeof(prune));
 	prune.nelms = 0;
-	prune.beg_objid = HAMMER_MIN_OBJID;
-	prune.cur_objid = prune.beg_objid;
-	prune.end_objid = HAMMER_MAX_OBJID;
-	prune.cur_key = HAMMER_MIN_KEY;
+	prune.beg_obj_id = HAMMER_MIN_OBJID;
+	prune.end_obj_id = HAMMER_MAX_OBJID;
+	prune.cur_obj_id = prune.end_obj_id;	/* reverse scan */
+	prune.cur_key = HAMMER_MAX_KEY;
 
 	if (ac == 0)
 		prune_usage(1);
diff --git a/sbin/hammer/misc.c b/sbin/hammer/misc.c
index 3e5a180ad2..6871d18006 100644
--- a/sbin/hammer/misc.c
+++ b/sbin/hammer/misc.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
* - * $DragonFly: src/sbin/hammer/misc.c,v 1.2 2008/01/21 00:03:31 dillon Exp $ + * $DragonFly: src/sbin/hammer/misc.c,v 1.3 2008/02/05 07:58:40 dillon Exp $ */ #include "hammer.h" @@ -65,21 +65,16 @@ hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) if (key1->key > key2->key) return(2); - /* - * A delete_tid of zero indicates a record which has not been - * deleted yet and must be considered to have a value of positive - * infinity. - */ - if (key1->delete_tid == 0) { - if (key2->delete_tid == 0) + if (key1->create_tid == 0) { + if (key2->create_tid == 0) return(0); return(1); } - if (key2->delete_tid == 0) + if (key2->create_tid == 0) return(-1); - if (key1->delete_tid < key2->delete_tid) + if (key1->create_tid < key2->create_tid) return(-1); - if (key1->delete_tid > key2->delete_tid) + if (key1->create_tid > key2->create_tid) return(1); return(0); } diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 7273c38b08..b99683f942 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -37,7 +37,7 @@ * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD: src/sys/sys/buf.h,v 1.88.2.10 2003/01/25 19:02:23 dillon Exp $ - * $DragonFly: src/sys/sys/buf.h,v 1.42 2008/01/10 07:34:03 dillon Exp $ + * $DragonFly: src/sys/sys/buf.h,v 1.43 2008/02/05 07:58:41 dillon Exp $ */ #ifndef _SYS_BUF_H_ @@ -173,6 +173,7 @@ struct buf { struct xio b_xio; /* data buffer page list management */ struct bio_ops *b_ops; /* bio_ops used w/ b_dep */ struct workhead b_dep; /* List of filesystem dependencies. */ + u_int64_t b_tid; /* Transaction id */ }; /* diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index d500f68f76..65ee397a7c 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.31 2008/02/04 08:33:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.32 2008/02/05 07:58:43 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -535,7 +535,8 @@ int hammer_cursor_toroot(hammer_cursor_t cursor); int hammer_cursor_down(hammer_cursor_t cursor); int hammer_cursor_upgrade(hammer_cursor_t cursor); void hammer_cursor_downgrade(hammer_cursor_t cursor); - +int hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, + int index); void hammer_lock_ex(struct hammer_lock *lock); int hammer_lock_ex_try(struct hammer_lock *lock); void hammer_lock_sh(struct hammer_lock *lock); @@ -567,14 +568,19 @@ void hammer_mem_done(hammer_cursor_t cursor); int hammer_btree_lookup(hammer_cursor_t cursor); int hammer_btree_first(hammer_cursor_t cursor); +int hammer_btree_last(hammer_cursor_t cursor); int hammer_btree_extract(hammer_cursor_t cursor, int flags); int hammer_btree_iterate(hammer_cursor_t cursor); +int hammer_btree_iterate_reverse(hammer_cursor_t cursor); int hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm); int hammer_btree_insert_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster, int32_t rec_offset); int hammer_btree_delete(hammer_cursor_t cursor); int hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2); int hammer_btree_chkts(hammer_tid_t ts, hammer_base_elm_t key); +int hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid); +int hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid); + int hammer_btree_lock_children(hammer_cursor_t cursor, struct hammer_node_locklist **locklistp); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 87df345be3..6f752c3e78 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.27 2008/02/04 08:33:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.28 2008/02/05 07:58:43 dillon Exp $ */ /* @@ -356,6 +356,230 @@ hammer_btree_iterate(hammer_cursor_t cursor) return(error); } +/* + * Iterate in the reverse direction. This is used by the pruning code to + * avoid overlapping records. + */ +int +hammer_btree_iterate_reverse(hammer_cursor_t cursor) +{ + hammer_node_ondisk_t node; + hammer_btree_elm_t elm; + int error; + int r; + int s; + + /* + * Skip past the current record. For various reasons the cursor + * may end up set to -1 or set to point at the end of the current + * node. These cases must be addressed. + */ + node = cursor->node->ondisk; + if (node == NULL) + return(ENOENT); + if (cursor->index != -1 && + (cursor->flags & HAMMER_CURSOR_ATEDISK)) { + --cursor->index; + } + if (cursor->index == cursor->node->ondisk->count) + --cursor->index; + + /* + * Loop until an element is found or we are done. + */ + for (;;) { + /* + * We iterate up the tree and then index over one element + * while we are at the last element in the current node. + * + * NOTE: This can pop us up to another cluster. + * + * If we are at the root of the root cluster, cursor_up + * returns ENOENT. + * + * NOTE: hammer_cursor_up() will adjust cursor->key_beg + * when told to re-search for the cluster tag. + * + * XXX this could be optimized by storing the information in + * the parent reference. + * + * XXX we can lose the node lock temporarily, this could mess + * up our scan. 
+ */ + if (cursor->index == -1) { + error = hammer_cursor_up(cursor); + if (error) { + cursor->index = 0; /* sanity */ + break; + } + /* reload stale pointer */ + node = cursor->node->ondisk; + KKASSERT(cursor->index != node->count); + --cursor->index; + continue; + } + + /* + * Check internal or leaf element. Determine if the record + * at the cursor has gone beyond the end of our range. + * + * Generally we recurse down through internal nodes. An + * internal node can only be returned if INCLUSTER is set + * and the node represents a cluster-push record. + */ + KKASSERT(cursor->index != node->count); + if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { + elm = &node->elms[cursor->index]; + r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); + s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); + if (hammer_debug_btree) { + kprintf("BRACKETL %d:%d:%08x[%d] %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index, + elm[0].internal.base.obj_id, + elm[0].internal.base.rec_type, + elm[0].internal.base.key, + r + ); + kprintf("BRACKETR %d:%d:%08x[%d] %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index + 1, + elm[1].internal.base.obj_id, + elm[1].internal.base.rec_type, + elm[1].internal.base.key, + s + ); + } + + if (s >= 0) { + error = ENOENT; + break; + } + KKASSERT(r >= 0); + + /* + * When iterating try to clean up any deleted + * internal elements left over from btree_remove() + * deadlocks, but it is ok if we can't. + */ + if (elm->internal.subtree_offset == 0) { + btree_remove_deleted_element(cursor); + /* note: elm also invalid */ + } else if (elm->internal.subtree_offset != 0) { + error = hammer_cursor_down(cursor); + if (error) + break; + KKASSERT(cursor->index == 0); + cursor->index = cursor->node->ondisk->count - 1; + } + /* reload stale pointer */ + node = cursor->node->ondisk; + continue; + } else { + elm = &node->elms[cursor->index]; + s = hammer_btree_cmp(&cursor->key_beg, &elm->base); + if (hammer_debug_btree) { + kprintf("ELEMENT %d:%d:%08x:%d %c %016llx %02x %016llx %d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, + cursor->index, + (elm[0].leaf.base.btype ? + elm[0].leaf.base.btype : '?'), + elm[0].leaf.base.obj_id, + elm[0].leaf.base.rec_type, + elm[0].leaf.base.key, + s + ); + } + if (s > 0) { + error = ENOENT; + break; + } + + switch(elm->leaf.base.btype) { + case HAMMER_BTREE_TYPE_RECORD: + if ((cursor->flags & HAMMER_CURSOR_ASOF) && + hammer_btree_chkts(cursor->asof, &elm->base)) { + --cursor->index; + continue; + } + break; + case HAMMER_BTREE_TYPE_SPIKE_BEG: + /* + * Skip the spike BEG record. We will hit + * the END record first since we are + * iterating backwards. + */ + --cursor->index; + continue; + case HAMMER_BTREE_TYPE_SPIKE_END: + /* + * The SPIKE_END element is inclusive, NOT + * like a boundary, so be careful with the + * match check. + * + * This code assumes that a preceding SPIKE_BEG + * has already been checked. + */ + if (cursor->flags & HAMMER_CURSOR_INCLUSTER) + break; + error = hammer_cursor_down(cursor); + if (error) + break; + KKASSERT(cursor->index == 0); + /* reload stale pointer */ + node = cursor->node->ondisk; + + /* + * If the cluster root is empty it and its + * related spike can be deleted. Ignore + * errors. 
Cursor + */ + if (node->count == 0) { + error = hammer_cursor_upgrade(cursor); + if (error == 0) + error = btree_remove(cursor); + hammer_cursor_downgrade(cursor); + error = 0; + /* reload stale pointer */ + node = cursor->node->ondisk; + } + cursor->index = node->count - 1; + continue; + default: + error = EINVAL; + break; + } + if (error) + break; + } + /* + * node pointer invalid after loop + */ + + /* + * Return entry + */ + if (hammer_debug_btree) { + int i = cursor->index; + hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; + kprintf("ITERATE %p:%d %016llx %02x %016llx\n", + cursor->node, i, + elm->internal.base.obj_id, + elm->internal.base.rec_type, + elm->internal.base.key + ); + } + return(0); + } + return(error); +} + /* * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry * could not be found, EDEADLK if inserting and a retry is needed, and a @@ -410,6 +634,10 @@ hammer_btree_lookup(hammer_cursor_t cursor) */ break; } + if (hammer_debug_btree) { + kprintf("CREATE_CHECK %016llx\n", + cursor->create_check); + } cursor->key_beg.create_tid = cursor->create_check; /* loop */ } @@ -440,6 +668,28 @@ hammer_btree_first(hammer_cursor_t cursor) return(error); } +/* + * Similarly but for an iteration in the reverse direction. + */ +int +hammer_btree_last(hammer_cursor_t cursor) +{ + struct hammer_base_elm save; + int error; + + save = cursor->key_beg; + cursor->key_beg = cursor->key_end; + error = hammer_btree_lookup(cursor); + cursor->key_beg = save; + if (error == ENOENT || + (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { + cursor->flags &= ~HAMMER_CURSOR_ATEDISK; + error = hammer_btree_iterate_reverse(cursor); + } + cursor->flags |= HAMMER_CURSOR_ATEDISK; + return(error); +} + /* * Extract the record and/or data associated with the cursor's current * position. Any prior record or data stored in the cursor is replaced. @@ -1228,7 +1478,7 @@ new_cluster: * only differs by its create_tid. */ if (r == 1) { - cursor->create_check = elm->base.create_tid; + cursor->create_check = elm->base.create_tid - 1; cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; } if (i != node->count - 1) @@ -1258,7 +1508,7 @@ new_cluster: */ if (r == 1) { cursor->create_check = - elm->base.create_tid; + elm->base.create_tid - 1; cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; } @@ -1837,6 +2087,293 @@ done: return (error); } +/* + * Recursively correct the right-hand boundary's create_tid to (tid) as + * long as the rest of the key matches. We have to recurse upward in + * the tree as well as down the left side of each parent's right node. + * + * Return EDEADLK if we were only partially successful, forcing the caller + * to try again. The original cursor is not modified. This routine can + * also fail with EDEADLK if it is forced to throw away a portion of its + * record history. + * + * The caller must pass a downgraded cursor to us (otherwise we can't dup it). + */ +struct hammer_rhb { + TAILQ_ENTRY(hammer_rhb) entry; + hammer_node_t node; + int index; +}; + +TAILQ_HEAD(hammer_rhb_list, hammer_rhb); + +int +hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) +{ + struct hammer_rhb_list rhb_list; + hammer_base_elm_t elm; + hammer_node_t orig_node; + struct hammer_rhb *rhb; + int orig_index; + int error; + + TAILQ_INIT(&rhb_list); + + /* + * Save our position so we can restore it on return. This also + * gives us a stable 'elm'. 
+	 */
+	orig_node = cursor->node;
+	hammer_ref_node(orig_node);
+	hammer_lock_sh(&orig_node->lock);
+	orig_index = cursor->index;
+	elm = &orig_node->ondisk->elms[orig_index].base;
+
+	/*
+	 * Now build a list of parents going up, allocating a rhb
+	 * structure for each one.
+	 */
+	while (cursor->parent) {
+		/*
+		 * Stop if we no longer have any right-bounds to fix up
+		 */
+		if (elm->obj_id != cursor->right_bound->obj_id ||
+		    elm->rec_type != cursor->right_bound->rec_type ||
+		    elm->key != cursor->right_bound->key) {
+			break;
+		}
+
+		/*
+		 * Stop if the right-hand bound's create_tid does not
+		 * need to be corrected.  Note that if the parent is
+		 * a cluster the bound is pointing at the actual bound
+		 * in the cluster header, not the SPIKE_END element in
+		 * the parent cluster, so we don't have to worry about
+		 * the fact that SPIKE_END is range-inclusive.
+		 */
+		if (cursor->right_bound->create_tid >= tid)
+			break;
+
+		KKASSERT(cursor->parent->ondisk->elms[cursor->parent_index].base.btype != HAMMER_BTREE_TYPE_SPIKE_BEG);
+
+		rhb = kmalloc(sizeof(*rhb), M_HAMMER, M_WAITOK|M_ZERO);
+		rhb->node = cursor->parent;
+		rhb->index = cursor->parent_index;
+		hammer_ref_node(rhb->node);
+		hammer_lock_sh(&rhb->node->lock);
+		TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
+
+		hammer_cursor_up(cursor);
+	}
+
+	/*
+	 * Now safely adjust the right hand bound for each rhb.  This may
+	 * also require taking the right side of the tree and iterating down
+	 * ITS left side.
+	 */
+	error = 0;
+	while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
+		error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
+		kprintf("CORRECT RHB %d:%d:%08x index %d type=%c\n",
+			rhb->node->cluster->volume->vol_no,
+			rhb->node->cluster->clu_no, rhb->node->node_offset,
+			rhb->index, cursor->node->ondisk->type);
+		if (error)
+			break;
+		TAILQ_REMOVE(&rhb_list, rhb, entry);
+		hammer_unlock(&rhb->node->lock);
+		hammer_rel_node(rhb->node);
+		kfree(rhb, M_HAMMER);
+
+		switch (cursor->node->ondisk->type) {
+		case HAMMER_BTREE_TYPE_INTERNAL:
+			/*
+			 * Right-boundary for parent at internal node
+			 * is one element to the right of the element whose
+			 * right boundary needs adjusting.  We must then
+			 * traverse down the left side correcting any left
+			 * bounds (which may now be too far to the left).
+			 */
+			++cursor->index;
+			error = hammer_btree_correct_lhb(cursor, tid);
+			break;
+		case HAMMER_BTREE_TYPE_LEAF:
+			/*
+			 * Right-boundary for parent at leaf node.  Both
+			 * the SPIKE_END and the cluster header must be
+			 * corrected, but we don't have to traverse down
+			 * (there's nothing TO traverse down other than what
+			 * we've already recorded).
+			 *
+			 * The SPIKE_END is range-inclusive.
+ */ + error = hammer_cursor_down(cursor); + if (error == 0) + error = hammer_lock_upgrade(&cursor->parent->lock); + if (error == 0) { + kprintf("hammer_btree_correct_rhb-X @%d:%d:%08x\n", + cursor->parent->cluster->volume->vol_no, + cursor->parent->cluster->clu_no, + cursor->parent->node_offset); + hammer_modify_node(cursor->parent); + elm = &cursor->parent->ondisk->elms[cursor->parent_index].base; + KKASSERT(elm->btype == HAMMER_BTREE_TYPE_SPIKE_END); + elm->create_tid = tid - 1; + hammer_modify_cluster(cursor->node->cluster); + cursor->node->cluster->ondisk->clu_btree_end.create_tid = tid; + cursor->node->cluster->clu_btree_end.create_tid = tid; + } + break; + default: + panic("hammer_btree_correct_rhb(): Bad node type"); + error = EINVAL; + break; + } + } + + /* + * Cleanup + */ + while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { + TAILQ_REMOVE(&rhb_list, rhb, entry); + hammer_unlock(&rhb->node->lock); + hammer_rel_node(rhb->node); + kfree(rhb, M_HAMMER); + } + error = hammer_cursor_seek(cursor, orig_node, orig_index); + hammer_unlock(&orig_node->lock); + hammer_rel_node(orig_node); + return (error); +} + +/* + * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand + * bound going downward starting at the current cursor position. + * + * This function does not restore the cursor after use. + */ +int +hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) +{ + struct hammer_rhb_list rhb_list; + hammer_base_elm_t elm; + hammer_base_elm_t cmp; + struct hammer_rhb *rhb; + int error; + + TAILQ_INIT(&rhb_list); + + cmp = &cursor->node->ondisk->elms[cursor->index].base; + + /* + * Record the node and traverse down the left-hand side for all + * matching records needing a boundary correction. + */ + error = 0; + for (;;) { + rhb = kmalloc(sizeof(*rhb), M_HAMMER, M_WAITOK|M_ZERO); + rhb->node = cursor->node; + rhb->index = cursor->index; + hammer_ref_node(rhb->node); + hammer_lock_sh(&rhb->node->lock); + TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); + + if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { + /* + * Nothing to traverse down if we are at the right + * boundary of an internal node. + */ + if (cursor->index == cursor->node->ondisk->count) + break; + } else { + elm = &cursor->node->ondisk->elms[cursor->index].base; + if (elm->btype == HAMMER_BTREE_TYPE_RECORD) + break; + KKASSERT(elm->btype == HAMMER_BTREE_TYPE_SPIKE_BEG); + } + error = hammer_cursor_down(cursor); + if (error) + break; + + elm = &cursor->node->ondisk->elms[cursor->index].base; + if (elm->obj_id != cmp->obj_id || + elm->rec_type != cmp->rec_type || + elm->key != cmp->key) { + break; + } + if (elm->create_tid >= tid) + break; + + } + + /* + * Now we can safely adjust the left-hand boundary from the bottom-up. + * The last element we remove from the list is the caller's right hand + * boundary, which must also be adjusted. 
+ */ + while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { + error = hammer_cursor_seek(cursor, rhb->node, rhb->index); + if (error) + break; + TAILQ_REMOVE(&rhb_list, rhb, entry); + hammer_unlock(&rhb->node->lock); + hammer_rel_node(rhb->node); + kfree(rhb, M_HAMMER); + + elm = &cursor->node->ondisk->elms[cursor->index].base; + if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { + kprintf("hammer_btree_correct_lhb-I @%d:%d:%08x @%d\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset, cursor->index); + hammer_modify_node(cursor->node); + elm->create_tid = tid; + } else if (elm->btype == HAMMER_BTREE_TYPE_SPIKE_BEG) { + /* + * SPIKE_BEG, also correct cluster header. Occurs + * only while we are traversing the left-hand + * boundary. + */ + kprintf("hammer_btree_correct_lhb-B @%d:%d:%08x\n", + cursor->node->cluster->volume->vol_no, + cursor->node->cluster->clu_no, + cursor->node->node_offset); + hammer_modify_node(cursor->node); + elm->create_tid = tid; + + /* + * We can only cursor down through SPIKE_END. + */ + ++cursor->index; + error = hammer_cursor_down(cursor); + if (error == 0) + error = hammer_lock_upgrade(&cursor->parent->lock); + if (error == 0) { + hammer_modify_node(cursor->parent); + elm = &cursor->parent->ondisk->elms[cursor->parent_index - 1].base; + KKASSERT(elm->btype == HAMMER_BTREE_TYPE_SPIKE_BEG); + elm->create_tid = tid; + hammer_modify_cluster(cursor->node->cluster); + cursor->node->cluster->ondisk->clu_btree_end.create_tid = tid; + cursor->node->cluster->clu_btree_end.create_tid = tid; + } + } else { + panic("hammer_btree_correct_lhb(): Bad element type"); + } + } + + /* + * Cleanup + */ + while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { + TAILQ_REMOVE(&rhb_list, rhb, entry); + hammer_unlock(&rhb->node->lock); + hammer_rel_node(rhb->node); + kfree(rhb, M_HAMMER); + } + return (error); +} + /* * Attempt to remove the empty B-Tree node at (cursor->node). Returns 0 * on success, EAGAIN if we could not acquire the necessary locks, or some diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index 1d58d64f42..f4eab9c258 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.15 2008/01/24 02:14:45 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.16 2008/02/05 07:58:43 dillon Exp $ */ /* @@ -224,6 +224,35 @@ hammer_cursor_downgrade(hammer_cursor_t cursor) hammer_lock_downgrade(&cursor->parent->lock); } +/* + * Seek the cursor to the specified node and index. 
+ */
+int
+hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
+{
+	int error;
+
+	hammer_cursor_downgrade(cursor);
+	error = 0;
+
+	if (cursor->node != node) {
+		hammer_unlock(&cursor->node->lock);
+		hammer_rel_node(cursor->node);
+		cursor->node = node;
+		cursor->index = index;
+		hammer_ref_node(node);
+		hammer_lock_sh(&node->lock);
+
+		if (cursor->parent) {
+			hammer_unlock(&cursor->parent->lock);
+			hammer_rel_node(cursor->parent);
+			cursor->parent = NULL;
+			cursor->parent_index = 0;
+		}
+		error = hammer_load_cursor_parent(cursor);
+	}
+	return (error);
+}
 
 #if 0
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index e44f0fb3f6..c1524d9e72 100644
--- a/sys/vfs/hammer/hammer_inode.c
+++ b/sys/vfs/hammer/hammer_inode.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.25 2008/01/25 05:49:08 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.26 2008/02/05 07:58:43 dillon Exp $
  */
 
 #include "hammer.h"
@@ -566,7 +566,9 @@ hammer_unload_inode(struct hammer_inode *ip, void *data)
  * HAMMER_INODE_ITIMES: mtime/atime has been updated
  *
  * last_tid is the TID to use to generate the correct TID when the inode
- * is synced to disk.
+ * is synced to disk.  The first inode record laid out on disk must match
+ * the transaction id of the related directory entry, so only update last_tid
+ * if that has already occurred.
  */
 void
 hammer_modify_inode(struct hammer_transaction *trans,
@@ -582,7 +584,8 @@ hammer_modify_inode(struct hammer_transaction *trans,
 			kprintf("hammer_modify_inode: %016llx (%08x)\n",
 				trans->tid, (int)(trans->tid / 1000000000LL));
 		}
-		ip->last_tid = trans->tid;
+		if (ip->flags & HAMMER_INODE_ONDISK)
+			ip->last_tid = trans->tid;
 	}
 	ip->flags |= flags;
 }
diff --git a/sys/vfs/hammer/hammer_ioctl.c b/sys/vfs/hammer/hammer_ioctl.c
index eff58c81cb..347e6856eb 100644
--- a/sys/vfs/hammer/hammer_ioctl.c
+++ b/sys/vfs/hammer/hammer_ioctl.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.1 2008/02/04 08:33:17 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.2 2008/02/05 07:58:43 dillon Exp $
  */
 
 #include "hammer.h"
@@ -70,8 +70,15 @@ hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag,
 /*
  * Iterate through the specified range of object ids and remove any
  * deleted records that fall entirely within a prune modulo.
+ *
+ * A reverse iteration is used to prevent overlapping records from being
+ * created during the iteration due to alignments.  This also allows us
+ * to adjust alignments without blowing up the B-Tree.
*/ -static int check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm); +static int check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm, + int *realign_cre, int *realign_del); +static int realign_prune(struct hammer_ioc_prune *prune, hammer_cursor_t cursor, + int realign_cre, int realign_del); static int hammer_ioc_prune(hammer_inode_t ip, struct hammer_ioc_prune *prune) @@ -79,6 +86,9 @@ hammer_ioc_prune(hammer_inode_t ip, struct hammer_ioc_prune *prune) struct hammer_cursor cursor; hammer_btree_elm_t elm; int error; + int isdir; + int realign_cre; + int realign_del; if (prune->nelms < 0 || prune->nelms > HAMMER_MAX_PRUNE_ELMS) { return(EINVAL); @@ -93,43 +103,59 @@ retry: hammer_done_cursor(&cursor); return(error); } - cursor.key_beg.obj_id = prune->cur_obj_id; - cursor.key_beg.key = prune->cur_key; + cursor.key_beg.obj_id = prune->beg_obj_id; + cursor.key_beg.key = HAMMER_MIN_KEY; cursor.key_beg.create_tid = 1; cursor.key_beg.delete_tid = 0; cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE; cursor.key_beg.obj_type = 0; - cursor.key_end.obj_id = prune->end_obj_id; - cursor.key_end.key = HAMMER_MIN_KEY; - cursor.key_end.create_tid = 1; + cursor.key_end.obj_id = prune->cur_obj_id; + cursor.key_end.key = prune->cur_key; + cursor.key_end.create_tid = HAMMER_MAX_TID - 1; cursor.key_end.delete_tid = 0; - cursor.key_end.rec_type = HAMMER_MIN_RECTYPE; + cursor.key_end.rec_type = HAMMER_MAX_RECTYPE; cursor.key_end.obj_type = 0; - cursor.flags |= HAMMER_CURSOR_END_EXCLUSIVE; + cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; - error = hammer_btree_first(&cursor); + error = hammer_btree_last(&cursor); while (error == 0) { elm = &cursor.node->ondisk->elms[cursor.index]; - if (check_prune(prune, elm) == 0) { + prune->cur_obj_id = elm->base.obj_id; + prune->cur_key = elm->base.key; + if (check_prune(prune, elm, &realign_cre, &realign_del) == 0) { if (hammer_debug_general & 0x0200) { kprintf("check %016llx %016llx: DELETE\n", elm->base.obj_id, elm->base.key); } + /* * NOTE: This can return EDEADLK */ - prune->cur_obj_id = elm->base.obj_id; - prune->cur_key = elm->base.key; - if (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY) - ++prune->stat_dirrecords; + isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY); + error = hammer_delete_at_cursor(&cursor, &prune->stat_bytes); - if (error == 0) - ++prune->stat_rawrecords; + if (error) + break; + + if (isdir) + ++prune->stat_dirrecords; else - --prune->stat_dirrecords; + ++prune->stat_rawrecords; + } else if (realign_cre >= 0 || realign_del >= 0) { + error = realign_prune(prune, &cursor, + realign_cre, realign_del); + if (error == 0) { + cursor.flags |= HAMMER_CURSOR_ATEDISK; + if (hammer_debug_general & 0x0200) { + kprintf("check %016llx %016llx: " + "REALIGN\n", + elm->base.obj_id, + elm->base.key); + } + } } else { cursor.flags |= HAMMER_CURSOR_ATEDISK; if (hammer_debug_general & 0x0100) { @@ -138,7 +164,7 @@ retry: } } if (error == 0) - error = hammer_btree_iterate(&cursor); + error = hammer_btree_iterate_reverse(&cursor); } if (error == ENOENT) error = 0; @@ -152,23 +178,47 @@ retry: * Check pruning list. The list must be sorted in descending order. 
 */
 static int
-check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm)
+check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm,
+	     int *realign_cre, int *realign_del)
 {
 	struct hammer_ioc_prune_elm *scan;
 	int i;
 
-	if (elm->base.delete_tid == 0)
-		return(-1);
+	*realign_cre = -1;
+	*realign_del = -1;
+
 	for (i = 0; i < prune->nelms; ++i) {
 		scan = &prune->elms[i];
+
+		/*
+		 * Locate the scan index covering the create and delete TIDs.
+		 */
+		if (*realign_cre < 0 &&
+		    elm->base.create_tid >= scan->beg_tid &&
+		    elm->base.create_tid < scan->end_tid) {
+			*realign_cre = i;
+		}
+		if (*realign_del < 0 && elm->base.delete_tid &&
+		    elm->base.delete_tid > scan->beg_tid &&
+		    elm->base.delete_tid <= scan->end_tid) {
+			*realign_del = i;
+		}
+
+		/*
+		 * Now check for loop termination.
+		 */
 		if (elm->base.create_tid >= scan->end_tid ||
-		    elm->base.delete_tid >= scan->end_tid) {
+		    elm->base.delete_tid > scan->end_tid) {
 			break;
 		}
-		if (elm->base.create_tid < scan->beg_tid)
-			continue;
-		KKASSERT(elm->base.delete_tid >= scan->beg_tid);
-		if (elm->base.create_tid / scan->mod_tid ==
+
+		/*
+		 * Now determine if we can delete the record.
+		 */
+		if (elm->base.delete_tid &&
+		    elm->base.create_tid >= scan->beg_tid &&
+		    elm->base.delete_tid <= scan->end_tid &&
+		    elm->base.create_tid / scan->mod_tid ==
 		    elm->base.delete_tid / scan->mod_tid) {
 			return(0);
 		}
@@ -176,6 +226,97 @@ check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm)
 	return(-1);
 }
 
+/*
+ * Align the record to cover any gaps created through the deletion of
+ * records within the pruning space.  If we were to just delete the records
+ * there would be gaps which in turn would cause a snapshot that is NOT on
+ * a pruning boundary to appear corrupt to the user.  Forcing alignment
+ * of the create_tid and delete_tid for retained records 'reconnects'
+ * the previously contiguous space, making it contiguous again after the
+ * deletions.
+ *
+ * The use of a reverse iteration allows us to safely align the records and
+ * related elements without creating temporary overlaps.  XXX we should
+ * add ordering dependencies for record buffers to guarantee consistency
+ * during recovery.
+ */
+static int
+realign_prune(struct hammer_ioc_prune *prune,
+	      hammer_cursor_t cursor, int realign_cre, int realign_del)
+{
+	hammer_btree_elm_t elm;
+	hammer_tid_t delta;
+	hammer_tid_t mod;
+	hammer_tid_t tid;
+	int error;
+
+	hammer_cursor_downgrade(cursor);
+
+	elm = &cursor->node->ondisk->elms[cursor->index];
+	++prune->stat_realignments;
+
+	/*
+	 * Align the create_tid.  By doing a reverse iteration we guarantee
+	 * that all records after our current record have already been
+	 * aligned, allowing us to safely correct the right-hand boundary
+	 * (because no record to our right, if otherwise exactly matching,
+	 * will have a create_tid to the left of our aligned create_tid).
+	 *
+	 * Ordering is important here, but XXX: disk write ordering for
+	 * inter-cluster corrections is not currently guaranteed.
+	 */
+	error = 0;
+	if (realign_cre >= 0) {
+		mod = prune->elms[realign_cre].mod_tid;
+		delta = elm->leaf.base.create_tid % mod;
+		if (delta) {
+			tid = elm->leaf.base.create_tid - delta + mod;
+
+			/* can EDEADLK */
+			error = hammer_btree_correct_rhb(cursor, tid + 1);
+			if (error == 0) {
+				error = hammer_btree_extract(cursor,
+						     HAMMER_CURSOR_GET_RECORD);
+			}
+			if (error == 0) {
+				/* can EDEADLK */
+				error = hammer_cursor_upgrade(cursor);
+			}
+			if (error == 0) {
+				hammer_modify_buffer(cursor->record_buffer);
+				cursor->record->base.base.create_tid = tid;
+				hammer_modify_node(cursor->node);
+				elm->leaf.base.create_tid = tid;
+			}
+		}
+	}
+
+	/*
+	 * Align the delete_tid.  This only occurs if the record is historical
+	 * and was deleted at some point.  Realigning the delete_tid does not
+	 * move the record within the B-Tree but may cause it to temporarily
+	 * overlap a record that has not yet been pruned.
+	 */
+	if (error == 0 && realign_del >= 0) {
+		mod = prune->elms[realign_del].mod_tid;
+		delta = elm->leaf.base.delete_tid % mod;
+		hammer_modify_node(cursor->node);
+		if (delta) {
+			error = hammer_btree_extract(cursor,
+						     HAMMER_CURSOR_GET_RECORD);
+			if (error == 0) {
+				elm->leaf.base.delete_tid =
+						elm->leaf.base.delete_tid -
+						delta + mod;
+				hammer_modify_buffer(cursor->record_buffer);
+				cursor->record->base.base.delete_tid =
+						elm->leaf.base.delete_tid;
+			}
+		}
+	}
+	return (error);
+}
+
 /*
  * Iterate through an object's inode or an object's records and record
  * modification TIDs.
diff --git a/sys/vfs/hammer/hammer_ioctl.h b/sys/vfs/hammer/hammer_ioctl.h
index f22fb670ac..82872b32e6 100644
--- a/sys/vfs/hammer/hammer_ioctl.h
+++ b/sys/vfs/hammer/hammer_ioctl.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.1 2008/02/04 08:33:17 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.2 2008/02/05 07:58:43 dillon Exp $
  */
 
 /*
  * HAMMER ioctl's.  This file can be #included from userland
@@ -62,14 +62,15 @@ struct hammer_ioc_prune {
 	int		nelms;
 	int		reserved01;
 	int64_t		beg_obj_id;
-	int64_t		cur_obj_id;	/* initialize to beg_obj_id */
-	int64_t		cur_key;	/* initialize to HAMMER_MIN_KEY */
+	int64_t		cur_obj_id;	/* initialize to end_obj_id */
+	int64_t		cur_key;	/* initialize to HAMMER_MAX_KEY */
 	int64_t		end_obj_id;	/* (range-exclusive) */
 	int64_t		stat_scanrecords;/* number of records scanned */
 	int64_t		stat_rawrecords; /* number of raw records pruned */
 	int64_t		stat_dirrecords; /* number of dir records pruned */
 	int64_t		stat_bytes;	 /* number of data bytes pruned */
-	int64_t		reserved02[8];
+	int64_t		stat_realignments; /* number of raw records realigned */
+	int64_t		reserved02[7];
 	struct hammer_ioc_prune_elm elms[HAMMER_MAX_PRUNE_ELMS];
 };
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index 80abe28dd3..3ca9bff81e 100644
--- a/sys/vfs/hammer/hammer_object.c
+++ b/sys/vfs/hammer/hammer_object.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.27 2008/02/04 08:33:17 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.28 2008/02/05 07:58:43 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1386,6 +1386,7 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
 	hammer_btree_elm_t elm;
 	hammer_mount_t hmp;
 	int error;
+	int dodelete;
 
 	/*
 	 * In-memory (unsynchronized) records can simply be freed.
@@ -1404,15 +1405,15 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) elm = NULL; hmp = cursor->node->cluster->volume->hmp; + dodelete = 0; if (error == 0) { - hammer_modify_buffer(cursor->record_buffer); - cursor->record->base.base.delete_tid = tid; - error = hammer_cursor_upgrade(cursor); if (error == 0) { hammer_modify_node(cursor->node); elm = &cursor->node->ondisk->elms[cursor->index]; elm->leaf.base.delete_tid = tid; + hammer_modify_buffer(cursor->record_buffer); + cursor->record->base.base.delete_tid = tid; } } @@ -1420,7 +1421,10 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) * If we were mounted with the nohistory option, we physically * delete the record. */ - if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) { + if (hmp->hflags & HMNT_NOHISTORY) + dodelete = 1; + + if (error == 0 && dodelete) { error = hammer_delete_at_cursor(cursor, NULL); if (error) { panic("hammer_ip_delete_record: unable to physically delete the record!\n"); diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 5f3e9b4c43..07097c7244 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.25 2008/02/04 08:33:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.26 2008/02/05 07:58:43 dillon Exp $ */ #include @@ -362,6 +362,18 @@ hammer_vop_write(struct vop_write_args *ap) ip->ino_rec.ino_mtime = trans.tid; flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS; hammer_modify_inode(&trans, ip, flags); + + /* + * The file write must be tagged with the same TID as the + * inode, for consistency in case the inode changed size. + * This guarantees the on-disk data records will have a + * TID <= the inode TID representing the size change. + * + * If a prior write has not yet flushed, retain its TID. + */ + if (bp->b_tid == 0) + bp->b_tid = ip->last_tid; + if (ap->a_ioflag & IO_SYNC) { bwrite(bp); } else if (ap->a_ioflag & IO_DIRECT) { @@ -1684,7 +1696,11 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) if (ip->flags & HAMMER_INODE_RO) return (EROFS); - hammer_start_transaction(&trans, ip->hmp); + /* + * Start a transaction using the TID stored with the bp. + */ + KKASSERT(bp->b_tid != 0); + hammer_start_transaction_tid(&trans, ip->hmp, bp->b_tid); retry: /* @@ -1731,6 +1747,7 @@ retry: } else { hammer_commit_transaction(&trans); bp->b_resid = 0; + bp->b_tid = 0; } return(error); } -- 2.41.0
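For readers tracing the pruning logic, the TID arithmetic shared by check_prune() and realign_prune() can be sketched in isolation. The standalone program below is illustrative only: align_up(), same_bucket(), and the sample TID values are hypothetical names invented for this sketch rather than HAMMER symbols; only the modulo arithmetic mirrors the patch.

/*
 * Illustrative sketch, not part of the patch.  A record whose create_tid
 * and delete_tid fall into the same mod_tid bucket is prunable; a record
 * that is retained has its TIDs rounded up to the next bucket boundary
 * so no 'holes' appear at snapshot points.  align_up() and same_bucket()
 * are hypothetical helpers, not HAMMER functions.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t hammer_tid_t;

/* Round tid up to the next mod boundary, as realign_prune() does. */
static hammer_tid_t
align_up(hammer_tid_t tid, hammer_tid_t mod)
{
	hammer_tid_t delta = tid % mod;

	return (delta ? tid - delta + mod : tid);
}

/* Non-zero if both TIDs land in the same bucket (record is prunable). */
static int
same_bucket(hammer_tid_t create_tid, hammer_tid_t delete_tid,
	    hammer_tid_t mod)
{
	return (delete_tid != 0 && create_tid / mod == delete_tid / mod);
}

int
main(void)
{
	hammer_tid_t mod = 1000;	/* pruning granularity */

	/* Created and deleted inside one bucket: prunable. */
	printf("1100..1900 prunable: %d\n", same_bucket(1100, 1900, mod));

	/* Spans a bucket boundary: retained, but realigned. */
	printf("1100..2500 prunable: %d\n", same_bucket(1100, 2500, mod));
	printf("create_tid 1100 -> %llu\n",
	       (unsigned long long)align_up(1100, mod));	/* 2000 */
	printf("delete_tid 2500 -> %llu\n",
	       (unsigned long long)align_up(2500, mod));	/* 3000 */
	return (0);
}

The reverse B-Tree iteration matters precisely because the realignment moves create_tid to the right: correcting records right-to-left guarantees that any record whose boundary a realignment could disturb has already been processed.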