From 4d75d8290a08a2a8661b2883f983e1bcd3d7470e Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 9 Jan 2008 00:46:22 +0000
Subject: [PATCH] HAMMER 16/many - Recovery infrastructure, misc bug fixes

* Add A-list recovery and iteration support.  This code fixes up an
  A-list which might have been partially modified or gotten out of sync.

* Add substantial cluster recovery infrastructure (not yet enabled).
  The code still needs to go through a few test/debug cycles and does
  not yet handle cluster pushes.

* Fix a critical section and a bug in the A-list empty-check code (the
  latter not yet used by HAMMER but will be soon).  The wrong base free
  block count was being supplied.

* Add a synchronizing transaction id field to the cluster header.  This
  field will be used by the recovery code.

* For the moment add a bitmap of buffers allocated to record arrays.
  The bitmap is in the cluster header.  This may or may not be
  temporary.  For now it will allow the recovery code to wipe the
  cluster's A-lists and then iterate records to regenerate them.
---
 sbin/newfs_hammer/newfs_hammer.c |   3 +-
 sys/vfs/hammer/Makefile          |   5 +-
 sys/vfs/hammer/hammer.h          |   6 +-
 sys/vfs/hammer/hammer_alist.c    | 425 ++++++++++++++++++++++++++-
 sys/vfs/hammer/hammer_alist.h    |  18 +-
 sys/vfs/hammer/hammer_disk.h     |  27 +-
 sys/vfs/hammer/hammer_io.c       |  14 +-
 sys/vfs/hammer/hammer_object.c   |  14 +-
 sys/vfs/hammer/hammer_ondisk.c   | 131 +++++++--
 sys/vfs/hammer/hammer_recover.c  | 490 +++++++++++++++++++++++++++++++
 sys/vfs/hammer/hammer_subs.c     |   6 +-
 11 files changed, 1095 insertions(+), 44 deletions(-)
 create mode 100644 sys/vfs/hammer/hammer_recover.c

diff --git a/sbin/newfs_hammer/newfs_hammer.c b/sbin/newfs_hammer/newfs_hammer.c
index 1dcd3d6039..87984df63b 100644
--- a/sbin/newfs_hammer/newfs_hammer.c
+++ b/sbin/newfs_hammer/newfs_hammer.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.12 2008/01/03 06:48:48 dillon Exp $
+ * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.13 2008/01/09 00:46:19 dillon Exp $
  */
 
 #include "newfs_hammer.h"
@@ -648,6 +648,7 @@ format_root(struct cluster_info *cluster)
 	rec->inode.ino_mtime = rec->base.base.create_tid;
 	rec->inode.ino_size = 0;
 	rec->inode.ino_nlinks = 1;
+	cluster->ondisk->synchronized_tid = rec->base.base.create_tid;
 
 	++cluster->volume->ondisk->vol0_stat_inodes;
 
diff --git a/sys/vfs/hammer/Makefile b/sys/vfs/hammer/Makefile
index aac8b9d94e..ca03221a6b 100644
--- a/sys/vfs/hammer/Makefile
+++ b/sys/vfs/hammer/Makefile
@@ -1,11 +1,12 @@
 #
-# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.4 2007/11/19 00:53:40 dillon Exp $
+# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.5 2008/01/09 00:46:22 dillon Exp $
 
 KMOD=	hammer
 SRCS=	hammer_vfsops.c hammer_vnops.c hammer_inode.c \
 	hammer_subs.c hammer_ondisk.c hammer_io.c \
 	hammer_cursor.c hammer_btree.c hammer_transaction.c \
-	hammer_alist.c hammer_object.c hammer_spike.c
+	hammer_alist.c hammer_object.c hammer_spike.c \
+	hammer_recover.c
 
 NOMAN=
 
diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index d78dd4c09d..ed4015b540 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.19 2008/01/03 06:48:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.20 2008/01/09 00:46:22 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -463,6 +463,7 @@ int hammer_unload_inode(hammer_inode_t ip, void *data); int hammer_unload_volume(hammer_volume_t volume, void *data __unused); int hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused); int hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused); +void hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid); int hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused); int hammer_install_volume(hammer_mount_t hmp, const char *volname); @@ -575,6 +576,8 @@ void *hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, int *errorp, struct hammer_buffer **bufferp); void *hammer_alloc_record(struct hammer_cluster *cluster, int *errorp, struct hammer_buffer **bufferp); +void hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, + u_int64_t type); void hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes); void hammer_free_record_ptr(struct hammer_buffer *buffer, @@ -629,6 +632,7 @@ int hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t rec, void hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep); int hammer_spike(struct hammer_cursor **spikep); +int hammer_recover(struct hammer_cluster *cluster); int hammer_io_read(struct vnode *devvp, struct hammer_io *io); int hammer_io_new(struct vnode *devvp, struct hammer_io *io); diff --git a/sys/vfs/hammer/hammer_alist.c b/sys/vfs/hammer/hammer_alist.c index 28c39e340c..9d9372b90e 100644 --- a/sys/vfs/hammer/hammer_alist.c +++ b/sys/vfs/hammer/hammer_alist.c @@ -38,7 +38,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.6 2008/01/03 06:48:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.7 2008/01/09 00:46:22 dillon Exp $ */ /* * This module implements a generic allocator through the use of a hinted @@ -119,7 +119,6 @@ void panic(const char *ctl, ...); /* * static support functions */ - static int32_t hammer_alst_leaf_alloc_fwd(hammer_almeta_t scan, int32_t blk, int count, int32_t atblk); static int32_t hammer_alst_meta_alloc_fwd(hammer_alist_t live, @@ -132,6 +131,9 @@ static int32_t hammer_alst_meta_alloc_rev(hammer_alist_t live, hammer_almeta_t scan, int32_t blk, int32_t count, int32_t radix, int skip, int32_t atblk); +static int32_t hammer_alst_find(hammer_alist_t live, hammer_almeta_t scan, + int32_t blk, int32_t radix, + int32_t skip, int32_t atblk); static void hammer_alst_leaf_free(hammer_almeta_t scan, int32_t relblk, int count); static void hammer_alst_meta_free(hammer_alist_t live, hammer_almeta_t scan, @@ -139,6 +141,9 @@ static void hammer_alst_meta_free(hammer_alist_t live, hammer_almeta_t scan, int32_t radix, int skip, int32_t blk); static int32_t hammer_alst_radix_init(hammer_almeta_t scan, int32_t radix, int skip, int32_t count); +static void hammer_alst_radix_recover(hammer_alist_recover_t info, + hammer_almeta_t scan, int32_t blk, + int32_t radix, int skip, int32_t count); #ifdef ALIST_DEBUG static void hammer_alst_radix_print(hammer_alist_t live, hammer_almeta_t scan, int32_t blk, @@ -213,12 +218,18 @@ hammer_alist_template(hammer_alist_config_t bl, int32_t blocks, #endif } +/* + * Initialize a new A-list + */ void hammer_alist_init(hammer_alist_t live, int32_t start, int32_t count, enum hammer_alloc_state state) { hammer_alist_config_t bl = live->config; + /* + * Note: base_freeblks is a count, not a block number limit. + */ live->meta->bm_alist_freeblks = 0; live->meta->bm_alist_base_freeblks = count; hammer_alst_radix_init(live->meta + 1, bl->bl_radix, @@ -394,7 +405,25 @@ hammer_alist_alloc_rev(hammer_alist_t live, int32_t count, int32_t atblk) } /* - * alist_free() + * hammer_alist_find() + * + * Locate the first block >= atblk marked as allocated in the A-list + * and return it. Return HAMMER_ALIST_BLOCK_NONE if no block could + * be found. + */ +int32_t +hammer_alist_find(hammer_alist_t live, int32_t atblk) +{ + hammer_alist_config_t bl = live->config; + KKASSERT(live->config != NULL); + KKASSERT(atblk >= 0); + atblk = hammer_alst_find(live, live->meta + 1, 0, bl->bl_radix, + bl->bl_skip, atblk); + return(atblk); +} + +/* + * hammer_alist_free() * * Free up space in the block bitmap. Return the base of a contiguous * region. Panic if an inconsistancy is found. @@ -422,6 +451,52 @@ hammer_alist_free(hammer_alist_t live, int32_t blkno, int32_t count) live->meta->bm_alist_freeblks += count; } +/* + * Recover an A-list. This will dive down to the leaves and regenerate + * the hints and the freeblks count. This function will also recurse + * through any stacked A-lists. > 0 is returned on success, a negative + * error code on failure. + * + * Since A-lists have no pointers the only thing that can prevent recovery + * is an I/O error in e.g. a stacked A-list. This doesn't mean the recovered + * map will be meaningful, however. + * + * blk is usually passed as 0 at the top level and is adjusted as the recovery + * code scans the A-list. It is only used when recursing down a stacked + * A-list. 
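+ *
+ * On success the A-list's recovered free block count is returned; any
+ * allocated blocks found outside the range [start, start + count) are
+ * freed first.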
+ */
+int
+hammer_alist_recover(hammer_alist_t live, int32_t blk, int32_t start,
+		     int32_t count)
+{
+	hammer_alist_config_t bl = live->config;
+	struct hammer_alist_recover info;
+	int32_t r;
+
+	info.live = live;
+	info.error = 0;
+
+	live->meta->bm_alist_freeblks = 0;
+	live->meta->bm_alist_base_freeblks = count;
+	hammer_alst_radix_recover(&info, live->meta + 1, blk, bl->bl_radix,
+				  bl->bl_skip, bl->bl_blocks);
+	if (info.error)
+		return(info.error);
+
+	/*
+	 * Any garbage between 0 and start, and at or beyond start + count,
+	 * is removed.
+	 */
+	while ((r = hammer_alist_find(live, 0)) != HAMMER_ALIST_BLOCK_NONE &&
+	       r < start) {
+		hammer_alist_free(live, r, 1);
+	}
+	while ((r = hammer_alist_find(live, start + count)) !=
+		HAMMER_ALIST_BLOCK_NONE) {
+		hammer_alist_free(live, r, 1);
+	}
+	return(live->meta->bm_alist_freeblks);
+}
+
 int
 hammer_alist_isfull(hammer_alist_t live)
 {
@@ -446,8 +521,10 @@ hammer_alist_print(hammer_alist_t live, int tab)
 {
 	hammer_alist_config_t bl = live->config;
 
-	kprintf("%*.*sALIST (%d free blocks) {\n",
-		tab, tab, "", live->meta->bm_alist_freeblks);
+	kprintf("%*.*sALIST (%d/%d free blocks) {\n",
+		tab, tab, "",
+		live->meta->bm_alist_freeblks,
+		live->meta->bm_alist_base_freeblks);
 	hammer_alst_radix_print(live, live->meta + 1, 0, bl->bl_radix,
 				bl->bl_skip, tab + 4);
 	kprintf("%*.*s}\n", tab, tab, "");
@@ -1044,7 +1121,100 @@ failed:
 }
 
 /*
- * BLST_LEAF_FREE()
+ * HAMMER_ALST_FIND()
+ *
+ * Locate the first allocated block greater than or equal to atblk.
+ */
+static int32_t
+hammer_alst_find(hammer_alist_t live, hammer_almeta_t scan, int32_t blk,
+		 int32_t radix, int32_t skip, int32_t atblk)
+{
+	u_int32_t mask;
+	u_int32_t pmask;
+	int32_t next_skip;
+	int32_t tmpblk;
+	int i;
+	int j;
+
+	/*
+	 * Leaf node (currently hammer_alist_find() only works on terminal
+	 * A-lists and the case is asserted in hammer_alist_find()).
+	 */
+	if (skip == 1 && live->config->bl_terminal) {
+		if (scan->bm_bitmap == (u_int32_t)-1)
+			return(HAMMER_ALIST_BLOCK_NONE);
+		for (i = 0; i < (int)HAMMER_ALIST_BMAP_RADIX; ++i) {
+			if (blk + i < atblk)
+				continue;
+			if ((scan->bm_bitmap & (1 << i)) == 0)
+				return(blk + i);
+		}
+		return(HAMMER_ALIST_BLOCK_NONE);
+	}
+
+	/*
+	 * Meta
+	 */
+	radix /= HAMMER_ALIST_META_RADIX;
+	next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
+	mask = 0x00000003;
+	pmask = 0x00000001;
+	for (j = 0, i = 1; j < HAMMER_ALIST_META_RADIX; (i += next_skip), ++j) {
+		/*
+		 * Check Terminator
+		 */
+		if (scan[i].bm_bighint == (int32_t)-1) {
+			break;
+		}
+
+		/*
+		 * Recurse if this meta might contain a desired block.
+		 */
+		if (blk + radix > atblk) {
+			if ((scan->bm_bitmap & mask) == 0) {
+				/*
+				 * 00 - all-allocated, uninitialized
+				 */
+				return(atblk < blk ? blk : atblk);
+			} else if ((scan->bm_bitmap & mask) == (pmask << 1)) {
+				/*
+				 * 10 - all-allocated, initialized
+				 */
+				return(atblk < blk ? blk : atblk);
+			} else if ((scan->bm_bitmap & mask) == mask) {
+				/*
+				 * 11 - all-free (skip)
+				 */
+			} else if (next_skip == 0) {
+				/*
+				 * Partially allocated but we have to recurse
+				 * into a stacked A-list.
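+				 * The stacked layer runs its own search
+				 * and returns an absolute block number,
+				 * or HAMMER_ALIST_BLOCK_NONE.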
+				 */
+				tmpblk = live->config->bl_radix_find(
+						live->info, blk, radix, atblk);
+				if (tmpblk != HAMMER_ALIST_BLOCK_NONE)
+					return(tmpblk);
+			} else if ((scan->bm_bitmap & mask) == pmask) {
+				/*
+				 * 01 - partially-allocated
+				 */
+				tmpblk = hammer_alst_find(live, &scan[i],
+							  blk, radix,
+							  next_skip, atblk);
+				if (tmpblk != HAMMER_ALIST_BLOCK_NONE)
+					return(tmpblk);
+
+			}
+		}
+		mask <<= 2;
+		pmask <<= 2;
+		blk += radix;
+	}
+	return(HAMMER_ALIST_BLOCK_NONE);
+}
+
+/*
+ * HAMMER_ALST_LEAF_FREE()
  *
  *	Free allocated blocks from leaf bitmap.  The allocation code is
  *	restricted to powers of 2, the freeing code is not.
@@ -1264,7 +1434,11 @@ hammer_alst_radix_init(hammer_almeta_t scan, int32_t radix,
 	}
 
 	/*
-	 * We are at a leaf, we only eat one meta element.
+	 * We are at a terminal node, we only eat one meta element.  If
+	 * live->config->bl_terminal is set this is a leaf node, otherwise
+	 * it is a meta node for a stacked A-list.  We do NOT recurse into
+	 * stacked A-lists but simply mark the entire stack as all-free using
+	 * code 00 (meaning all-free & uninitialized).
 	 */
 	if (skip == 1)
 		return(memindex);
@@ -1333,6 +1507,187 @@ hammer_alst_radix_init(hammer_almeta_t scan, int32_t radix,
 	return(memindex);
 }
 
+/*
+ * hammer_alst_radix_recover()
+ *
+ *	This code is basically a duplicate of hammer_alst_radix_init()
+ *	except it recovers the A-list instead of initializing it.
+ */
+static void
+hammer_alst_radix_recover(hammer_alist_recover_t info, hammer_almeta_t scan,
+			  int32_t blk, int32_t radix, int skip, int32_t count)
+{
+	hammer_alist_t live = info->live;
+	u_int32_t mask;
+	u_int32_t pmask;
+	int next_skip;
+	int i;
+	int j;
+	int n;
+
+	/*
+	 * Don't try to recover bighint, just set it to its maximum
+	 * value and let the A-list allocations reoptimize it.  XXX
+	 */
+	scan->bm_bighint = radix;
+
+	/*
+	 * If we are at a terminal node (i.e. not stacked on top of another
+	 * A-list), just count the free blocks.
+	 */
+	if (skip == 1 && live->config->bl_terminal) {
+		for (i = 0; i < (int)HAMMER_ALIST_BMAP_RADIX; ++i) {
+			if (scan->bm_bitmap & (1 << i))
+				++info->live->meta->bm_alist_freeblks;
+		}
+		return;
+	}
+
+	/*
+	 * Recursive meta node (next_skip != 0) or terminal meta
+	 * node (next_skip == 0).
+	 */
+	radix /= HAMMER_ALIST_META_RADIX;
+	next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
+	mask = 0x00000003;
+	pmask = 0x00000001;
+
+	for (i = 1, j = 0; j < (int)HAMMER_ALIST_META_RADIX;
+	     ++j, (i += next_skip)) {
+		/*
+		 * Check mask:
+		 *
+		 *	00	ALL-ALLOCATED - UNINITIALIZED
+		 *	01	PARTIALLY-FREE/PARTIALLY-ALLOCATED
+		 *	10	ALL-ALLOCATED - INITIALIZED
+		 *	11	ALL-FREE - UNINITIALIZED
+		 */
+		KKASSERT(mask);
+		if (count >= radix) {
+			/*
+			 * Recover the entire object
+			 */
+			if ((scan->bm_bitmap & mask) == 0) {
+				/*
+				 * All-allocated (uninited), do nothing
+				 */
+			} else if ((scan->bm_bitmap & mask) == mask) {
+				/*
+				 * All-free (uninited), credit the entire
+				 * radix as free.
+				 */
+				live->meta->bm_alist_freeblks += radix;
+			} else if (next_skip) {
+				/*
+				 * Normal meta node, initialized.  Recover and
+				 * adjust to either an all-allocated (inited)
+				 * or partially-allocated state.
+				 */
+				hammer_alst_radix_recover(
+				    info,
+				    &scan[i],
+				    blk,
+				    radix,
+				    next_skip,
+				    radix
+				);
+				if (scan[i].bm_bitmap == 0) {
+					scan->bm_bitmap =
+					    (scan->bm_bitmap & ~mask) |
+					    (pmask << 1);
+				} else if (scan[i].bm_bitmap == (u_int32_t)-1) {
+					scan->bm_bitmap |= mask;
+				} else {
+					scan->bm_bitmap =
+					    (scan->bm_bitmap & ~mask) | pmask;
+				}
+			} else {
+				/*
+				 * Stacked meta node, recurse.
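+				 * The stacked layer returns its recovered
+				 * free block count; a negative error code
+				 * is propagated through info->error.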
+				 */
+				n = live->config->bl_radix_recover(
+					    live->info,
+					    blk, radix, radix);
+				if (n >= 0) {
+					live->meta->bm_alist_freeblks += n;
+					if (n == 0) {
+						scan->bm_bitmap =
+						    (scan->bm_bitmap & ~mask) |
+						    (pmask << 1);
+					} else if (n == radix) {
+						scan->bm_bitmap |= mask;
+					} else {
+						scan->bm_bitmap =
+						    (scan->bm_bitmap & ~mask) |
+						    pmask;
+					}
+				} else {
+					info->error = n;
+				}
+			}
+			count -= radix;
+		} else if (count > 0) {
+			/*
+			 * Recover a partial object.  The object can become
+			 * wholly allocated but never wholly free.
+			 */
+			if (next_skip) {
+				hammer_alst_radix_recover(
+				    info,
+				    &scan[i],
+				    blk,
+				    radix,
+				    next_skip,
+				    count
+				);
+				if (scan[i].bm_bitmap == 0) {
+					scan->bm_bitmap =
+					    (scan->bm_bitmap & ~mask) |
+					    (pmask << 1);
+				} else {
+					scan->bm_bitmap =
+					    (scan->bm_bitmap & ~mask) | pmask;
+				}
+			} else {
+				n = live->config->bl_radix_recover(
+					    live->info,
+					    blk, radix, count);
+				if (n >= 0) {
+					live->meta->bm_alist_freeblks += n;
+					if (n == 0) {
+						scan->bm_bitmap =
+						    (scan->bm_bitmap & ~mask) |
+						    (pmask << 1);
+					} else {
+						scan->bm_bitmap =
+						    (scan->bm_bitmap & ~mask) |
+						    pmask;
+					}
+				} else {
+					info->error = n;
+				}
+			}
+			count = 0;
+		} else if (next_skip) {
+			/*
+			 * Add terminator.  The terminator eats the meta
+			 * node at scan[i].  There is only ONE terminator,
+			 * make sure we don't write out any more (set count to
+			 * -1) or we may overflow our allocation.
+			 */
+			if (count == 0) {
+				scan[i].bm_bighint = (int32_t)-1;
+				count = -1;
+			}
+			scan->bm_bitmap &= ~mask;	/* all-allocated/uni */
+		} else {
+			scan->bm_bitmap &= ~mask;	/* all-allocated/uni */
+		}
+		mask <<= 2;
+		pmask <<= 2;
+		blk += radix;
+	}
+}
+
 #ifdef ALIST_DEBUG
 
 static void
@@ -1491,6 +1846,40 @@ debug_radix_init(void *info, int32_t blk, int32_t radix,
 	return(0);
 }
 
+static
+int32_t
+debug_radix_recover(void *info, int32_t blk, int32_t radix, int32_t count)
+{
+	hammer_alist_t layer;
+	int layer_no = blk / layer_radix;
+	int32_t n;
+
+	KKASSERT(layer_radix == radix);
+	KKASSERT(layers[layer_no] != NULL);
+	layer = layers[layer_no];
+	n = hammer_alist_recover(layer, blk, 0, count);
+	printf("Recover layer %d blk %d result %d/%d\n",
+		layer_no, blk, n, count);
+	return(n);
+}
+
+static
+int32_t
+debug_radix_find(void *info, int32_t blk, int32_t radix, int32_t atblk)
+{
+	hammer_alist_t layer;
+	int layer_no = blk / layer_radix;
+	int32_t res;
+
+	KKASSERT(layer_radix == radix);
+	KKASSERT(layers[layer_no] != NULL);
+	layer = layers[layer_no];
+	res = hammer_alist_find(layer, atblk - blk);
+	if (res != HAMMER_ALIST_BLOCK_NONE)
+		res += blk;
+	return(res);
+}
+
 /*
  * This is called when a zone becomes entirely free, typically after a
  * call to debug_radix_free() has indicated that the entire zone is now
@@ -1609,6 +1998,8 @@ main(int ac, char **av)
 		size,
 		live->config->bl_radix / layer_radix,
 		layer_radix);
 	live->config->bl_radix_init = debug_radix_init;
+	live->config->bl_radix_recover = debug_radix_recover;
+	live->config->bl_radix_find = debug_radix_find;
 	live->config->bl_radix_destroy = debug_radix_destroy;
 	live->config->bl_radix_alloc_fwd = debug_radix_alloc_fwd;
 	live->config->bl_radix_alloc_rev = debug_radix_alloc_rev;
@@ -1632,6 +2023,13 @@ main(int ac, char **av)
 		switch(buf[0]) {
 		case 'p':
 			hammer_alist_print(live, 0);
+			atblk = 0;
+			kprintf("allocated: ");
+			while ((atblk = hammer_alist_find(live, atblk)) != HAMMER_ALIST_BLOCK_NONE) {
+				kprintf(" %d", atblk);
+				++atblk;
+			}
+			kprintf("\n");
 			break;
 		case 'a':
 			atblk = 0;
@@ -1660,6 +2058,18 @@ main(int ac, char **av)
 				kprintf("?\n");
 			}
 			break;
+		case 'R':
+			{
+				int n;
+
+				n = hammer_alist_recover(live, 0, 0,
+					live->meta->bm_alist_base_freeblks);
+				if (n < 0)
+					kprintf("recover: error %d\n", -n);
+				else
+					kprintf("recover: %d free\n", n);
+			}
+			break;
 		case '?':
 		case 'h':
 			puts(
@@ -1667,6 +2077,7 @@ main(int ac, char **av)
 				"a %d	-allocate\n"
 				"r %d	-allocate reverse\n"
 				"f %x %d -free\n"
+				"R	-recover a-list\n"
 				"h/?	-help"
 			);
 			break;
diff --git a/sys/vfs/hammer/hammer_alist.h b/sys/vfs/hammer/hammer_alist.h
index 10cd08d697..8966d50fca 100644
--- a/sys/vfs/hammer/hammer_alist.h
+++ b/sys/vfs/hammer/hammer_alist.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.h,v 1.3 2008/01/03 06:48:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.h,v 1.4 2008/01/09 00:46:22 dillon Exp $
  */
 
 /*
@@ -90,6 +90,10 @@ typedef struct hammer_alist_config {
 	int32_t	bl_terminal;	/* terminal alist, else layer recursion */
 	int	(*bl_radix_init)(void *info, int32_t blk, int32_t radix,
 					hammer_alloc_state_t state);
+	int32_t	(*bl_radix_recover)(void *info, int32_t blk, int32_t radix,
+					int32_t count);
+	int32_t	(*bl_radix_find)(void *info, int32_t blk, int32_t radix,
+					int32_t atblk);
 	int	(*bl_radix_destroy)(void *info, int32_t blk, int32_t radix);
 	int32_t	(*bl_radix_alloc_fwd)(void *info, int32_t blk, int32_t radix,
 					int32_t count, int32_t atblk,
@@ -113,6 +117,15 @@ typedef struct hammer_alist_live {
 	void	*info;		/* chaining call info argument */
 } *hammer_alist_t;
 
+/*
+ * In-memory structure used to track A-list recovery operations.
+ */
+typedef struct hammer_alist_recover {
+	hammer_alist_t	live;
+	int		error;
+} *hammer_alist_recover_t;
+
+
 #define HAMMER_ALIST_META_RADIX	(sizeof(int32_t) * 4)	/* 16 */
 #define HAMMER_ALIST_BMAP_RADIX	(sizeof(int32_t) * 8)	/* 32 */
 #define HAMMER_ALIST_BLOCK_NONE	((int32_t)-1)
@@ -136,11 +149,14 @@ void hammer_alist_template(hammer_alist_config_t bl, int32_t blocks,
 			int32_t base_radix, int32_t maxmeta);
 void hammer_alist_init(hammer_alist_t live, int32_t start, int32_t count,
 			hammer_alloc_state_t state);
+int32_t hammer_alist_recover(hammer_alist_t live, int32_t blk, int32_t start,
+			int32_t count);
 int32_t hammer_alist_alloc(hammer_alist_t live, int32_t count);
 int32_t hammer_alist_alloc_fwd(hammer_alist_t live, int32_t count,
 			int32_t atblk);
 int32_t hammer_alist_alloc_rev(hammer_alist_t live, int32_t count,
 			int32_t atblk);
+int32_t hammer_alist_find(hammer_alist_t live, int32_t atblk);
 int hammer_alist_isfull(hammer_alist_t live);
 int hammer_alist_isempty(hammer_alist_t live);
 void hammer_alist_free(hammer_alist_t live, int32_t blkno, int32_t count);
diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h
index c950469d39..35af10c787 100644
--- a/sys/vfs/hammer/hammer_disk.h
+++ b/sys/vfs/hammer/hammer_disk.h
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.14 2007/12/31 05:33:12 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.15 2008/01/09 00:46:22 dillon Exp $
  */
 
 #ifndef _SYS_UUID_H_
@@ -59,6 +59,7 @@
  */
 #define HAMMER_BUFSIZE	16384
 #define HAMMER_BUFMASK	(HAMMER_BUFSIZE - 1)
+#define HAMMER_MAXDATA	(256*1024)
 
 /*
  * Hammer transaction ids are 64 bit unsigned integers and are usually
@@ -335,8 +336,8 @@ struct hammer_cluster_ondisk {
 	u_int32_t clu_reserved07;
 
 	/*
-	 * These fields are heuristics to aid in locality of reference
-	 * allocations.
+	 * These fields are mostly heuristics to aid in locality of
+	 * reference allocations.
 	 */
 	int32_t idx_data;	/* data append point (element no) */
 	int32_t idx_index;	/* index append point (element no) */
@@ -377,16 +378,34 @@ struct hammer_cluster_ondisk {
 	int32_t clu_btree_parent_offset;
 	hammer_tid_t clu_btree_parent_clu_gen;
 
-	u_int64_t synchronized_rec_id;
+	/*
+	 * The synchronized transaction id is used for recovery purposes.
+	 */
+	u_int64_t synchronized_tid;
+
 	u_int32_t reserved16[510];
 	struct hammer_almeta	clu_master_meta[HAMMER_CLU_MASTER_METAELMS];
 	struct hammer_almeta	clu_btree_meta[HAMMER_CLU_SLAVE_METAELMS];
 	struct hammer_almeta	clu_record_meta[HAMMER_CLU_SLAVE_METAELMS];
 	struct hammer_almeta	clu_mdata_meta[HAMMER_CLU_SLAVE_METAELMS];
+
+	/*
+	 * A straight bitmap records which filesystem buffers contain records.
+	 * The recovery code reconstructs the A-lists using this bitmap.
+	 */
+	u_int32_t clu_record_buf_bitmap[HAMMER_CLU_MAXBUFFERS / 32];
 };
 
 typedef struct hammer_cluster_ondisk *hammer_cluster_ondisk_t;
 
+/*
+ * Cluster clu_flags
+ *
+ * OPEN - A cluster is marked open and synchronized to disk prior to any
+ * modifications being made to either the cluster header or any cluster
+ * buffers.  If initial access to a cluster finds this flag set, the
+ * cluster is recovered before any further operations are performed on it.
+ */
 #define HAMMER_CLUF_OPEN	0x0001	/* cluster is dirty */
 
 /*
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index 679ef0e77e..7e5dd1e606 100644
--- a/sys/vfs/hammer/hammer_io.c
+++ b/sys/vfs/hammer/hammer_io.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.12 2008/01/03 06:48:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.13 2008/01/09 00:46:22 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -62,8 +62,7 @@ hammer_io_disassociate(union hammer_io_structure *io)
 	struct buf *bp = io->io.bp;
 
 	KKASSERT(io->io.released && io->io.modified == 0);
-	LIST_INIT(&bp->b_dep);	/* clear the association */
-	bp->b_ops = NULL;
+	buf_dep_init(bp);
 	io->io.bp = NULL;
 	bp->b_flags &= ~B_LOCKED;
 
@@ -77,6 +76,7 @@ hammer_io_disassociate(union hammer_io_structure *io)
 		io->supercl.alist.meta = NULL;
 		break;
 	case HAMMER_STRUCTURE_CLUSTER:
+		/*KKASSERT((io->cluster.ondisk->clu_flags & HAMMER_CLUF_OPEN) == 0);*/
 		io->cluster.ondisk = NULL;
 		io->cluster.alist_master.meta = NULL;
 		io->cluster.alist_btree.meta = NULL;
@@ -110,14 +110,15 @@ hammer_close_cluster(hammer_cluster_t cluster)
 
 /*
  * Hack XXX - called from kernel syncer via hammer_io_checkwrite() when it
- * wants to flush buffer.
+ * wants to flush the buffer.  Because we disassociate after this call and
+ * because the kernel is already intending to write out the buffer, don't
+ * set the io.modified bit.
 */
 static void
 hammer_close_cluster_quick(hammer_cluster_t cluster)
 {
 	if (cluster->state == HAMMER_CLUSTER_OPEN) {
 		cluster->state = HAMMER_CLUSTER_IDLE;
-		cluster->io.modified = 1;
 		cluster->ondisk->clu_flags &= ~HAMMER_CLUF_OPEN;
 		kprintf("CLOSE CLUSTER ON KERNEL WRITE\n");
 	}
@@ -258,7 +259,8 @@ hammer_io_release(struct hammer_io *io, int flush)
 	}
 
 	/*
-	 * Either we want to flush the buffer or the kernel tried.
+	 * Either we want to flush the buffer or the kernel tried to
+	 * flush the buffer.
 	 *
 	 * If this is a hammer_buffer we may have to wait for the
 	 * cluster header write to complete.
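
The clu_record_buf_bitmap added to the cluster header above is a flat
array of 32 bit words indexed by buffer number.  For orientation, here
is a minimal user-space sketch of the set/test/skip idioms the patch
uses in alloc_new_buffer() and hammer_recover(); MAXBUFFERS and both
function names below are hypothetical stand-ins, not HAMMER symbols:

    #include <stdint.h>
    #include <stdio.h>

    #define MAXBUFFERS 256      /* stand-in for HAMMER_CLU_MAXBUFFERS */

    static uint32_t record_buf_bitmap[MAXBUFFERS / 32];

    /* mark buf_no as containing records, as alloc_new_buffer() does */
    static void
    mark_record_buffer(int buf_no)
    {
        record_buf_bitmap[buf_no >> 5] |= 1U << (buf_no & 31);
    }

    /* iterate set bits, skipping all-zero words as hammer_recover() does */
    static void
    scan_record_buffers(int nbuffers)
    {
        uint32_t bitmap;
        int buf_no;

        for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
            bitmap = record_buf_bitmap[buf_no >> 5];
            if (bitmap == 0) {
                /* jump to the last index of this 32 bit word */
                buf_no = ((buf_no + 32) & ~31) - 1;
                continue;
            }
            if ((bitmap & (1U << (buf_no & 31))) == 0)
                continue;
            printf("record buffer %d\n", buf_no);
        }
    }

    int
    main(void)
    {
        mark_record_buffer(1);
        mark_record_buffer(42);
        mark_record_buffer(200);
        scan_record_buffers(MAXBUFFERS);
        return(0);
    }

The scans start at buffer 1 because buffer 0 holds the cluster header
itself.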
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index eacdbb0dc1..822587b0c1 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.15 2008/01/03 06:48:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.16 2008/01/09 00:46:22 dillon Exp $ */ #include "hammer.h" @@ -495,8 +495,10 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, elm.leaf.data_crc = rec->base.data_crc; error = hammer_btree_insert(&cursor, &elm); - if (error == 0) + if (error == 0) { + hammer_update_syncid(cursor.record_buffer->cluster, trans->tid); goto done; + } hammer_free_record_ptr(cursor.record_buffer, rec); fail1: @@ -646,6 +648,8 @@ again: if (error == 0) { record->flags |= HAMMER_RECF_DELETED; record->flags &= ~HAMMER_RECF_SYNCING; + hammer_update_syncid(cursor.record_buffer->cluster, + record->rec.base.base.create_tid); goto done; } @@ -764,8 +768,11 @@ hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, elm.leaf.data_crc = nrec->base.data_crc; error = hammer_btree_insert(cursor, &elm); - if (error == 0) + if (error == 0) { + hammer_update_syncid(cursor->record_buffer->cluster, + nrec->base.base.create_tid); goto done; + } hammer_free_record_ptr(cursor->record_buffer, nrec); fail1: @@ -1276,6 +1283,7 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) elm = &cursor->node->ondisk->elms[cursor->index]; elm->leaf.base.delete_tid = tid; hammer_modify_node_done(cursor->node); + hammer_update_syncid(cursor->record_buffer->cluster, tid); } /* diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 784c1fdb9a..3b5ec286db 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.16 2008/01/03 06:48:49 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.17 2008/01/09 00:46:22 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -54,8 +54,6 @@ static int hammer_load_cluster(hammer_cluster_t cluster, static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type); static void hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node); -static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, - u_int64_t type); static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live, int32_t start, int *errorp, @@ -634,7 +632,7 @@ hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew) dummy.config = &Buf_alist_config; dummy.meta = ondisk->head.buf_almeta; dummy.info = NULL; - initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL); + hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL); nclusters = volume->ondisk->vol_nclusters - ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS); @@ -809,6 +807,14 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) cluster->alist_mdata.info = cluster; if (isnew == 0) { + /* + * Recover a cluster that was marked open. This + * can be rather involved and block for a hefty + * chunk of time. 
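+		 * Note that hammer_recover() currently short-circuits
+		 * to a no-op while the recovery code is stabilized.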
+ */ + if (ondisk->clu_flags & HAMMER_CLUF_OPEN) + hammer_recover(cluster); + cluster->clu_btree_beg = ondisk->clu_btree_beg; cluster->clu_btree_end = ondisk->clu_btree_end; } @@ -835,7 +841,7 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) dummy.config = &Buf_alist_config; dummy.meta = ondisk->head.buf_almeta; dummy.info = NULL; - initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER); + hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER); ondisk->vol_fsid = voldisk->vol_fsid; ondisk->vol_fstype = voldisk->vol_fstype; @@ -856,11 +862,17 @@ hammer_load_cluster(hammer_cluster_t cluster, hammer_alloc_state_t isnew) KKASSERT(isnew == HAMMER_ASTATE_FREE); hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1, HAMMER_ASTATE_FREE); - hammer_alist_init(&cluster->alist_btree, 1, nbuffers - 1, + hammer_alist_init(&cluster->alist_btree, + HAMMER_FSBUF_MAXBLKS, + (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS, HAMMER_ASTATE_ALLOC); - hammer_alist_init(&cluster->alist_record, 1, nbuffers - 1, + hammer_alist_init(&cluster->alist_record, + HAMMER_FSBUF_MAXBLKS, + (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS, HAMMER_ASTATE_ALLOC); - hammer_alist_init(&cluster->alist_mdata, 1, nbuffers - 1, + hammer_alist_init(&cluster->alist_mdata, + HAMMER_FSBUF_MAXBLKS, + (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS, HAMMER_ASTATE_ALLOC); ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS; @@ -909,6 +921,20 @@ hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused) return(0); } +/* + * Update the cluster's synchronization TID, which is used during cluster + * recovery. NOTE: The cluster header is not written out until all related + * records have been written out. + */ +void +hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid) +{ + hammer_modify_cluster(cluster); + if (cluster->ondisk->synchronized_tid < tid) + cluster->ondisk->synchronized_tid = tid; + hammer_modify_cluster_done(cluster); +} + /* * Reference a cluster that is either already referenced or via a specially * handled pointer (aka rootcl). @@ -1105,7 +1131,7 @@ hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) } if (error == 0 && buf_type) { ondisk = buffer->ondisk; - initbuffer(&buffer->alist, &ondisk->head, buf_type); + hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type); buffer->buf_type = ondisk->head.buf_type; } hammer_unlock(&buffer->io.lock); @@ -1780,7 +1806,7 @@ hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes, live = &cluster->alist_mdata; elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data); if (elm_no == HAMMER_ALIST_BLOCK_NONE) - elm_no = hammer_alist_alloc_fwd(live, 1, 0); + elm_no = hammer_alist_alloc_fwd(live, nblks, 0); if (elm_no == HAMMER_ALIST_BLOCK_NONE) { alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live, cluster->ondisk->idx_data, errorp, bufferp); @@ -1987,6 +2013,8 @@ hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset) * Allocate a new filesystem buffer and assign it to the specified * filesystem buffer type. The new buffer will be added to the * type-specific A-list and initialized. + * + * buffers used for records will also be added to the clu_record_buf_bitmap. */ static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live, @@ -2017,9 +2045,8 @@ alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live, *bufferp = buffer; /* - * Finally, do a meta-free of the buffer's elements into the - * type-specific A-list and update our statistics to reflect - * the allocation. 
+ * Do a meta-free of the buffer's elements into the type-specific + * A-list and update our statistics to reflect the allocation. */ if (buffer) { #if 0 @@ -2038,6 +2065,21 @@ alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live, HAMMER_FSBUF_MAXBLKS); } + + /* + * And, finally, update clu_record_buf_bitmap for record buffers. + * Since buffers are synced to disk before their associated cluster + * header, a recovery operation will only see synced record buffers + * in the bitmap. XXX We can't use alist_record for recovery due + * to the way we currently manage it. + */ + if (buffer && type == HAMMER_FSBUF_RECORDS) { + KKASSERT(buf_no >= 0 && buf_no < HAMMER_CLU_MAXBUFFERS); + hammer_modify_cluster(cluster); + cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5] |= + (1 << (buf_no & 31)); + hammer_modify_cluster_done(cluster); + } } /* @@ -2151,8 +2193,8 @@ hammer_sync_buffer(hammer_buffer_t buffer, void *data) /* * Generic buffer initialization */ -static void -initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type) +void +hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type) { head->buf_type = type; @@ -2238,8 +2280,7 @@ calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no) } /* - * - * + * Allocate nblks buffers from the cluster's master alist. */ static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks, @@ -2333,6 +2374,33 @@ buffer_alist_init(void *info, int32_t blk, int32_t radix, return(0); } +static int +buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count) +{ + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; + int32_t buf_no; + int error = 0; + + buf_no = blk / HAMMER_FSBUF_MAXBLKS; + buffer = hammer_get_buffer(cluster, buf_no, 0, &error); + if (buffer) { + hammer_modify_buffer(buffer); + error = hammer_alist_recover(&buffer->alist, blk, 0, count); + /* free block count is returned if >= 0 */ + hammer_modify_buffer_done(buffer); + hammer_rel_buffer(buffer, 0); + } else { + error = -error; + } + return (error); +} + +/* + * Note: This routine is only called when freeing the last elements of + * an initialized buffer. Freeing all elements of the buffer when the + * buffer was not previously initialized does not call this routine. + */ static int buffer_alist_destroy(void *info, int32_t blk, int32_t radix) { @@ -2466,6 +2534,33 @@ super_alist_init(void *info, int32_t blk, int32_t radix, return (error); } +static int +super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count) +{ + hammer_volume_t volume = info; + hammer_supercl_t supercl; + int32_t scl_no; + int error = 0; + + /* + * Calculate the super-cluster number containing the cluster (blk) + * and obtain the super-cluster buffer. + */ + scl_no = blk / HAMMER_SCL_MAXCLUSTERS; + supercl = hammer_get_supercl(volume, scl_no, &error, + HAMMER_ASTATE_NONE); + if (supercl) { + hammer_modify_supercl(supercl); + error = hammer_alist_recover(&supercl->alist, blk, 0, count); + /* free block count is returned if >= 0 */ + hammer_modify_supercl_done(supercl); + hammer_rel_supercl(supercl, 0); + } else { + error = -error; + } + return (error); +} + /* * This occurs when freeing a cluster via the volume a-list and the * supercl is now 100% free. We can destroy the supercl. 
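
Both recover callbacks above hand back the child layer's recovered free
block count, or a negated errno on failure.  The following stand-alone
sketch (invented names, not HAMMER code) shows how such a count is
folded back into the parent meta node's two-bit state, mirroring the
logic and state table in hammer_alst_radix_recover():

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Two-bit meta states: 00 all-allocated/uninitialized,
     * 01 partially allocated, 10 all-allocated/initialized,
     * 11 all-free/uninitialized.
     */
    static uint32_t
    fold_child_state(uint32_t bitmap, uint32_t mask, uint32_t pmask,
                     int32_t n, int32_t radix)
    {
        if (n == 0)                 /* child wholly allocated -> 10 */
            return((bitmap & ~mask) | (pmask << 1));
        if (n == radix)             /* child wholly free -> 11 */
            return(bitmap | mask);
        return((bitmap & ~mask) | pmask);   /* partial -> 01 */
    }

    int
    main(void)
    {
        /* first meta entry: mask 0x00000003, pmask 0x00000001 */
        printf("%08x\n", fold_child_state(0, 3, 1, 0, 16));  /* 00000002 */
        printf("%08x\n", fold_child_state(0, 3, 1, 16, 16)); /* 00000003 */
        printf("%08x\n", fold_child_state(0, 3, 1, 7, 16));  /* 00000001 */
        return(0);
    }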
@@ -2600,6 +2695,7 @@ hammer_init_alist_config(void) config = &Vol_super_alist_config; config->bl_radix_init = super_alist_init; + config->bl_radix_recover = super_alist_recover; config->bl_radix_destroy = super_alist_destroy; config->bl_radix_alloc_fwd = super_alist_alloc_fwd; config->bl_radix_alloc_rev = super_alist_alloc_rev; @@ -2608,6 +2704,7 @@ hammer_init_alist_config(void) config = &Clu_slave_alist_config; config->bl_radix_init = buffer_alist_init; + config->bl_radix_recover = buffer_alist_recover; config->bl_radix_destroy = buffer_alist_destroy; config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd; config->bl_radix_alloc_rev = buffer_alist_alloc_rev; diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c new file mode 100644 index 0000000000..34fd223319 --- /dev/null +++ b/sys/vfs/hammer/hammer_recover.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.1 2008/01/09 00:46:22 dillon Exp $ + */ + +#include "hammer.h" + +static void hammer_recover_buffer_stage1(hammer_cluster_t cluster, + int32_t buf_no); +static void hammer_recover_buffer_stage2(hammer_cluster_t cluster, + int32_t buf_no); +static int hammer_recover_record(hammer_cluster_t cluster, + hammer_buffer_t buffer, int32_t rec_offset, + hammer_record_ondisk_t rec); +static int hammer_recover_btree(hammer_cluster_t cluster, + hammer_buffer_t buffer, int32_t rec_offset, + hammer_record_ondisk_t rec); + +/* + * Recover a cluster. The caller has referenced and locked the cluster. + * + * Generally returns 0 on success and EIO if the recovery was unsuccessful. 
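+ *
+ * Recovery proceeds in stages: the cluster's A-lists are reinitialized,
+ * buffers flagged in clu_record_buf_bitmap are re-reserved and their
+ * records and data re-allocated (stage 1), an empty B-Tree root is
+ * constructed, and the surviving records are then re-inserted into the
+ * B-Tree (stage 2).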
+ */
+int
+hammer_recover(hammer_cluster_t cluster)
+{
+	int buf_no;
+	int nbuffers;
+	int32_t r;
+	u_int32_t bitmap;
+
+	return(0);	/* XXX temporarily disabled */
+	Debugger("hammer_recover");
+	KKASSERT(cluster->ondisk->synchronized_tid);
+
+	nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
+	hammer_modify_cluster(cluster);
+
+	/*
+	 * Re-initialize the A-lists.
+	 */
+	hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
+			  HAMMER_ASTATE_FREE);
+	hammer_alist_init(&cluster->alist_btree,
+			  HAMMER_FSBUF_MAXBLKS,
+			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
+			  HAMMER_ASTATE_ALLOC);
+	hammer_alist_init(&cluster->alist_mdata,
+			  HAMMER_FSBUF_MAXBLKS,
+			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
+			  HAMMER_ASTATE_ALLOC);
+	hammer_alist_init(&cluster->alist_record,
+			  HAMMER_FSBUF_MAXBLKS,
+			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
+			  HAMMER_ASTATE_ALLOC);
+
+	/*
+	 * Scan the cluster's clu_record_buf_bitmap, reserve record buffers
+	 * from the master bitmap before we try to recover their data.  Free
+	 * the block of records to alist_record.
+	 *
+	 * We need to mark the blocks as free in alist_record so future
+	 * allocations will dive into the buffer A-lists, but we don't
+	 * want to destroy the underlying buffer A-lists.  Because the
+	 * meta data in cluster->alist_record indicates state 00 (all-allocated
+	 * but not initialized), it will not dive down into the buffer when
+	 * freeing the entire buffer.
+	 */
+	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
+		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
+		if (bitmap == 0) {
+			buf_no = ((buf_no + 32) & ~31) - 1;
+			continue;
+		}
+		if ((bitmap & (1 << (buf_no & 31))) == 0)
+			continue;
+		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
+		KKASSERT(r == buf_no);
+		hammer_alist_free(&cluster->alist_record,
+				  buf_no * HAMMER_FSBUF_MAXBLKS,
+				  HAMMER_FSBUF_MAXBLKS);
+	}
+
+	/*
+	 * Scan the cluster's clu_record_buf_bitmap, reassign buffers
+	 * from alist_master to alist_record, and reallocate individual
+	 * records and any related data references if they meet the criteria.
+	 */
+	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
+		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
+		if (bitmap == 0) {
+			buf_no = ((buf_no + 32) & ~31) - 1;
+			continue;
+		}
+		if ((bitmap & (1 << (buf_no & 31))) == 0)
+			continue;
+		hammer_recover_buffer_stage1(cluster, buf_no);
+	}
+
+	/*
+	 * The cluster is now in good enough shape that general allocations
+	 * are possible.  Construct an empty B-Tree root.
+	 */
+	{
+		hammer_node_t croot;
+		int error;
+
+		croot = hammer_alloc_btree(cluster, &error);
+		if (error == 0) {
+			hammer_modify_node(croot);
+			bzero(croot->ondisk, sizeof(*croot->ondisk));
+			croot->ondisk->count = 0;
+			croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
+			hammer_modify_node_done(croot);
+			cluster->ondisk->clu_btree_root = croot->node_offset;
+		}
+	}
+
+	/*
+	 * Scan the cluster's clu_record_buf_bitmap again and regenerate
+	 * the B-Tree.
+	 *
+	 * XXX B-tree record for cluster-push
+	 */
+	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
+		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
+		if (bitmap == 0) {
+			buf_no = ((buf_no + 32) & ~31) - 1;
+			continue;
+		}
+		if ((bitmap & (1 << (buf_no & 31))) == 0)
+			continue;
+		hammer_recover_buffer_stage2(cluster, buf_no);
+	}
+	hammer_modify_cluster_done(cluster);
+
+	/*
+	 * Validate the parent cluster pointer. XXX
+	 */
+	return(0);
+}
+
+/*
+ * Reassign buf_no as a record buffer and recover any related data
+ * references.
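+ *
+ * Buffers which cannot be read are left in their reserved state on the
+ * master A-list; individual records which fail validation are freed.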
+ */ +static void +hammer_recover_buffer_stage1(hammer_cluster_t cluster, int32_t buf_no) +{ + hammer_record_ondisk_t rec; + hammer_buffer_t buffer; + int32_t rec_no; + int32_t rec_offset; + int error; + + buffer = hammer_get_buffer(cluster, buf_no, 0, &error); + if (error) { + /* + * If we are unable to access the buffer leave it in a + * reserved state on the master alist. + */ + kprintf("hammer_recover_buffer_stage1: error " + "recovering %d:%d:%d\n", + cluster->volume->vol_no, cluster->clu_no, buf_no); + return; + } + + /* + * Recover the buffer, scan and validate allocated records. Records + * which cannot be recovered are freed. + */ + hammer_modify_buffer(buffer); + hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES); + rec_no = -1; + for (;;) { + rec_no = hammer_alist_find(&buffer->alist, rec_no + 1); + if (rec_no == HAMMER_ALIST_BLOCK_NONE) + break; + rec_offset = offsetof(union hammer_fsbuf_ondisk, + record.recs[rec_no]); + rec_offset += buf_no * HAMMER_BUFSIZE; + rec = &buffer->ondisk->record.recs[rec_no]; + error = hammer_recover_record(cluster, buffer, rec_offset, rec); + if (error) { + kprintf("hammer_recover_record: failed %d:%d@%d\n", + cluster->clu_no, buffer->buf_no, rec_offset); + hammer_alist_free(&buffer->alist, rec_no, 1); + } + } + hammer_modify_buffer_done(buffer); + hammer_rel_buffer(buffer, 0); +} + +/* + * Recover a record, at least into a state that doesn't blow up the + * filesystem. Returns 0 on success, non-zero if the record is + * unrecoverable. + */ +static int +hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer, + int32_t rec_offset, hammer_record_ondisk_t rec) +{ + hammer_buffer_t dbuf; + hammer_tid_t syncid = cluster->ondisk->synchronized_tid; + int32_t data_offset; + int32_t data_len; + int32_t nblks; + int32_t dbuf_no; + int32_t dblk_no; + int32_t r; + int error = 0; + + /* + * Discard records created after the synchronization point and + * undo any deletions made after the synchronization point. + */ + if (rec->base.base.create_tid > syncid) + return(EINVAL); + + if (rec->base.base.delete_tid > syncid) + rec->base.base.delete_tid = 0; + + /* + * Validate the record's B-Tree key + */ + if (hammer_btree_cmp(&rec->base.base, + &cluster->ondisk->clu_btree_beg) < 0) { + return(EINVAL); + } + if (hammer_btree_cmp(&rec->base.base, + &cluster->ondisk->clu_btree_end) >= 0) { + return(EINVAL); + } + + /* + * Validate the record's data. If the offset is 0 there is no data + * (or it is zero-fill) and we can return success immediately. + * Otherwise make sure everything is ok. 
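+	 * "Ok" means the data lies entirely within the cluster, does not
+	 * improperly overlap the record itself, and is either a
+	 * whole-buffer allocation or a properly aligned sub-allocation.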
+	 */
+	data_offset = rec->base.data_offset;
+	data_len = rec->base.data_len;
+
+	if (data_len == 0)
+		rec->base.data_offset = 0;
+	if (data_offset == 0)
+		return(0);
+	if (data_offset < HAMMER_BUFSIZE ||
+	    data_offset >= cluster->ondisk->clu_limit ||
+	    data_len < 0 || data_len > HAMMER_MAXDATA ||
+	    data_offset + data_len > cluster->ondisk->clu_limit) {
+		return(EINVAL);
+	}
+
+	/*
+	 * Check data_offset relative to rec_offset
+	 */
+	if (data_offset < rec_offset && data_offset + data_len > rec_offset)
+		return(EINVAL);
+	if (data_offset >= rec_offset &&
+	    data_offset < rec_offset + sizeof(struct hammer_base_record)) {
+		return(EINVAL);
+	}
+
+	/*
+	 * Check for data embedded in the record
+	 */
+	if (data_offset >= rec_offset &&
+	    data_offset < rec_offset + HAMMER_RECORD_SIZE) {
+		if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE)
+			return(EINVAL);
+		return(0);
+	}
+
+	/*
+	 * Recover the allocated data either out of the cluster's master alist
+	 * or as a buffer sub-allocation.
+	 */
+	if ((data_len & HAMMER_BUFMASK) == 0) {
+		if (data_offset & HAMMER_BUFMASK)
+			return(EINVAL);
+		nblks = data_len / HAMMER_BUFSIZE;
+		dbuf_no = data_offset / HAMMER_BUFSIZE;
+
+		r = hammer_alist_alloc_fwd(&cluster->alist_master,
+					   nblks, dbuf_no);
+		if (r == HAMMER_ALIST_BLOCK_NONE)
+			return(EINVAL);
+		if (r != dbuf_no) {
+			hammer_alist_free(&cluster->alist_master, r, nblks);
+			return(EINVAL);
+		}
+	} else {
+		if ((data_offset & ~HAMMER_BUFMASK) !=
+		    ((data_offset + data_len) & ~HAMMER_BUFMASK)) {
+			return(EINVAL);
+		}
+		if ((data_offset & HAMMER_BUFMASK) <
+		     sizeof(struct hammer_fsbuf_head)) {
+			return(EINVAL);
+		}
+		if (data_offset & HAMMER_DATA_BLKMASK)
+			return(EINVAL);
+
+		/*
+		 * Ok, recover the space in the data buffer.
+		 */
+		dbuf_no = data_offset / HAMMER_BUFSIZE;
+		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
+		if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
+			hammer_alist_free(&cluster->alist_master, r, 1);
+		if (r == dbuf_no) {
+			/*
+			 * This is the first time we've tried to recover
+			 * data in this data buffer, reinit it (but don't
+			 * zero it out, obviously).
+			 *
+			 * Calling hammer_initbuffer() marks the data blocks
+			 * within the buffer as being free.
+			 */
+			dbuf = hammer_get_buffer(cluster, dbuf_no,
+						 0, &error);
+			if (error == 0) {
+				hammer_modify_buffer(dbuf);
+				hammer_initbuffer(&dbuf->alist,
+						  &dbuf->ondisk->head,
+						  HAMMER_FSBUF_DATA);
+				dbuf->buf_type = HAMMER_FSBUF_DATA;
+				hammer_modify_buffer_done(dbuf);
+			}
+		} else {
+			/*
+			 * We've seen this data buffer before.
+			 */
+			dbuf = hammer_get_buffer(cluster, dbuf_no,
+						 0, &error);
+		}
+		if (error)
+			return(EINVAL);
+
+		if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
+			hammer_rel_buffer(dbuf, 0);
+			return(EINVAL);
+		}
+
+		/*
+		 * Figure out the data block number and number of blocks.
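+		 * nblks is data_len rounded up to whole HAMMER_DATA_BLKSIZE
+		 * blocks; dblk_no indexes the first data block within the
+		 * buffer's data[] array and is cross-checked against the
+		 * offsetof() calculation below.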
+		 */
+		nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
+		nblks /= HAMMER_DATA_BLKSIZE;
+		dblk_no = ((data_offset & HAMMER_BUFMASK) - offsetof(union hammer_fsbuf_ondisk, data.data)) / HAMMER_DATA_BLKSIZE;
+		if ((data_offset & HAMMER_BUFMASK) != offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
+			kprintf("dblk_no %d does not match data_offset %d/%d\n",
+				dblk_no,
+				offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no]),
+				(data_offset & HAMMER_BUFMASK));
+			hammer_rel_buffer(dbuf, 0);
+			Debugger("bad data");
+			return(EINVAL);
+		}
+		dblk_no *= HAMMER_FSBUF_MAXBLKS;
+		r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks, dblk_no);
+		if (r != dblk_no) {
+			if (r != HAMMER_ALIST_BLOCK_NONE)
+				hammer_alist_free(&cluster->alist_mdata, r, nblks);
+			hammer_rel_buffer(dbuf, 0);
+			return(EINVAL);
+		}
+		hammer_rel_buffer(dbuf, 0);
+	}
+	return(0);
+}
+
+/*
+ * Rebuild the B-Tree for the records residing in the specified buffer.
+ */
+static void
+hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
+{
+	hammer_record_ondisk_t rec;
+	hammer_buffer_t buffer;
+	int32_t rec_no;
+	int32_t rec_offset;
+	int error;
+
+	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
+	if (error) {
+		/*
+		 * If we are unable to access the buffer leave it in a
+		 * reserved state on the master alist.
+		 */
+		kprintf("hammer_recover_buffer_stage2: error "
+			"recovering %d:%d:%d\n",
+			cluster->volume->vol_no, cluster->clu_no, buf_no);
+		return;
+	}
+
+	/*
+	 * Scan the buffer's allocated records and reinsert them into the
+	 * B-Tree.  Records which cannot be reinserted are currently left
+	 * as-is (XXX).
+	 */
+	rec_no = -1;
+	for (;;) {
+		rec_no = hammer_alist_find(&buffer->alist, rec_no + 1);
+		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
+			break;
+		rec_offset = offsetof(union hammer_fsbuf_ondisk,
+				      record.recs[rec_no]);
+		rec_offset += buf_no * HAMMER_BUFSIZE;
+		rec = &buffer->ondisk->record.recs[rec_no];
+		error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
+		if (error) {
+			kprintf("hammer_recover_btree: failed %d:%d@%d\n",
+				cluster->clu_no, buffer->buf_no, rec_offset);
+			/* XXX free the record and its data? */
+			/*hammer_alist_free(&buffer->alist, rec_no, 1);*/
+		}
+	}
+	hammer_rel_buffer(buffer, 0);
+}
+
+static int
+hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
+		      int32_t rec_offset, hammer_record_ondisk_t rec)
+{
+	struct hammer_cursor cursor;
+	union hammer_btree_elm elm;
+	int error;
+
+	error = hammer_init_cursor_cluster(&cursor, cluster);
+	if (error)
+		goto failed;
+	cursor.key_beg = rec->base.base;
+	cursor.flags = HAMMER_CURSOR_INSERT;
+	error = hammer_btree_lookup(&cursor);
+	if (error == 0) {
+		kprintf("hammer_recover_btree: Duplicate record\n");
+		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index], HAMMER_BTREE_TYPE_LEAF, cursor.index);
+		Debugger("duplicate record");
+	}
+	if (error != ENOENT)
+		goto failed;
+
+	elm.leaf.base = rec->base.base;
+	elm.leaf.rec_offset = rec_offset;
+	elm.leaf.data_offset = rec->base.data_offset;
+	elm.leaf.data_len = rec->base.data_len;
+	elm.leaf.data_crc = rec->base.data_crc;
+
+	error = hammer_btree_insert(&cursor, &elm);
+	if (error) {
+		kprintf("hammer_recover_btree: insertion failed\n");
+	}
+	/* XXX cluster pushes? */
+
+failed:
+	hammer_done_cursor(&cursor);
+	return(error);
+}
+
diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c
index ad7614c2bb..0f66c63400 100644
--- a/sys/vfs/hammer/hammer_subs.c
+++ b/sys/vfs/hammer/hammer_subs.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.10 2008/01/01 01:00:03 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.11 2008/01/09 00:46:22 dillon Exp $
  */
 /*
  * HAMMER structural locking
@@ -72,8 +72,10 @@ hammer_lock_ex_try(struct hammer_lock *lock)
 	KKASSERT(lock->refs > 0);
 	crit_enter();
 	if (lock->locktd != td) {
-		if (lock->locktd != NULL || lock->lockcount)
+		if (lock->locktd != NULL || lock->lockcount) {
+			crit_exit();
 			return(EAGAIN);
+		}
 		lock->locktd = td;
 	}
 	KKASSERT(lock->lockcount >= 0);
-- 
2.41.0
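
The hammer_subs.c hunk above is the critical-section fix called out in
the commit message: the contended path of hammer_lock_ex_try() returned
EAGAIN without leaving the critical section.  A stand-alone sketch of
the corrected pattern, using toy stand-ins for the kernel primitives:

    #include <stdio.h>

    /* toy substitutes for crit_enter()/crit_exit(); not the kernel API */
    static int crit_count;
    #define crit_enter()    (++crit_count)
    #define crit_exit()     (--crit_count)
    #define EAGAIN          35

    struct toy_lock {
        void *locktd;           /* owning thread, NULL if unowned */
        int lockcount;
    };

    /*
     * Every path out of the function must pair crit_enter() with
     * crit_exit(), including the failure return.
     */
    static int
    toy_lock_ex_try(struct toy_lock *lock, void *td)
    {
        crit_enter();
        if (lock->locktd != td) {
            if (lock->locktd != NULL || lock->lockcount) {
                crit_exit();    /* the exit the patch adds */
                return(EAGAIN);
            }
            lock->locktd = td;
        }
        ++lock->lockcount;
        crit_exit();
        return(0);
    }

    int
    main(void)
    {
        struct toy_lock lock = { NULL, 0 };
        int me, other;

        printf("first try: %d\n", toy_lock_ex_try(&lock, &me));
        printf("contended try: %d\n", toy_lock_ex_try(&lock, &other));
        printf("crit_count balanced: %d\n", crit_count);
        return(0);
    }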