From bf3b416b8b235a1a971e0678428474d8ae83bb7e Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 14 Jun 2008 01:42:13 +0000 Subject: [PATCH] HAMMER 55: Performance tuning and bug fixes - MEDIA STRUCTURES CHANGED! * BUG-FIX: Fix a race in hammer_rel_mem_record() which could result in a machine lockup. The code could block at an inappropriate time with both the record and a dependency inode pointer left unprotected. * BUG-FIX: The direct-write code could assert on (*error != 0) due to an incorrect conditional in the in-memory record scanning code. * Inode data and directory entry data have been given their own zone as a stop-gap until the low level allocator can be rewritten. * Increase the directory object-id cache from 128 entries to 1024 entries. * General cleanup. * Introduce a separate reblocking domain for directories: 'hammer reblock-dirs'. --- sys/vfs/hammer/hammer.h | 6 +- sys/vfs/hammer/hammer_btree.c | 5 +- sys/vfs/hammer/hammer_cursor.h | 4 +- sys/vfs/hammer/hammer_disk.h | 8 +- sys/vfs/hammer/hammer_flusher.c | 4 +- sys/vfs/hammer/hammer_inode.c | 141 +++++++++++----- sys/vfs/hammer/hammer_io.c | 19 +-- sys/vfs/hammer/hammer_ioctl.h | 6 +- sys/vfs/hammer/hammer_object.c | 291 ++++---------------------------- sys/vfs/hammer/hammer_ondisk.c | 44 +++-- sys/vfs/hammer/hammer_reblock.c | 46 +++-- sys/vfs/hammer/hammer_vnops.c | 6 +- 12 files changed, 237 insertions(+), 343 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index ef1c5f72d5..872ccba88a 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.83 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.84 2008/06/14 01:42:12 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -155,7 +155,7 @@ TAILQ_HEAD(hammer_record_list, hammer_record); * directories to retain fairly localized object ids which in turn * improves reblocking performance and layout. */ -#define OBJID_CACHE_SIZE 128 +#define OBJID_CACHE_SIZE 1024 #define OBJID_CACHE_BULK 100000 typedef struct hammer_objid_cache { @@ -865,7 +865,7 @@ void hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer); hammer_node_t hammer_alloc_btree(hammer_transaction_t trans, int *errorp); void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len, - hammer_off_t *data_offsetp, + u_int16_t rec_type, hammer_off_t *data_offsetp, struct hammer_buffer **data_bufferp, int *errorp); int hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 664ca7d004..2940523f23 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.52 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.53 2008/06/14 01:42:13 dillon Exp $ */ /* @@ -528,7 +528,7 @@ hammer_btree_lookup(hammer_cursor_t cursor) } else { error = btree_search(cursor, 0); } - if (error == 0 && cursor->flags) + if (error == 0) error = hammer_btree_extract(cursor, cursor->flags); return(error); } @@ -602,7 +602,6 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags) elm = &node->elms[cursor->index]; cursor->data = NULL; hmp = cursor->node->hmp; - flags |= cursor->flags & HAMMER_CURSOR_DATAEXTOK; /* * There is nothing to extract for an internal element. 
diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h index 0041466a7c..c6829aff4a 100644 --- a/sys/vfs/hammer/hammer_cursor.h +++ b/sys/vfs/hammer/hammer_cursor.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.20 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.21 2008/06/14 01:42:13 dillon Exp $ */ /* @@ -126,7 +126,7 @@ typedef struct hammer_cursor *hammer_cursor_t; #define HAMMER_CURSOR_DISKEOF 0x0400 #define HAMMER_CURSOR_MEMEOF 0x0800 #define HAMMER_CURSOR_DELBTREE 0x1000 /* ip_delete from b-tree */ -#define HAMMER_CURSOR_DATAEXTOK 0x2000 /* allow data extension */ +#define HAMMER_CURSOR_UNUSED2000 0x2000 #define HAMMER_CURSOR_ASOF 0x4000 /* as-of lookup */ #define HAMMER_CURSOR_CREATE_CHECK 0x8000 /* as-of lookup */ diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 3c1994db77..eff5a99faa 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.35 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.36 2008/06/14 01:42:13 dillon Exp $ */ #ifndef VFS_HAMMER_DISK_H_ @@ -132,7 +132,7 @@ typedef u_int32_t hammer_crc_t; #define HAMMER_ZONE_RESERVED06 0x6000000000000000ULL #define HAMMER_ZONE_RESERVED07 0x7000000000000000ULL #define HAMMER_ZONE_BTREE 0x8000000000000000ULL -#define HAMMER_ZONE_RECORD 0x9000000000000000ULL +#define HAMMER_ZONE_META 0x9000000000000000ULL #define HAMMER_ZONE_LARGE_DATA 0xA000000000000000ULL #define HAMMER_ZONE_SMALL_DATA 0xB000000000000000ULL #define HAMMER_ZONE_RESERVED0C 0xC000000000000000ULL @@ -145,7 +145,7 @@ typedef u_int32_t hammer_crc_t; #define HAMMER_ZONE_UNDO_INDEX 3 #define HAMMER_ZONE_FREEMAP_INDEX 4 #define HAMMER_ZONE_BTREE_INDEX 8 -#define HAMMER_ZONE_RECORD_INDEX 9 +#define HAMMER_ZONE_META_INDEX 9 #define HAMMER_ZONE_LARGE_DATA_INDEX 10 #define HAMMER_ZONE_SMALL_DATA_INDEX 11 @@ -528,7 +528,7 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; #define HAMMER_RECTYPE_LOWEST 1 /* lowest record type avail */ #define HAMMER_RECTYPE_INODE 1 /* inode in obj_id space */ #define HAMMER_RECTYPE_PSEUDO_INODE 2 /* pseudo filesysem */ -#define HAMMER_RECTYPE_CLUSTER 3 /* inter-cluster reference */ +#define HAMMER_RECTYPE_UNUSED03 3 /* inter-cluster reference */ #define HAMMER_RECTYPE_DATA 0x0010 #define HAMMER_RECTYPE_DIRENTRY 0x0011 #define HAMMER_RECTYPE_DB 0x0012 diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 9acbe16d06..268bc8d44d 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.26 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.27 2008/06/14 01:42:13 dillon Exp $ */ /* * HAMMER dependancy flusher thread @@ -247,7 +247,7 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp) while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) { KKASSERT(--panic_count > 0); KKASSERT(io->mod_list == &hmp->lose_list); - TAILQ_REMOVE(io->mod_list, io, mod_entry); + TAILQ_REMOVE(&hmp->lose_list, io, mod_entry); io->mod_list = NULL; if (io->lock.refs == 0) ++hammer_count_refedbufs; diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index fb74ed5911..bdbc00b56b 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.74 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.75 2008/06/14 01:42:13 dillon Exp $ */ #include "hammer.h" @@ -42,7 +42,8 @@ static int hammer_unload_inode(struct hammer_inode *ip); static void hammer_flush_inode_core(hammer_inode_t ip, int flags); static int hammer_setup_child_callback(hammer_record_t rec, void *data); -static int hammer_setup_parent_inodes(hammer_record_t record); +static int hammer_setup_parent_inodes(hammer_inode_t ip); +static int hammer_setup_parent_inodes_helper(hammer_record_t record); static void hammer_inode_wakereclaims(hammer_inode_t ip); #ifdef DEBUG_TRUNCATE @@ -836,8 +837,7 @@ hammer_modify_inode(hammer_inode_t ip, int flags) void hammer_flush_inode(hammer_inode_t ip, int flags) { - hammer_record_t depend; - int r, good; + int good; /* * Trivial 'nothing to flush' case. If the inode is ina SETUP @@ -873,14 +873,7 @@ hammer_flush_inode(hammer_inode_t ip, int flags) * can't flush, 0 means there weren't any dependancies, and * 1 means we have good connectivity. 
*/ - good = 0; - TAILQ_FOREACH(depend, &ip->target_list, target_entry) { - r = hammer_setup_parent_inodes(depend); - if (r < 0 && good == 0) - good = -1; - if (r > 0) - good = 1; - } + good = hammer_setup_parent_inodes(ip); /* * We can continue if good >= 0. Determine how many records @@ -912,9 +905,78 @@ hammer_flush_inode(hammer_inode_t ip, int flags) } /* + * Scan ip->target_list, which is a list of records owned by PARENTS to our + * ip which reference our ip. + * + * XXX This is a huge mess of recursive code, but not one bit of it blocks + * so for now do not ref/deref the structures. Note that if we use the + * ref/rel code later, the rel CAN block. + */ +static int +hammer_setup_parent_inodes(hammer_inode_t ip) +{ + hammer_record_t depend; +#if 0 + hammer_record_t next; + hammer_inode_t pip; +#endif + int good; + int r; + + good = 0; + TAILQ_FOREACH(depend, &ip->target_list, target_entry) { + r = hammer_setup_parent_inodes_helper(depend); + KKASSERT(depend->target_ip == ip); + if (r < 0 && good == 0) + good = -1; + if (r > 0) + good = 1; + } + return(good); + +#if 0 +retry: + good = 0; + next = TAILQ_FIRST(&ip->target_list); + if (next) { + hammer_ref(&next->lock); + hammer_ref(&next->ip->lock); + } + while ((depend = next) != NULL) { + if (depend->target_ip == NULL) { + pip = depend->ip; + hammer_rel_mem_record(depend); + hammer_rel_inode(pip, 0); + goto retry; + } + KKASSERT(depend->target_ip == ip); + next = TAILQ_NEXT(depend, target_entry); + if (next) { + hammer_ref(&next->lock); + hammer_ref(&next->ip->lock); + } + r = hammer_setup_parent_inodes_helper(depend); + if (r < 0 && good == 0) + good = -1; + if (r > 0) + good = 1; + pip = depend->ip; + hammer_rel_mem_record(depend); + hammer_rel_inode(pip, 0); + } + return(good); +#endif +} + +/* + * This helper function takes a record representing the dependancy between + * the parent inode and child inode. 
+ * + * record->ip = parent inode + * record->target_ip = child inode + * * We are asked to recurse upwards and convert the record from SETUP - * to FLUSH if possible. record->ip is a parent of the caller's inode, - * and record->target_ip is the caller's inode. + * to FLUSH if possible. * * Return 1 if the record gives us connectivity * @@ -923,15 +985,15 @@ hammer_flush_inode(hammer_inode_t ip, int flags) * Return -1 if we can't resolve the dependancy and there is no connectivity. */ static int -hammer_setup_parent_inodes(hammer_record_t record) +hammer_setup_parent_inodes_helper(hammer_record_t record) { - hammer_mount_t hmp = record->ip->hmp; - hammer_record_t depend; - hammer_inode_t ip; - int r, good; + hammer_mount_t hmp; + hammer_inode_t pip; + int good; KKASSERT(record->flush_state != HAMMER_FST_IDLE); - ip = record->ip; + pip = record->ip; + hmp = pip->hmp; /* * If the record is already flushing, is it in our flush group? @@ -943,7 +1005,7 @@ hammer_setup_parent_inodes(hammer_record_t record) */ if (record->flush_state == HAMMER_FST_FLUSH) { if (record->flush_group != hmp->flusher.next) { - ip->flags |= HAMMER_INODE_REFLUSH; + pip->flags |= HAMMER_INODE_REFLUSH; return(-1); } if (record->type == HAMMER_MEM_RECORD_ADD) @@ -958,14 +1020,7 @@ hammer_setup_parent_inodes(hammer_record_t record) */ KKASSERT(record->flush_state == HAMMER_FST_SETUP); - good = 0; - TAILQ_FOREACH(depend, &ip->target_list, target_entry) { - r = hammer_setup_parent_inodes(depend); - if (r < 0 && good == 0) - good = -1; - if (r > 0) - good = 1; - } + good = hammer_setup_parent_inodes(pip); /* * We can't flush ip because it has no connectivity (XXX also check @@ -973,7 +1028,7 @@ hammer_setup_parent_inodes(hammer_record_t record) * recurses back down. */ if (good < 0) { - ip->flags |= HAMMER_INODE_REFLUSH; + pip->flags |= HAMMER_INODE_REFLUSH; return(good); } @@ -983,9 +1038,9 @@ hammer_setup_parent_inodes(hammer_record_t record) * may already be flushing. 
The record must be in the same flush * group as the parent. */ - if (ip->flush_state != HAMMER_FST_FLUSH) - hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION); - KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); + if (pip->flush_state != HAMMER_FST_FLUSH) + hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION); + KKASSERT(pip->flush_state == HAMMER_FST_FLUSH); KKASSERT(record->flush_state == HAMMER_FST_SETUP); #if 0 @@ -1004,13 +1059,19 @@ hammer_setup_parent_inodes(hammer_record_t record) return(-1); } else #endif - if (ip->flush_group == ip->hmp->flusher.next) { + if (pip->flush_group == pip->hmp->flusher.next) { /* - * This is the record we wanted to synchronize. + * This is the record we wanted to synchronize. If the + * record went into a flush state while we blocked it + * had better be in the correct flush group. */ - record->flush_state = HAMMER_FST_FLUSH; - record->flush_group = ip->flush_group; - hammer_ref(&record->lock); + if (record->flush_state != HAMMER_FST_FLUSH) { + record->flush_state = HAMMER_FST_FLUSH; + record->flush_group = pip->flush_group; + hammer_ref(&record->lock); + } else { + KKASSERT(record->flush_group == pip->flush_group); + } if (record->type == HAMMER_MEM_RECORD_ADD) return(1); @@ -1024,7 +1085,7 @@ hammer_setup_parent_inodes(hammer_record_t record) * We couldn't resolve the dependancies, request that the * inode be flushed when the dependancies can be resolved. */ - ip->flags |= HAMMER_INODE_REFLUSH; + pip->flags |= HAMMER_INODE_REFLUSH; return(-1); } } @@ -1571,6 +1632,8 @@ hammer_sync_inode(hammer_inode_t ip) * Records which are in our flush group can be unlinked from our * inode now, potentially allowing the inode to be physically * deleted. + * + * This cannot block. 
*/ nlinks = ip->ino_data.nlinks; next = TAILQ_FIRST(&ip->target_list); diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 5baeef1037..d25783ff4a 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.40 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.41 2008/06/14 01:42:13 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -68,7 +68,7 @@ hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type) /* * Helper routine to disassociate a buffer cache buffer from an I/O - * structure. Called with the io structure exclusively locked. + * structure. * * The io may have 0 or 1 references depending on who called us. The * caller is responsible for dealing with the refs. @@ -95,8 +95,8 @@ hammer_io_disassociate(hammer_io_structure_t iou, int elseit) } /* - * elseit is 0 when called from the kernel path, the caller is - * holding the buffer locked and will deal with its final disposition. + * elseit is 0 when called from the kernel path when the io + * might have no references. */ if (elseit) { KKASSERT(iou->io.released == 0); @@ -684,6 +684,9 @@ hammer_io_complete(struct buf *bp) KKASSERT(iou->io.released == 1); + /* + * Deal with people waiting for I/O to drain + */ if (iou->io.running) { --hammer_count_io_running_write; if (--iou->io.hmp->io_running_count == 0) @@ -692,18 +695,14 @@ hammer_io_complete(struct buf *bp) iou->io.running = 0; } - /* - * If no lock references remain and we can acquire the IO lock and - * someone at some point wanted us to flush (B_LOCKED test), then - * try to dispose of the IO. - */ if (iou->io.waiting) { iou->io.waiting = 0; wakeup(iou); } /* - * Someone wanted us to flush, try to clean out the buffer. 
+ * If B_LOCKED is set someone wanted to deallocate the bp at some + * point, do it now if refs has become zero. */ if ((bp->b_flags & B_LOCKED) && iou->io.lock.refs == 0) { KKASSERT(iou->io.modified == 0); diff --git a/sys/vfs/hammer/hammer_ioctl.h b/sys/vfs/hammer/hammer_ioctl.h index 859c94ae14..868ddd5874 100644 --- a/sys/vfs/hammer/hammer_ioctl.h +++ b/sys/vfs/hammer/hammer_ioctl.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.10 2008/05/31 18:37:57 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.11 2008/06/14 01:42:13 dillon Exp $ */ /* * HAMMER ioctl's. This file can be #included from userland @@ -59,10 +59,12 @@ struct hammer_ioc_head { #define HAMMER_IOC_DO_BTREE 0x00020000 /* reblocker */ #define HAMMER_IOC_DO_INODES 0x00040000 /* reblocker */ #define HAMMER_IOC_DO_DATA 0x00080000 /* reblocker */ +#define HAMMER_IOC_DO_DIRS 0x00100000 /* reblocker */ #define HAMMER_IOC_DO_FLAGS (HAMMER_IOC_DO_BTREE | \ HAMMER_IOC_DO_INODES | \ - HAMMER_IOC_DO_DATA) + HAMMER_IOC_DO_DATA | \ + HAMMER_IOC_DO_DIRS) /* * HAMMERIOC_PRUNE diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 4ea25b2dd4..0795b22987 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.67 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.68 2008/06/14 01:42:13 dillon Exp $ */ #include "hammer.h" @@ -63,27 +63,14 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) if (rec1->leaf.base.key > rec2->leaf.base.key) return(1); -#if 0 - /* - * XXX create_tid is set during sync, memory records are always - * current. Do not match against create_tid. 
- */ - if (rec1->leaf.base.create_tid == 0) { - if (rec2->leaf.base.create_tid == 0) - return(0); - return(1); - } - if (rec2->leaf.base.create_tid == 0) - return(-1); - - if (rec1->leaf.base.create_tid < rec2->leaf.base.create_tid) - return(-1); - if (rec1->leaf.base.create_tid > rec2->leaf.base.create_tid) - return(1); -#endif - /* * Never match against an item deleted by the front-end. + * + * rec1 is greater then rec2 if rec1 is marked deleted. + * rec1 is less then rec2 if rec2 is marked deleted. + * + * Multiple deleted records may be present, do not return 0 + * if both are marked deleted. */ if (rec1->flags & HAMMER_RECF_DELETED_FE) return(1); @@ -109,28 +96,12 @@ hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) if (elm->key > rec->leaf.base.key) return(2); -#if 0 - /* - * XXX create_tid is set during sync, memory records are always - * current. Do not match against create_tid. - */ - if (elm->create_tid == 0) { - if (rec->leaf.base.create_tid == 0) - return(0); - return(1); - } - if (rec->leaf.base.create_tid == 0) - return(-1); - if (elm->create_tid < rec->leaf.base.create_tid) - return(-1); - if (elm->create_tid > rec->leaf.base.create_tid) - return(1); -#endif /* * Never match against an item deleted by the front-end. + * elm is less then rec if rec is marked deleted. */ if (rec->flags & HAMMER_RECF_DELETED_FE) - return(1); + return(-1); return(0); } @@ -164,24 +135,12 @@ hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec) return(2); } -#if 0 - if (leaf->base.create_tid == 0) { - if (rec->leaf.base.create_tid == 0) - return(0); - return(1); - } - if (rec->leaf.base.create_tid == 0) - return(-1); - if (leaf->base.create_tid < rec->leaf.base.create_tid) - return(-1); - if (leaf->base.create_tid > rec->leaf.base.create_tid) - return(1); -#endif /* * Never match against an item deleted by the front-end. + * leaf is less then rec if rec is marked deleted. 
*/ if (rec->flags & HAMMER_RECF_DELETED_FE) - return(1); + return(-1); return(0); } @@ -372,6 +331,9 @@ hammer_rel_mem_record(struct hammer_record *record) * Upon release of the last reference wakeup any waiters. * The record structure may get destroyed so callers will * loop up and do a relookup. + * + * WARNING! Record must be removed from RB-TREE before we + * might possibly block. hammer_test_inode() can block! */ ip = record->ip; @@ -380,13 +342,19 @@ hammer_rel_mem_record(struct hammer_record *record) * is destroyed. */ if (record->flags & HAMMER_RECF_DELETED_FE) { + KKASSERT(ip->lock.refs > 0); KKASSERT(record->flush_state != HAMMER_FST_FLUSH); + /* + * target_ip may have zero refs, we have to ref it + * to prevent it from being ripped out from under + * us. + */ if ((target_ip = record->target_ip) != NULL) { TAILQ_REMOVE(&target_ip->target_list, record, target_entry); record->target_ip = NULL; - hammer_test_inode(target_ip); + hammer_ref(&target_ip->lock); } if (record->flags & HAMMER_RECF_ONRBTREE) { @@ -410,6 +378,15 @@ hammer_rel_mem_record(struct hammer_record *record) hammer_test_inode(record->ip); } } + + /* + * Do this test after removing record from the B-Tree. + */ + if (target_ip) { + hammer_test_inode(target_ip); + hammer_rel_inode(target_ip, 0); + } + if (record->flags & HAMMER_RECF_ALLOCDATA) { --hammer_count_record_datas; kfree(record->data, M_HAMMER); @@ -808,6 +785,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, hammer_record_t record; hammer_record_t conflict; int zone; + int flags; /* * Deal with conflicting in-memory records. 
We cannot have multiple @@ -823,10 +801,9 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) { conflict->flags |= HAMMER_RECF_WANTED; tsleep(conflict, 0, "hmrrc3", 0); - hammer_rel_mem_record(conflict); - continue; + } else { + conflict->flags |= HAMMER_RECF_DELETED_FE; } - conflict->flags |= HAMMER_RECF_DELETED_FE; hammer_rel_mem_record(conflict); } @@ -859,6 +836,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, record->leaf.base.localization = HAMMER_LOCALIZE_MISC; record->leaf.data_len = bytes; record->leaf.data_crc = crc32(data, bytes); + flags = record->flags; hammer_ref(&record->lock); /* mem_add eats a reference */ *errorp = hammer_mem_add(record); @@ -909,201 +887,6 @@ hammer_rec_trunc_callback(hammer_record_t record, void *data __unused) return(0); } - -/* - * Backend code - * - * Sync data from a buffer cache buffer (typically) to the filesystem. This - * is called via the strategy called from a cached data source. This code - * is responsible for actually writing a data record out to the disk. - * - * This can only occur non-historically (i.e. 'current' data only). - * - * The file offset must be HAMMER_BUFSIZE aligned but the data length - * can be truncated. The record (currently) always represents a BUFSIZE - * swath of space whether the data is truncated or not. - */ -int -hammer_ip_sync_data(hammer_cursor_t cursor, hammer_inode_t ip, - int64_t offset, void *data, int bytes) -{ - hammer_transaction_t trans = cursor->trans; - struct hammer_btree_leaf_elm elm; - hammer_off_t data_offset; - void *bdata; - int error; - int aligned_bytes; - - KKASSERT((offset & HAMMER_BUFMASK) == 0); - KKASSERT(trans->type == HAMMER_TRANS_FLS); - KKASSERT(bytes != 0); - - /* - * We don't have to do this but it's probably a good idea to - * align data allocations to 64-byte boundaries for future - * expansion. 
- */ - aligned_bytes = (bytes + 15) & ~15; -retry: - hammer_normalize_cursor(cursor); - cursor->key_beg.localization = HAMMER_LOCALIZE_MISC; - cursor->key_beg.obj_id = ip->obj_id; - cursor->key_beg.key = offset + aligned_bytes; - cursor->key_beg.create_tid = trans->tid; - cursor->key_beg.delete_tid = 0; - cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA; - cursor->asof = trans->tid; - cursor->flags &= ~HAMMER_CURSOR_INITMASK; - cursor->flags |= HAMMER_CURSOR_INSERT; - cursor->flags |= HAMMER_CURSOR_BACKEND; - - /* - * Issue a lookup to position the cursor. - */ - error = hammer_btree_lookup(cursor); - if (error == 0) { - kprintf("hammer_ip_sync_data: duplicate data at " - "(%lld,%d) tid %016llx\n", - offset, aligned_bytes, trans->tid); - hammer_print_btree_elm(&cursor->node->ondisk-> - elms[cursor->index], - HAMMER_BTREE_TYPE_LEAF, cursor->index); - panic("Duplicate data"); - error = EIO; - } - if (error != ENOENT) - goto done; - - /* - * Allocate our data. The data buffer is not marked modified (yet) - */ - bdata = hammer_alloc_data(trans, aligned_bytes, &data_offset, - &cursor->data_buffer, &error); - - if (bdata == NULL) - goto done; - - /* - * Fill everything in and insert our B-Tree node. - * - * NOTE: hammer_alloc_data() has already marked the data buffer - * as modified. If we do it again we will generate unnecessary - * undo elements. - */ - elm.base.btype = HAMMER_BTREE_TYPE_RECORD; - elm.base.localization = HAMMER_LOCALIZE_MISC; - elm.base.obj_id = ip->obj_id; - elm.base.key = offset + aligned_bytes; - elm.base.create_tid = trans->tid; - elm.base.delete_tid = 0; - elm.base.rec_type = HAMMER_RECTYPE_DATA; - elm.atime = 0; - elm.data_offset = data_offset; - elm.data_len = aligned_bytes; - - /* - * Copy the data to the allocated buffer. Since we are aligning - * the record size as specified in elm.data_len, make sure to zero - * out any extranious bytes. 
- */ - hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0); - bcopy(data, bdata, bytes); - if (aligned_bytes > bytes) - bzero((char *)bdata + bytes, aligned_bytes - bytes); - hammer_modify_buffer_done(cursor->data_buffer); - elm.data_crc = crc32(bdata, aligned_bytes); - - /* - * Data records can wind up on-disk before the inode itself is - * on-disk. One must assume data records may be on-disk if either - * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set - */ - ip->flags |= HAMMER_INODE_DONDISK; - - error = hammer_btree_insert(cursor, &elm); - if (error == 0) - goto done; - - hammer_blockmap_free(trans, data_offset, aligned_bytes); -done: - if (error == EDEADLK) { - hammer_done_cursor(cursor); - error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip); - if (error == 0) - goto retry; - } - return(error); -} - -#if 0 - -/* - * Backend code which actually performs the write to the media. This - * routine is typically called from the flusher. The bio will be disposed - * of (biodone'd) by this routine. - * - * Iterate the related records and mark for deletion. If existing edge - * records (left and right side) overlap our write they have to be marked - * deleted and new records created, usually referencing a portion of the - * original data. Then add a record to represent the buffer. - */ -int -hammer_dowrite(hammer_cursor_t cursor, hammer_inode_t ip, - off_t file_offset, void *data, int bytes) -{ - int error; - - KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); - - /* - * If the inode is going or gone, just throw away any frontend - * buffers. - */ - if (ip->flags & HAMMER_INODE_DELETED) - return(0); - - /* - * Delete any records overlapping our range. This function will - * (eventually) properly truncate partial overlaps. 
- */ - if (ip->sync_ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { - error = hammer_ip_delete_range(cursor, ip, file_offset, - file_offset, 0); - } else { - error = hammer_ip_delete_range(cursor, ip, file_offset, - file_offset + bytes - 1, 0); - } - - /* - * Add a single record to cover the write. We can write a record - * with only the actual file data - for example, a small 200 byte - * file does not have to write out a 16K record. - * - * While the data size does not have to be aligned, we still do it - * to reduce fragmentation in a future allocation model. - */ - if (error == 0) { - int limit_size; - - if (ip->sync_ino_data.size - file_offset > bytes) { - limit_size = bytes; - } else { - limit_size = (int)(ip->sync_ino_data.size - - file_offset); - KKASSERT(limit_size >= 0); - } - if (limit_size) { - error = hammer_ip_sync_data(cursor, ip, file_offset, - data, limit_size); - } - } - if (error) - Debugger("hammer_dowrite: error"); - return(error); -} - -#endif - /* * Backend code. Sync a record to the media. */ @@ -1240,6 +1023,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) * Wholely cached record, with data. Allocate the data. 
*/ bdata = hammer_alloc_data(trans, record->leaf.data_len, + record->leaf.base.rec_type, &record->leaf.data_offset, &cursor->data_buffer, &error); if (bdata == NULL) @@ -1576,10 +1360,8 @@ next_memory: int64_t base1 = elm->leaf.base.key - elm->leaf.data_len; int64_t base2 = cursor->iprec->leaf.base.key - cursor->iprec->leaf.data_len; - if (base1 == base2) { - kprintf("G"); + if (base1 == base2) r = 0; - } } if (r < 0) { @@ -2030,6 +1812,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes) switch(data_offset & HAMMER_OFF_ZONE_MASK) { case HAMMER_ZONE_LARGE_DATA: case HAMMER_ZONE_SMALL_DATA: + case HAMMER_ZONE_META: hammer_blockmap_free(cursor->trans, data_offset, data_len); break; diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index e232453720..7c696bd4c8 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.56 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.57 2008/06/14 01:42:13 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -529,6 +529,8 @@ again: * cannot become loose once it gains a ref. Loose * buffers will never be in a modified state. This should * only occur on the 0->1 transition of refs. + * + * lose_list can be modified via a biodone() interrupt. 
*/ if (buffer->io.mod_list == &hmp->lose_list) { crit_enter(); /* biodone race against list */ @@ -549,6 +551,7 @@ again: switch(zone) { case HAMMER_ZONE_LARGE_DATA_INDEX: case HAMMER_ZONE_SMALL_DATA_INDEX: + case HAMMER_ZONE_META_INDEX: /* meta-data isn't a meta-buffer */ iotype = HAMMER_STRUCTURE_DATA_BUFFER; break; case HAMMER_ZONE_UNDO_INDEX: @@ -739,10 +742,14 @@ hammer_ref_buffer(hammer_buffer_t buffer) hammer_ref(&buffer->io.lock); /* + * At this point a biodone() will not touch the buffer other then + * incidental bits. However, lose_list can be modified via + * a biodone() interrupt. + * * No longer loose */ if (buffer->io.mod_list == &buffer->io.hmp->lose_list) { - crit_enter(); /* biodone race against list */ + crit_enter(); TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry); buffer->io.mod_list = NULL; crit_exit(); @@ -1264,24 +1271,39 @@ hammer_alloc_btree(hammer_transaction_t trans, int *errorp) */ void * hammer_alloc_data(hammer_transaction_t trans, int32_t data_len, - hammer_off_t *data_offsetp, + u_int16_t rec_type, hammer_off_t *data_offsetp, struct hammer_buffer **data_bufferp, int *errorp) { void *data; + int zone; /* * Allocate data */ if (data_len) { - if (data_len < HAMMER_BUFSIZE) { - *data_offsetp = hammer_blockmap_alloc(trans, - HAMMER_ZONE_SMALL_DATA_INDEX, - data_len, errorp); - } else { - *data_offsetp = hammer_blockmap_alloc(trans, - HAMMER_ZONE_LARGE_DATA_INDEX, - data_len, errorp); + switch(rec_type) { + case HAMMER_RECTYPE_INODE: + case HAMMER_RECTYPE_PSEUDO_INODE: + case HAMMER_RECTYPE_DIRENTRY: + case HAMMER_RECTYPE_EXT: + case HAMMER_RECTYPE_FIX: + zone = HAMMER_ZONE_META_INDEX; + break; + case HAMMER_RECTYPE_DATA: + case HAMMER_RECTYPE_DB: + if (data_len <= HAMMER_BUFSIZE / 2) + zone = HAMMER_ZONE_SMALL_DATA_INDEX; + else + zone = HAMMER_ZONE_LARGE_DATA_INDEX; + break; + default: + panic("hammer_alloc_data: rec_type %04x unknown", + rec_type); + zone = 0; /* NOT REACHED */ + break; } + *data_offsetp = 
hammer_blockmap_alloc(trans, zone, + data_len, errorp); } else { *data_offsetp = 0; } diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c index 59c6d45915..cc90465626 100644 --- a/sys/vfs/hammer/hammer_reblock.c +++ b/sys/vfs/hammer/hammer_reblock.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.17 2008/06/09 04:19:10 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.18 2008/06/14 01:42:13 dillon Exp $ */ /* * HAMMER reblocker - This code frees up fragmented physical space @@ -165,9 +165,9 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock, { hammer_off_t tmp_offset; int error; - int zone; int bytes; int cur; + int iocflags; error = 0; @@ -181,17 +181,42 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock, if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) return(0); tmp_offset = elm->leaf.data_offset; - zone = HAMMER_ZONE_DECODE(tmp_offset); /* can be 0 */ - if ((zone == HAMMER_ZONE_SMALL_DATA_INDEX || - zone == HAMMER_ZONE_LARGE_DATA_INDEX) && - error == 0 && (reblock->head.flags & (HAMMER_IOC_DO_DATA | HAMMER_IOC_DO_INODES))) { + if (tmp_offset == 0) + goto skip; + if (error) + goto skip; + + /* + * NOTE: Localization restrictions may also have been set-up, we can't + * just set the match flags willy-nilly here. 
+ */ + switch(elm->leaf.base.rec_type) { + case HAMMER_RECTYPE_INODE: + case HAMMER_RECTYPE_PSEUDO_INODE: + iocflags = HAMMER_IOC_DO_INODES; + break; + case HAMMER_RECTYPE_EXT: + case HAMMER_RECTYPE_FIX: + case HAMMER_RECTYPE_DIRENTRY: + iocflags = HAMMER_IOC_DO_DIRS; + break; + case HAMMER_RECTYPE_DATA: + case HAMMER_RECTYPE_DB: + iocflags = HAMMER_IOC_DO_DATA; + break; + default: + iocflags = 0; + break; + } + if (reblock->head.flags & iocflags) { ++reblock->data_count; reblock->data_byte_count += elm->leaf.data_len; bytes = hammer_blockmap_getfree(cursor->trans->hmp, tmp_offset, &cur, &error); if (hammer_debug_general & 0x4000) kprintf("D %6d/%d\n", bytes, reblock->free_level); - if (error == 0 && cur == 0 && bytes >= reblock->free_level) { + if (error == 0 && (cur == 0 || reblock->free_level == 0) && + bytes >= reblock->free_level) { error = hammer_cursor_upgrade(cursor); if (error == 0) { error = hammer_reblock_data(reblock, @@ -209,15 +234,15 @@ skip: * Reblock a B-Tree internal or leaf node. 
*/ tmp_offset = cursor->node->node_offset; - zone = HAMMER_ZONE_DECODE(tmp_offset); - if (zone == HAMMER_ZONE_BTREE_INDEX && cursor->index == 0 && + if (cursor->index == 0 && error == 0 && (reblock->head.flags & HAMMER_IOC_DO_BTREE)) { ++reblock->btree_count; bytes = hammer_blockmap_getfree(cursor->trans->hmp, tmp_offset, &cur, &error); if (hammer_debug_general & 0x4000) kprintf("B %6d/%d\n", bytes, reblock->free_level); - if (error == 0 && cur == 0 && bytes >= reblock->free_level) { + if (error == 0 && (cur == 0 || reblock->free_level == 0) && + bytes >= reblock->free_level) { error = hammer_cursor_upgrade(cursor); if (error == 0) { if (cursor->parent) @@ -265,6 +290,7 @@ hammer_reblock_data(struct hammer_ioc_reblock *reblock, if (error) return (error); ndata = hammer_alloc_data(cursor->trans, elm->leaf.data_len, + elm->leaf.base.rec_type, &ndata_offset, &data_buffer, &error); if (error) goto done; diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 37a6d4feea..5a995b6be7 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.69 2008/06/13 00:25:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.70 2008/06/14 01:42:13 dillon Exp $ */ #include @@ -1891,7 +1891,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) cursor.key_beg.obj_type = 0; cursor.key_beg.key = bio->bio_offset + 1; cursor.asof = ip->obj_asof; - cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK; + cursor.flags |= HAMMER_CURSOR_ASOF; cursor.key_end = cursor.key_beg; KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); @@ -2122,7 +2122,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) if (cursor.key_beg.key < 0) cursor.key_beg.key = 0; cursor.asof = ip->obj_asof; - cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK; + cursor.flags |= HAMMER_CURSOR_ASOF; cursor.key_end = cursor.key_beg; KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); -- 2.41.0