From 0832c9bbe3f718d5923b84fddbf4848999d755f2 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 8 Jun 2008 18:16:26 +0000
Subject: [PATCH] HAMMER 53B/Many: Complete overhaul of strategy code, reservations, etc

* Completely overhaul the strategy code.  Implement direct reads and
  writes for all cases.  REMOVE THE BACKEND BIO QUEUE.  BIOs are no
  longer queued to the flusher under any circumstances.  Remove numerous
  hacks that were previously emplaced to deal with BIOs being queued to
  the flusher.

* Add a mechanism to invalidate buffer cache buffers that might be
  shadowed by direct I/O.  e.g., if a strategy write uses the vnode's
  bio directly, there may be a shadow hammer_buffer that will then
  become stale and must be invalidated.

* Implement a reservation tracking structure (hammer_reserve) to track
  storage reservations made by the frontend.  The backend will not
  attempt to free or reuse reserved space if it encounters it.  Use
  reservations to back cached holes (struct hammer_hole) for the same
  reason.

* Index hammer_buffer on the zone-X offset instead of the zone-2 offset.
  Base the RB tree in the hammer_mount instead of the (zone-2)
  hammer_volume.  This removes nearly all blockmap lookup operations
  from the critical path.

* Do a much better job tracking cached dirty data when calculating
  whether the filesystem will become full.

* Fix a critical bug in the CRC generation of short data buffers.

* Fix a VM deadlock.

* Use 16-byte alignment for all on-disk data instead of 8-byte alignment.

* Major code cleanup.

As of this commit, write performance is extremely good.
---
 sys/vfs/hammer/hammer.h | 57 ++---
 sys/vfs/hammer/hammer_blockmap.c | 344 ++++++++++++++++++++++--------
 sys/vfs/hammer/hammer_freemap.c | 3 +-
 sys/vfs/hammer/hammer_inode.c | 121 +++--------
 sys/vfs/hammer/hammer_io.c | 103 ++++-----
 sys/vfs/hammer/hammer_object.c | 252 +++++++++++++---------
 sys/vfs/hammer/hammer_ondisk.c | 354 +++++++++----------------------
 sys/vfs/hammer/hammer_recover.c | 8 +-
 sys/vfs/hammer/hammer_vfsops.c | 21 +-
 sys/vfs/hammer/hammer_vnops.c | 134 ++++++++----
 10 files changed, 724 insertions(+), 673 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 4a699cafce..54ee7c57ac 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -31,7 +31,7 @@
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.74 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.75 2008/06/08 18:16:26 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -191,7 +191,7 @@ struct hammer_rec_rb_tree; struct hammer_record; RB_HEAD(hammer_rec_rb_tree, hammer_record); RB_PROTOTYPEX(hammer_rec_rb_tree, INFO, hammer_record, rb_node, - hammer_rec_rb_compare, hammer_base_elm_t); + hammer_rec_rb_compare, hammer_btree_leaf_elm_t); TAILQ_HEAD(hammer_node_list, hammer_node); @@ -210,11 +210,10 @@ struct hammer_inode { int cursor_ip_refs; /* sanity */ int rsv_databufs; int rsv_recs; + int idle_wakeup; struct vnode *vp; struct lockf advlock; struct hammer_lock lock; /* sync copy interlock */ - TAILQ_HEAD(, bio) bio_list; /* BIOs to flush out */ - TAILQ_HEAD(, bio) bio_alt_list; /* BIOs to flush out */ off_t trunc_off; struct hammer_btree_leaf_elm ino_leaf; /* in-memory cache */ struct hammer_inode_data ino_data; /* in-memory cache */ @@ -302,6 +301,7 @@ struct hammer_record { int flush_group; hammer_record_type_t type; struct hammer_lock lock; + struct hammer_reserve *resv; struct hammer_inode *ip; struct hammer_inode *target_ip; struct hammer_btree_leaf_elm leaf; @@ -321,7 +321,8 @@ typedef struct hammer_record *hammer_record_t; #define HAMMER_RECF_DELETED_BE 0x0008 /* deleted (backend) */ #define HAMMER_RECF_UNUSED0010 0x0010 #define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */ -#define HAMMER_RECF_WANTED 0x0040 +#define HAMMER_RECF_WANTED 0x0040 /* wanted by the frontend */ +#define HAMMER_RECF_WANTIDLE 0x0080 /* wanted when idle */ #define HAMMER_RECF_CONVERT_DELETE 0x0100 /* special case */ /* @@ -331,10 +332,13 @@ struct hammer_volume; struct hammer_buffer; struct hammer_node; struct hammer_undo; +struct hammer_reserve; + RB_HEAD(hammer_vol_rb_tree, hammer_volume); RB_HEAD(hammer_buf_rb_tree, hammer_buffer); RB_HEAD(hammer_nod_rb_tree, hammer_node); RB_HEAD(hammer_und_rb_tree, hammer_undo); +RB_HEAD(hammer_res_rb_tree, hammer_reserve); RB_PROTOTYPE2(hammer_vol_rb_tree, hammer_volume, rb_node, hammer_vol_rb_compare, int32_t); @@ -344,6 +348,8 @@ RB_PROTOTYPE2(hammer_nod_rb_tree, hammer_node, rb_node, hammer_nod_rb_compare, hammer_off_t); RB_PROTOTYPE2(hammer_und_rb_tree, hammer_undo, rb_node, hammer_und_rb_compare, hammer_off_t); +RB_PROTOTYPE2(hammer_res_rb_tree, hammer_reserve, rb_node, + hammer_res_rb_compare, hammer_off_t); /* * IO management - embedded at the head of various in-memory structures @@ -402,7 +408,6 @@ typedef struct hammer_io *hammer_io_t; struct hammer_volume { struct hammer_io io; RB_ENTRY(hammer_volume) rb_node; - struct hammer_buf_rb_tree rb_bufs_root; struct hammer_volume_ondisk *ondisk; int32_t vol_no; int64_t nblocks; /* note: special calculation for statfs */ @@ -425,8 +430,8 @@ struct hammer_buffer { RB_ENTRY(hammer_buffer) rb_node; void *ondisk; struct hammer_volume *volume; - hammer_off_t zone2_offset; hammer_off_t zoneX_offset; + hammer_off_t zone2_offset; struct hammer_node_list clist; }; @@ -505,12 +510,21 @@ typedef struct hammer_holes *hammer_holes_t; struct hammer_hole { TAILQ_ENTRY(hammer_hole) entry; - hammer_off_t offset; + struct hammer_reserve *resv; + hammer_off_t zone_offset; int bytes; }; typedef struct hammer_hole *hammer_hole_t; +struct hammer_reserve { + RB_ENTRY(hammer_reserve) rb_node; + hammer_off_t zone_offset; + int refs; +}; + +typedef struct hammer_reserve *hammer_reserve_t; + #include "hammer_cursor.h" /* @@ -537,6 +551,8 @@ struct 
hammer_mount { struct hammer_vol_rb_tree rb_vols_root; struct hammer_nod_rb_tree rb_nods_root; struct hammer_und_rb_tree rb_undo_root; + struct hammer_res_rb_tree rb_resv_root; + struct hammer_buf_rb_tree rb_bufs_root; struct hammer_volume *rootvol; struct hammer_base_elm root_btree_beg; struct hammer_base_elm root_btree_end; @@ -557,7 +573,6 @@ struct hammer_mount { int flusher_exiting; hammer_tid_t flusher_tid; /* last flushed transaction id */ hammer_off_t flusher_undo_start; /* UNDO window for flushes */ - int reclaim_count; thread_t flusher_td; u_int check_interrupt; uuid_t fsid; @@ -565,6 +580,7 @@ struct hammer_mount { struct hammer_io_list volu_list; /* dirty undo buffers */ struct hammer_io_list undo_list; /* dirty undo buffers */ struct hammer_io_list data_list; /* dirty data buffers */ + struct hammer_io_list alt_data_list; /* dirty data buffers */ struct hammer_io_list meta_list; /* dirty meta bufs */ struct hammer_io_list lose_list; /* loose buffers */ int locked_dirty_count; /* meta/volu count */ @@ -621,6 +637,7 @@ extern int hammer_count_volumes; extern int hammer_count_buffers; extern int hammer_count_nodes; extern int hammer_count_dirtybufs; +extern int hammer_count_reservations; extern int hammer_limit_dirtybufs; extern int hammer_limit_irecs; extern int hammer_limit_recs; @@ -740,18 +757,15 @@ void *hammer_bread(struct hammer_mount *hmp, hammer_off_t off, int *errorp, struct hammer_buffer **bufferp); void *hammer_bnew(struct hammer_mount *hmp, hammer_off_t off, int *errorp, struct hammer_buffer **bufferp); -void hammer_binval(hammer_mount_t hmp, hammer_off_t zone2_offset); hammer_volume_t hammer_get_root_volume(hammer_mount_t hmp, int *errorp); -void hammer_cleanup_write_io(hammer_inode_t ip); hammer_volume_t hammer_get_volume(hammer_mount_t hmp, int32_t vol_no, int *errorp); hammer_buffer_t hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, int isnew, int *errorp); -void hammer_clrxlate_buffer(hammer_mount_t hmp, - hammer_off_t buf_offset); -void hammer_uncache_buffer(struct hammer_mount *hmp, hammer_off_t off); +void hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset, + hammer_off_t zone2_offset, int bytes); int hammer_ref_volume(hammer_volume_t volume); int hammer_ref_buffer(hammer_buffer_t buffer); @@ -778,12 +792,6 @@ void hammer_flush_node(hammer_node_t node); void hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer); hammer_node_t hammer_alloc_btree(hammer_transaction_t trans, int *errorp); -void *hammer_alloc_record(hammer_transaction_t trans, - hammer_off_t *rec_offp, u_int16_t rec_type, - struct hammer_buffer **rec_bufferp, - int32_t data_len, void **datap, - hammer_off_t *data_offp, - struct hammer_buffer **data_bufferp, int *errorp); void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len, hammer_off_t *data_offsetp, struct hammer_buffer **data_bufferp, int *errorp); @@ -801,8 +809,10 @@ void hammer_freemap_free(hammer_transaction_t trans, hammer_off_t phys_offset, int hammer_checkspace(hammer_mount_t hmp); hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone, int bytes, int *errorp); -hammer_off_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone, - int bytes, int *errorp); +hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone, + int bytes, hammer_off_t *zone_offp, int *errorp); +void hammer_blockmap_reserve_complete(hammer_mount_t hmp, + hammer_reserve_t resv); void hammer_blockmap_free(hammer_transaction_t trans, hammer_off_t bmap_off, int bytes); int 
hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off, @@ -845,7 +855,7 @@ int hammer_ip_del_directory(struct hammer_transaction *trans, hammer_cursor_t cursor, hammer_inode_t dip, hammer_inode_t ip); hammer_record_t hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, - void *data, int bytes, int *force_altp); + void *data, int bytes, int *errorp); int hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size); int hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record); @@ -865,7 +875,6 @@ int hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag, void hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type); -void hammer_io_reinit(hammer_io_t io, enum hammer_io_type type); int hammer_io_read(struct vnode *devvp, struct hammer_io *io, hammer_off_t limit); int hammer_io_new(struct vnode *devvp, struct hammer_io *io); diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c index f4e240df92..a3b70fb2a4 100644 --- a/sys/vfs/hammer/hammer_blockmap.c +++ b/sys/vfs/hammer/hammer_blockmap.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.15 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.16 2008/06/08 18:16:26 dillon Exp $ */ /* @@ -42,9 +42,26 @@ static hammer_off_t hammer_find_hole(hammer_mount_t hmp, hammer_holes_t holes, int bytes); static void hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes, - hammer_off_t offset, int bytes); + hammer_off_t zone_offset, int bytes); static void hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes, - hammer_off_t offset); + hammer_off_t base_offset); +static int hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2); + +/* + * Reserved big-blocks red-black tree support + */ +RB_GENERATE2(hammer_res_rb_tree, hammer_reserve, rb_node, + hammer_res_rb_compare, hammer_off_t, zone_offset); + +static int +hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2) +{ + if (res1->zone_offset < res2->zone_offset) + return(-1); + if (res1->zone_offset > res2->zone_offset) + return(1); + return(0); +} /* * Allocate a big-block from the freemap and stuff it into the blockmap @@ -82,8 +99,10 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone, int bytes, int *errorp) { + hammer_mount_t hmp; hammer_volume_t root_volume; hammer_blockmap_t rootmap; + hammer_reserve_t resv; struct hammer_blockmap_layer1 *layer1; struct hammer_blockmap_layer2 *layer2; hammer_buffer_t buffer1 = NULL; @@ -91,23 +110,13 @@ hammer_blockmap_alloc(hammer_transaction_t trans, int zone, hammer_buffer_t buffer3 = NULL; hammer_off_t tmp_offset; hammer_off_t next_offset; + hammer_off_t result_offset; hammer_off_t layer1_offset; hammer_off_t layer2_offset; - hammer_off_t bigblock_offset; int loops = 0; int skip_amount; - int used_hole; - KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); - root_volume = hammer_get_root_volume(trans->hmp, errorp); - if (*errorp) - return(0); - rootmap = &trans->hmp->blockmap[zone]; - KKASSERT(rootmap->phys_offset != 0); - KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) == - HAMMER_ZONE_RAW_BUFFER_INDEX); - KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone); - KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone); + hmp = trans->hmp; /* * Deal with alignment and buffer-boundary issues. 
@@ -115,23 +124,35 @@ hammer_blockmap_alloc(hammer_transaction_t trans, int zone, * Be careful, certain primary alignments are used below to allocate * new blockmap blocks. */ - bytes = (bytes + 7) & ~7; + bytes = (bytes + 15) & ~15; KKASSERT(bytes > 0 && bytes <= HAMMER_BUFSIZE); - - lockmgr(&trans->hmp->blockmap_lock, LK_EXCLUSIVE|LK_RETRY); + KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); /* - * Try to use a known-free hole, otherwise append. + * Try to use a known-free hole. */ - next_offset = hammer_find_hole(trans->hmp, &trans->hmp->holes[zone], - bytes); - if (next_offset == 0) { - next_offset = rootmap->next_offset; - used_hole = 0; - } else { - used_hole = 1; + result_offset = hammer_find_hole(hmp, &trans->hmp->holes[zone], bytes); + if (result_offset) { + *errorp = 0; + hammer_blockmap_free(trans, result_offset, -bytes); + return(result_offset); } + /* + * Otherwise scan for space + */ + root_volume = hammer_get_root_volume(hmp, errorp); + if (*errorp) + return(0); + rootmap = &hmp->blockmap[zone]; + KKASSERT(rootmap->phys_offset != 0); + KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) == + HAMMER_ZONE_RAW_BUFFER_INDEX); + KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone); + KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone); + + lockmgr(&hmp->blockmap_lock, LK_EXCLUSIVE|LK_RETRY); + next_offset = rootmap->next_offset; again: /* * The allocation request may not cross a buffer boundary. @@ -140,7 +161,7 @@ again: if ((next_offset ^ tmp_offset) & ~HAMMER_BUFMASK64) { skip_amount = HAMMER_BUFSIZE - ((int)next_offset & HAMMER_BUFMASK); - hammer_add_hole(trans->hmp, &trans->hmp->holes[zone], + hammer_add_hole(hmp, &hmp->holes[zone], next_offset, skip_amount); next_offset = tmp_offset & ~HAMMER_BUFMASK64; } @@ -151,7 +172,7 @@ again: */ layer1_offset = rootmap->phys_offset + HAMMER_BLOCKMAP_LAYER1_OFFSET(next_offset); - layer1 = hammer_bread(trans->hmp, layer1_offset, errorp, &buffer1); + layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1); KKASSERT(*errorp == 0); KKASSERT(next_offset <= rootmap->alloc_offset); @@ -195,11 +216,11 @@ again: ((next_offset ^ rootmap->alloc_offset) & ~HAMMER_BLOCKMAP_LAYER2_MASK) != 0) { next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2_MASK) & ~HAMMER_BLOCKMAP_LAYER2_MASK; - if (next_offset >= trans->hmp->zone_limits[zone]) { + if (next_offset >= hmp->zone_limits[zone]) { hkprintf("blockmap wrap1\n"); next_offset = HAMMER_ZONE_ENCODE(zone, 0); if (++loops == 2) { /* XXX poor-man's */ - next_offset = 0; + result_offset = 0; *errorp = ENOSPC; goto done; } @@ -212,7 +233,7 @@ again: */ layer2_offset = layer1->phys_offset + HAMMER_BLOCKMAP_LAYER2_OFFSET(next_offset); - layer2 = hammer_bread(trans->hmp, layer2_offset, errorp, &buffer2); + layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2); KKASSERT(*errorp == 0); /* @@ -229,8 +250,13 @@ again: /* * We are at the beginning of a new bigblock */ - if (next_offset == rootmap->alloc_offset || - layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) { + resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, + next_offset & ~HAMMER_LARGEBLOCK_MASK64); + + if (resv) { + goto skip; + } else if (next_offset == rootmap->alloc_offset || + layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) { /* * Allocate the bigblock in layer2 if diving into * uninitialized space or if the block was previously @@ -246,12 +272,13 @@ again: * We have encountered a block that is already * partially allocated. We must skip this block. 
*/ +skip: next_offset += HAMMER_LARGEBLOCK_SIZE; if (next_offset >= trans->hmp->zone_limits[zone]) { next_offset = HAMMER_ZONE_ENCODE(zone, 0); hkprintf("blockmap wrap2\n"); if (++loops == 2) { /* XXX poor-man's */ - next_offset = 0; + result_offset = 0; *errorp = ENOSPC; goto done; } @@ -278,31 +305,39 @@ again: KKASSERT(layer2->bytes_free >= 0); /* - * If the buffer was completely free we do not have to read it from - * disk, call hammer_bnew() to instantiate it. + * If we are allocating from the base of a new buffer we can avoid + * a disk read by calling hammer_bnew(). */ if ((next_offset & HAMMER_BUFMASK) == 0) { - bigblock_offset = layer2->u.phys_offset + - (next_offset & HAMMER_LARGEBLOCK_MASK64); - hammer_bnew(trans->hmp, bigblock_offset, errorp, &buffer3); + hammer_bnew(trans->hmp, next_offset, errorp, &buffer3); } + result_offset = next_offset; /* - * Adjust our iterator and alloc_offset. The layer1 and layer2 - * space beyond alloc_offset is uninitialized. alloc_offset must - * be big-block aligned. + * Process allocated result_offset */ - if (used_hole == 0) { - hammer_modify_volume(trans, root_volume, NULL, 0); - rootmap->next_offset = next_offset + bytes; - if (rootmap->alloc_offset < rootmap->next_offset) { - rootmap->alloc_offset = - (rootmap->next_offset + HAMMER_LARGEBLOCK_MASK) & - ~HAMMER_LARGEBLOCK_MASK64; +done: + hammer_modify_volume(NULL, root_volume, NULL, 0); + if (result_offset) { + if (result_offset == next_offset) { + rootmap->next_offset = next_offset + bytes; + } else { + rootmap->next_offset = next_offset; } - hammer_modify_volume_done(root_volume); + } else { + rootmap->next_offset = next_offset; } -done: + if (rootmap->alloc_offset < rootmap->next_offset) { + rootmap->alloc_offset = + (rootmap->next_offset + HAMMER_LARGEBLOCK_MASK) & + ~HAMMER_LARGEBLOCK_MASK64; + } + hammer_modify_volume_done(root_volume); + lockmgr(&trans->hmp->blockmap_lock, LK_RELEASE); + + /* + * Cleanup + */ if (buffer1) hammer_rel_buffer(buffer1, 0); if (buffer2) @@ -310,8 +345,8 @@ done: if (buffer3) hammer_rel_buffer(buffer3, 0); hammer_rel_volume(root_volume, 0); - lockmgr(&trans->hmp->blockmap_lock, LK_RELEASE); - return(next_offset); + + return(result_offset); } /* @@ -329,8 +364,9 @@ done: * If we return 0 a reservation was not possible and the caller must queue * the I/O to the backend. */ -hammer_off_t -hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, int *errorp) +hammer_reserve_t +hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, + hammer_off_t *zone_offp, int *errorp) { hammer_volume_t root_volume; hammer_blockmap_t rootmap; @@ -343,14 +379,17 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, int *errorp) hammer_off_t next_offset; hammer_off_t layer1_offset; hammer_off_t layer2_offset; - hammer_off_t bigblock_offset; + hammer_reserve_t resv; int loops = 0; int skip_amount; + /* + * Setup + */ KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); root_volume = hammer_get_root_volume(hmp, errorp); if (*errorp) - return(0); + return(NULL); rootmap = &hmp->blockmap[zone]; KKASSERT(rootmap->phys_offset != 0); KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) == @@ -364,7 +403,7 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, int *errorp) * Be careful, certain primary alignments are used below to allocate * new blockmap blocks. 
*/ - bytes = (bytes + 7) & ~7; + bytes = (bytes + 15) & ~15; KKASSERT(bytes > 0 && bytes <= HAMMER_BUFSIZE); lockmgr(&hmp->blockmap_lock, LK_EXCLUSIVE|LK_RETRY); @@ -376,9 +415,9 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, int *errorp) */ next_offset = rootmap->next_offset; again: + resv = NULL; if (next_offset >= rootmap->alloc_offset) { if (++loops == 2) { /* XXX poor-man's */ - next_offset = 0; *errorp = ENOSPC; goto done; } @@ -434,17 +473,40 @@ again: KKASSERT(*errorp == 0); /* - * Check CRC if not allocating into uninitialized space + * Check CRC if not allocating into uninitialized space (which we + * aren't when reserving space). */ if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) { Debugger("CRC FAILED: LAYER2"); } + /* + * Acquire the related reservation structure. If it exists we can + * only use the bigblock if our current next_offset is already in + * it. + */ + resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, + next_offset & ~HAMMER_LARGEBLOCK_MASK64); + if ((next_offset & HAMMER_LARGEBLOCK_MASK64) == 0) { /* - * We are at the beginning of a new bigblock + * We are at the beginning of a new bigblock. + * + * (1) If the bigblock has already been reserved do not + * try to use it, skip it. + * + * (2) If the bigblock has not been allocated then allocate + * it. + * + * (3) If the bigblock is not completely free we have no + * visibility into what portions may have been allocated, + * so skip it. */ - if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) { + + if (resv) { + next_offset += HAMMER_LARGEBLOCK_SIZE; + goto again; + } else if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) { struct hammer_transaction trans; hammer_start_transaction(&trans, hmp); @@ -455,7 +517,6 @@ again: buffer2, layer2); hammer_sync_unlock(&trans); } else { - hkprintf("e"); hammer_sync_lock_sh(&trans); hammer_blockmap_llalloc(&trans, next_offset, errorp, @@ -466,7 +527,7 @@ again: } hammer_done_transaction(&trans); if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) { - next_offset = 0; + resv = NULL; goto done; } } else if (layer2->bytes_free != HAMMER_LARGEBLOCK_SIZE) { @@ -496,20 +557,45 @@ again: KKASSERT(layer2->bytes_free >= 0); /* - * Reservations are used for direct I/O, make sure there is no - * zone-2 bp cached in the device layer. + * If we are not reserving a whole buffer but are at the start of + * a new block, call hammer_bnew() to avoid a disk read. + * + * If we are reserving a whole buffer the caller will probably use + * a direct read, so do nothing. */ - bigblock_offset = layer2->u.phys_offset + - (next_offset & HAMMER_LARGEBLOCK_MASK64); - hammer_binval(hmp, bigblock_offset); + if (bytes < HAMMER_BUFSIZE && (next_offset & HAMMER_BUFMASK) == 0) { + hammer_bnew(hmp, next_offset, errorp, &buffer3); + } + + /* + * Make the reservation + */ + if (resv) { + ++resv->refs; + } else { + resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO); + resv->refs = 1; + resv->zone_offset = next_offset & ~HAMMER_LARGEBLOCK_MASK64; + RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv); + ++hammer_count_reservations; + } /* * Adjust our iterator and alloc_offset. The layer1 and layer2 * space beyond alloc_offset is uninitialized. alloc_offset must * be big-block aligned. 
*/ - rootmap->next_offset = next_offset + bytes; done: + if (resv) { + hammer_modify_volume(NULL, root_volume, NULL, 0); + rootmap->next_offset = next_offset + bytes; + hammer_modify_volume_done(root_volume); + } else if (rootmap->next_offset != next_offset) { + hammer_modify_volume(NULL, root_volume, NULL, 0); + rootmap->next_offset = next_offset; + hammer_modify_volume_done(root_volume); + } + if (buffer1) hammer_rel_buffer(buffer1, 0); if (buffer2) @@ -518,7 +604,24 @@ done: hammer_rel_buffer(buffer3, 0); hammer_rel_volume(root_volume, 0); lockmgr(&hmp->blockmap_lock, LK_RELEASE); - return(next_offset); + *zone_offp = next_offset; + + return(resv); +} + +/* + * A record with a storage resolution calls this function when it is + * being freed. The storage may or may not have actually been allocated. + */ +void +hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv) +{ + KKASSERT(resv->refs > 0); + if (--resv->refs == 0) { + RB_REMOVE(hammer_res_rb_tree, &hmp->rb_resv_root, resv); + kfree(resv, M_HAMMER); + --hammer_count_reservations; + } } /* @@ -531,7 +634,9 @@ void hammer_blockmap_free(hammer_transaction_t trans, hammer_off_t bmap_off, int bytes) { + hammer_mount_t hmp; hammer_volume_t root_volume; + hammer_reserve_t resv; hammer_blockmap_t rootmap; struct hammer_blockmap_layer1 *layer1; struct hammer_blockmap_layer2 *layer2; @@ -542,8 +647,10 @@ hammer_blockmap_free(hammer_transaction_t trans, int error; int zone; + hmp = trans->hmp; + if (bytes >= 0) { - bytes = (bytes + 7) & ~7; + bytes = (bytes + 15) & ~15; KKASSERT(bytes <= HAMMER_BUFSIZE); KKASSERT(((bmap_off ^ (bmap_off + (bytes - 1))) & ~HAMMER_LARGEBLOCK_MASK64) == 0); @@ -552,13 +659,13 @@ hammer_blockmap_free(hammer_transaction_t trans, } zone = HAMMER_ZONE_DECODE(bmap_off); KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES); - root_volume = hammer_get_root_volume(trans->hmp, &error); + root_volume = hammer_get_root_volume(hmp, &error); if (error) return; - lockmgr(&trans->hmp->blockmap_lock, LK_EXCLUSIVE|LK_RETRY); + lockmgr(&hmp->blockmap_lock, LK_EXCLUSIVE|LK_RETRY); - rootmap = &trans->hmp->blockmap[zone]; + rootmap = &hmp->blockmap[zone]; KKASSERT(rootmap->phys_offset != 0); KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) == HAMMER_ZONE_RAW_BUFFER_INDEX); @@ -575,7 +682,7 @@ hammer_blockmap_free(hammer_transaction_t trans, */ layer1_offset = rootmap->phys_offset + HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off); - layer1 = hammer_bread(trans->hmp, layer1_offset, &error, &buffer1); + layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1); KKASSERT(error == 0); KKASSERT(layer1->phys_offset); if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) { @@ -587,7 +694,7 @@ hammer_blockmap_free(hammer_transaction_t trans, */ layer2_offset = layer1->phys_offset + HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off); - layer2 = hammer_bread(trans->hmp, layer2_offset, &error, &buffer2); + layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2); KKASSERT(error == 0); KKASSERT(layer2->u.phys_offset); if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) { @@ -609,19 +716,30 @@ hammer_blockmap_free(hammer_transaction_t trans, * reference areas of that block and we cannot overwrite those areas. 
*/ if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) { - if ((rootmap->next_offset ^ bmap_off) & - ~HAMMER_LARGEBLOCK_MASK64) { + hammer_off_t base_off; + + base_off = bmap_off & ~HAMMER_LARGEBLOCK_MASK64; + resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, + base_off); + + if (resv) { + /* + * Portions of this block have been reserved, do + * not free it. + */ + } else if ((rootmap->next_offset ^ bmap_off) & + ~HAMMER_LARGEBLOCK_MASK64) { /* * Our iterator is not in the now-free big-block * and we can release it. */ - hammer_clean_holes(trans->hmp, - &trans->hmp->holes[zone], - bmap_off); + hammer_clean_holes(hmp, &trans->hmp->holes[zone], + base_off); + hammer_del_buffers(hmp, base_off, + layer2->u.phys_offset, + HAMMER_LARGEBLOCK_SIZE); hammer_freemap_free(trans, layer2->u.phys_offset, bmap_off, &error); - hammer_clrxlate_buffer(trans->hmp, - layer2->u.phys_offset); layer2->u.phys_offset = HAMMER_BLOCKMAP_FREE; hammer_modify_buffer(trans, buffer1, @@ -664,7 +782,7 @@ hammer_blockmap_free(hammer_transaction_t trans, layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE); hammer_modify_buffer_done(buffer2); done: - lockmgr(&trans->hmp->blockmap_lock, LK_RELEASE); + lockmgr(&hmp->blockmap_lock, LK_RELEASE); if (buffer1) hammer_rel_buffer(buffer1, 0); @@ -854,7 +972,12 @@ hammer_free_holes(hammer_mount_t hmp, hammer_holes_t holes) while ((hole = TAILQ_FIRST(&holes->list)) != NULL) { TAILQ_REMOVE(&holes->list, hole, entry); + if (hole->resv) { + hammer_blockmap_reserve_complete(hmp, hole->resv); + hole->resv = NULL; + } kfree(hole, M_HAMMER); + --holes->count; } } @@ -870,8 +993,8 @@ hammer_find_hole(hammer_mount_t hmp, hammer_holes_t holes, int bytes) TAILQ_FOREACH(hole, &holes->list, entry) { if (bytes <= hole->bytes) { - result_off = hole->offset; - hole->offset += bytes; + result_off = hole->zone_offset; + hole->zone_offset += bytes; hole->bytes -= bytes; break; } @@ -888,23 +1011,51 @@ hammer_find_hole(hammer_mount_t hmp, hammer_holes_t holes, int bytes) */ static void hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes, - hammer_off_t offset, int bytes) + hammer_off_t zone_offset, int bytes) { hammer_hole_t hole; + hammer_reserve_t resv; if (bytes <= 128) return; + /* + * Allocate or reuse a hole structure + */ if (holes->count < HAMMER_MAX_HOLES) { hole = kmalloc(sizeof(*hole), M_HAMMER, M_WAITOK); ++holes->count; } else { hole = TAILQ_FIRST(&holes->list); TAILQ_REMOVE(&holes->list, hole, entry); + if (hole->resv) { + hammer_blockmap_reserve_complete(hmp, hole->resv); + hole->resv = NULL; + } } - TAILQ_INSERT_TAIL(&holes->list, hole, entry); - hole->offset = offset; + + /* + * Associate the structure with the appropriate reservation so the + * bigblock does not get freed or reused while we have cached holes, + * and install. 
+ */ + hole->zone_offset = zone_offset; hole->bytes = bytes; + + zone_offset &= ~HAMMER_LARGEBLOCK_MASK64; + + resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, zone_offset); + if (resv == NULL) { + resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO); + resv->zone_offset = zone_offset; + resv->refs = 1; + RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv); + ++hammer_count_reservations; + } else { + ++resv->refs; + } + hole->resv = resv; + TAILQ_INSERT_TAIL(&holes->list, hole, entry); } /* @@ -913,16 +1064,21 @@ hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes, */ static void hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes, - hammer_off_t offset) + hammer_off_t base_offset) { hammer_hole_t hole; - offset &= ~HAMMER_LARGEBLOCK_MASK64; - restart: TAILQ_FOREACH(hole, &holes->list, entry) { - if ((hole->offset & ~HAMMER_LARGEBLOCK_MASK64) == offset) { + if ((hole->zone_offset & ~HAMMER_LARGEBLOCK_MASK64) == + base_offset) { TAILQ_REMOVE(&holes->list, hole, entry); + if (hole->resv) { + hammer_blockmap_reserve_complete(hmp, + hole->resv); + hole->resv = NULL; + } + --holes->count; kfree(hole, M_HAMMER); goto restart; } diff --git a/sys/vfs/hammer/hammer_freemap.c b/sys/vfs/hammer/hammer_freemap.c index 5b268626d8..bf89d4a592 100644 --- a/sys/vfs/hammer/hammer_freemap.c +++ b/sys/vfs/hammer/hammer_freemap.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_freemap.c,v 1.13 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_freemap.c,v 1.14 2008/06/08 18:16:26 dillon Exp $ */ /* @@ -162,7 +162,6 @@ hammer_freemap_free(hammer_transaction_t trans, hammer_off_t phys_offset, KKASSERT((phys_offset & HAMMER_LARGEBLOCK_MASK64) == 0); - hammer_uncache_buffer(trans->hmp, phys_offset); *errorp = 0; ondisk = trans->rootvol->ondisk; diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 9e7be5d6f0..41a99b5be5 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.65 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.66 2008/06/08 18:16:26 dillon Exp $ */ #include "hammer.h" @@ -44,6 +44,10 @@ static void hammer_flush_inode_core(hammer_inode_t ip, int flags); static int hammer_setup_child_callback(hammer_record_t rec, void *data); static int hammer_setup_parent_inodes(hammer_record_t record); +#ifdef DEBUG_TRUNCATE +extern struct hammer_inode *HammerTruncIp; +#endif + /* * The kernel is not actively referencing this vnode but is still holding * it cached. 
@@ -227,8 +231,6 @@ loop: ip->flags |= HAMMER_INODE_RO; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; RB_INIT(&ip->rec_tree); - TAILQ_INIT(&ip->bio_list); - TAILQ_INIT(&ip->bio_alt_list); TAILQ_INIT(&ip->target_list); /* @@ -340,8 +342,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; RB_INIT(&ip->rec_tree); - TAILQ_INIT(&ip->bio_list); - TAILQ_INIT(&ip->bio_alt_list); TAILQ_INIT(&ip->target_list); ip->ino_leaf.atime = trans->time; @@ -648,7 +648,12 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) KKASSERT(ip->vp == NULL); hammer_inode_unloadable_check(ip, 0); if (ip->flags & HAMMER_INODE_MODMASK) { - hammer_flush_inode(ip, 0); + if (hmp->rsv_inodes > desiredvnodes) { + hammer_flush_inode(ip, + HAMMER_FLUSH_SIGNAL); + } else { + hammer_flush_inode(ip, 0); + } } else if (ip->lock.refs == 1) { hammer_unload_inode(ip); break; @@ -668,17 +673,6 @@ hammer_rel_inode(struct hammer_inode *ip, int flush) } } } - - /* - * XXX bad hack until I add code to track inodes in SETUP. We - * can queue a lot of inodes to the syncer but if we don't wake - * it up the undo sets will be too large or too many unflushed - * records will build up and blow our malloc limit. - */ - if (++hmp->reclaim_count > 256) { - hmp->reclaim_count = 0; - hammer_flusher_async(hmp); - } } /* @@ -700,8 +694,6 @@ hammer_unload_inode(struct hammer_inode *ip) KKASSERT(RB_EMPTY(&ip->rec_tree)); KKASSERT(TAILQ_EMPTY(&ip->target_list)); - KKASSERT(TAILQ_EMPTY(&ip->bio_list)); - KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list)); RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); @@ -1043,6 +1035,10 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags) ip->sync_ino_data = ip->ino_data; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; ip->flags &= ~HAMMER_INODE_MODMASK; +#ifdef DEBUG_TRUNCATE + if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp) + kprintf("truncateS %016llx\n", ip->sync_trunc_off); +#endif /* * The flusher list inherits our inode and reference. @@ -1051,8 +1047,9 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags) if (--ip->hmp->flusher_lock == 0) wakeup(&ip->hmp->flusher_lock); - if (flags & HAMMER_FLUSH_SIGNAL) + if (flags & HAMMER_FLUSH_SIGNAL) { hammer_flusher_async(ip->hmp); + } } /* @@ -1174,8 +1171,12 @@ void hammer_wait_inode(hammer_inode_t ip) { while (ip->flush_state != HAMMER_FST_IDLE) { - ip->flags |= HAMMER_INODE_FLUSHW; - tsleep(&ip->flags, 0, "hmrwin", 0); + if (ip->flush_state == HAMMER_FST_SETUP) { + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + } else { + ip->flags |= HAMMER_INODE_FLUSHW; + tsleep(&ip->flags, 0, "hmrwin", 0); + } } } @@ -1189,7 +1190,6 @@ hammer_wait_inode(hammer_inode_t ip) void hammer_flush_inode_done(hammer_inode_t ip) { - struct bio *bio; int dorel = 0; KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); @@ -1206,20 +1206,11 @@ hammer_flush_inode_done(hammer_inode_t ip) if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks) ip->flags |= HAMMER_INODE_DDIRTY; - /* - * Reflush any BIOs that wound up in the alt list. Our inode will - * also wind up at the end of the flusher's list. - */ - while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) { - TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act); - TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); - } /* * Fix up the dirty buffer status. IO completions will also * try to clean up rsv_databufs. 
*/ - if (TAILQ_FIRST(&ip->bio_list) || - (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree))) { + if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) { ip->flags |= HAMMER_INODE_BUFS; } else { ip->hmp->rsv_databufs -= ip->rsv_databufs; @@ -1230,8 +1221,10 @@ hammer_flush_inode_done(hammer_inode_t ip) * Re-set the XDIRTY flag if some of the inode's in-memory records * could not be flushed. */ - if (RB_ROOT(&ip->rec_tree)) - ip->flags |= HAMMER_INODE_XDIRTY; + KKASSERT((RB_EMPTY(&ip->rec_tree) && + (ip->flags & HAMMER_INODE_XDIRTY) == 0) || + (!RB_EMPTY(&ip->rec_tree) && + (ip->flags & HAMMER_INODE_XDIRTY) != 0)); /* * Do not lose track of inodes which no longer have vnode @@ -1438,8 +1431,6 @@ hammer_sync_inode(hammer_inode_t ip) { struct hammer_transaction trans; struct hammer_cursor cursor; - struct buf *bp; - struct bio *bio; hammer_record_t depend; hammer_record_t next; int error, tmp_error; @@ -1602,19 +1593,12 @@ hammer_sync_inode(hammer_inode_t ip) * if records remain. */ if (error == 0) { - int base_btree_iterations = hammer_stats_btree_iterations; - int base_record_iterations = hammer_stats_record_iterations; tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, hammer_sync_record_callback, &cursor); -#if 0 - kprintf("(%d,%d)", hammer_stats_record_iterations - base_record_iterations, hammer_stats_btree_iterations - base_btree_iterations); -#endif if (tmp_error < 0) tmp_error = -error; if (tmp_error) error = tmp_error; - if (RB_EMPTY(&ip->rec_tree)) - ip->sync_flags &= ~HAMMER_INODE_XDIRTY; } /* @@ -1658,27 +1642,6 @@ hammer_sync_inode(hammer_inode_t ip) } } - /* - * Flush any queued BIOs. These will just biodone() the IO's if - * the inode has been deleted. - */ - while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) { - TAILQ_REMOVE(&ip->bio_list, bio, bio_act); - bp = bio->bio_buf; - tmp_error = hammer_dowrite(&cursor, ip, bio->bio_offset, - bp->b_data, bp->b_bufsize); - if (tmp_error) { - bp->b_resid = bio->bio_buf->b_bufsize; - bp->b_error = error; - bp->b_flags |= B_ERROR; - error = tmp_error; - } else { - bp->b_resid = 0; - } - biodone(bio); - --hammer_bio_count; - hammer_cleanup_write_io(ip); - } ip->sync_flags &= ~HAMMER_INODE_BUFS; if (error) @@ -1782,7 +1745,6 @@ void hammer_inode_unloadable_check(hammer_inode_t ip, int getvp) { struct vnode *vp; - struct bio *bio; /* * Set the DELETING flag when the link count drops to 0 and the @@ -1803,33 +1765,6 @@ hammer_inode_unloadable_check(hammer_inode_t ip, int getvp) return; } - /* - * biodone any buffers with pending IO. These buffers are - * holding a BUF_KERNPROC() exclusive lock and our - * vtruncbuf() call will deadlock if any remain. - * - * (interlocked against hammer_vop_strategy_write via - * HAMMER_INODE_DELETING|HAMMER_INODE_DELETED). - */ - while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) { - TAILQ_REMOVE(&ip->bio_list, bio, bio_act); - bio->bio_buf->b_resid = 0; - biodone(bio); - if (ip->rsv_databufs) { - --ip->rsv_databufs; - --ip->hmp->rsv_databufs; - } - } - while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) { - TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act); - bio->bio_buf->b_resid = 0; - biodone(bio); - if (ip->rsv_databufs) { - --ip->rsv_databufs; - --ip->hmp->rsv_databufs; - } - } - /* * Final cleanup */ diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 1f49a54e95..ea2d3622e9 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.34 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.35 2008/06/08 18:16:26 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -66,45 +66,6 @@ hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type) io->type = type; } -void -hammer_io_reinit(hammer_io_t io, enum hammer_io_type type) -{ - hammer_mount_t hmp = io->hmp; - - if (io->modified) { - KKASSERT(io->mod_list != NULL); - if (io->mod_list == &hmp->volu_list || - io->mod_list == &hmp->meta_list) { - --hmp->locked_dirty_count; - --hammer_count_dirtybufs; - } - TAILQ_REMOVE(io->mod_list, io, mod_entry); - io->mod_list = NULL; - } - io->type = type; - if (io->modified) { - switch(io->type) { - case HAMMER_STRUCTURE_VOLUME: - io->mod_list = &hmp->volu_list; - ++hmp->locked_dirty_count; - ++hammer_count_dirtybufs; - break; - case HAMMER_STRUCTURE_META_BUFFER: - io->mod_list = &hmp->meta_list; - ++hmp->locked_dirty_count; - ++hammer_count_dirtybufs; - break; - case HAMMER_STRUCTURE_UNDO_BUFFER: - io->mod_list = &hmp->undo_list; - break; - case HAMMER_STRUCTURE_DATA_BUFFER: - io->mod_list = &hmp->data_list; - break; - } - TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry); - } -} - /* * Helper routine to disassociate a buffer cache buffer from an I/O * structure. Called with the io structure exclusively locked. @@ -258,8 +219,12 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) (zone2_offset & HAMMER_OFF_SHORT_MASK); if (findblk(volume->devvp, phys_offset)) { bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0); - bp->b_flags |= B_INVAL; - brelse(bp); + if (LIST_FIRST(&bp->b_dep) != NULL) { + hammer_io_deallocate(bp); + } else { + bp->b_flags |= B_INVAL; + brelse(bp); + } } } @@ -812,7 +777,6 @@ struct bio_ops hammer_bioops = { /* * Read a buffer associated with a front-end vnode directly from the * disk media. The bio may be issued asynchronously. 
- * */ int hammer_io_direct_read(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf, @@ -826,6 +790,7 @@ hammer_io_direct_read(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf, int error; KKASSERT(leaf->data_offset >= HAMMER_ZONE_BTREE); + KKASSERT((leaf->data_offset & HAMMER_BUFMASK) == 0); zone2_offset = hammer_blockmap_lookup(hmp, leaf->data_offset, &error); if (error == 0) { vol_no = HAMMER_VOL_DECODE(zone2_offset); @@ -833,13 +798,10 @@ hammer_io_direct_read(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf, if (error == 0 && zone2_offset >= volume->maxbuf_off) error = EIO; if (error == 0) { + zone2_offset &= HAMMER_OFF_SHORT_MASK; nbio = push_bio(bio); nbio->bio_offset = volume->ondisk->vol_buf_beg + - (zone2_offset & HAMMER_OFF_SHORT_MASK); -#if 0 - kprintf("direct_read %016llx %016llx %016llx\n", - bio->bio_offset, nbio->bio_offset, leaf->data_offset); -#endif + zone2_offset; vn_strategy(volume->devvp, nbio); } hammer_rel_volume(volume, 0); @@ -861,35 +823,62 @@ int hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf, struct bio *bio) { + hammer_off_t buf_offset; hammer_off_t zone2_offset; hammer_volume_t volume; + hammer_buffer_t buffer; struct buf *bp; struct bio *nbio; + char *ptr; int vol_no; int error; - KKASSERT(leaf->data_offset >= HAMMER_ZONE_BTREE); - zone2_offset = hammer_blockmap_lookup(hmp, leaf->data_offset, &error); + buf_offset = leaf->data_offset; + + KKASSERT(buf_offset > HAMMER_ZONE_BTREE); KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE); - if (error == 0) { + if ((buf_offset & HAMMER_BUFMASK) == 0 && + leaf->data_len == HAMMER_BUFSIZE) { + /* + * We are using the vnode's bio to write directly to the + * media, any hammer_buffer at the same zone-X offset will + * now have stale data. + */ + zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error); vol_no = HAMMER_VOL_DECODE(zone2_offset); volume = hammer_get_volume(hmp, vol_no, &error); if (error == 0 && zone2_offset >= volume->maxbuf_off) error = EIO; if (error == 0) { + hammer_del_buffers(hmp, buf_offset, + zone2_offset, HAMMER_BUFSIZE); + bp = bio->bio_buf; + KKASSERT(bp->b_bufsize == HAMMER_BUFSIZE); + zone2_offset &= HAMMER_OFF_SHORT_MASK; + nbio = push_bio(bio); nbio->bio_offset = volume->ondisk->vol_buf_beg + - (zone2_offset & HAMMER_OFF_SHORT_MASK); -#if 0 - kprintf("direct_write %016llx %016llx %016llx\n", - bio->bio_offset, nbio->bio_offset, - leaf->data_offset); -#endif + zone2_offset; vn_strategy(volume->devvp, nbio); + kprintf("x"); } hammer_rel_volume(volume, 0); + } else { + KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0); + buffer = NULL; + ptr = hammer_bread(hmp, buf_offset, &error, &buffer); + if (error == 0) { + kprintf("y"); + bp = bio->bio_buf; + hammer_io_modify(&buffer->io, 1); + bcopy(bp->b_data, ptr, leaf->data_len); + hammer_io_modify_done(&buffer->io); + hammer_rel_buffer(buffer, 0); + bp->b_resid = 0; + biodone(bio); + } } if (error) { bp = bio->bio_buf; diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 9afef572db..72b0acd881 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.61 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.62 2008/06/08 18:16:26 dillon Exp $ */ #include "hammer.h" @@ -39,6 +39,8 @@ static int hammer_mem_add(hammer_record_t record); static int hammer_mem_lookup(hammer_cursor_t cursor); static int hammer_mem_first(hammer_cursor_t cursor); +static int hammer_rec_trunc_callback(hammer_record_t record, + void *data __unused); struct rec_trunc_info { u_int16_t rec_type; @@ -46,7 +48,7 @@ struct rec_trunc_info { }; /* - * Red-black tree support. + * Red-black tree support. Comparison code for insertion. */ static int hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) @@ -85,29 +87,74 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) return(0); } +/* + * Basic record comparison code similar to hammer_btree_cmp(). + */ static int -hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec) +hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) { - if (info->rec_type < rec->leaf.base.rec_type) + if (elm->rec_type < rec->leaf.base.rec_type) return(-3); - if (info->rec_type > rec->leaf.base.rec_type) + if (elm->rec_type > rec->leaf.base.rec_type) return(3); - if (info->key < rec->leaf.base.key) + if (elm->key < rec->leaf.base.key) return(-2); - if (info->key > rec->leaf.base.key) + if (elm->key > rec->leaf.base.key) return(2); - if (info->create_tid == 0) { + if (elm->create_tid == 0) { if (rec->leaf.base.create_tid == 0) return(0); return(1); } if (rec->leaf.base.create_tid == 0) return(-1); - if (info->create_tid < rec->leaf.base.create_tid) + if (elm->create_tid < rec->leaf.base.create_tid) return(-1); - if (info->create_tid > rec->leaf.base.create_tid) + if (elm->create_tid > rec->leaf.base.create_tid) + return(1); + return(0); +} + +/* + * Special LOOKUP_INFO to locate an overlapping record. This used by + * the reservation code to implement small-block records (whos keys will + * be different depending on data_len, when representing the same base + * offset). + * + * NOTE: The base file offset of a data record is (key - data_len), not (key). + */ +static int +hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec) +{ + if (leaf->base.rec_type < rec->leaf.base.rec_type) + return(-3); + if (leaf->base.rec_type > rec->leaf.base.rec_type) + return(3); + + if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { + if (leaf->base.key <= rec->leaf.base.key - rec->leaf.data_len) + return(-2); + if (leaf->base.key - leaf->data_len >= rec->leaf.base.key) + return(2); + } else { + if (leaf->base.key < rec->leaf.base.key) + return(-2); + if (leaf->base.key > rec->leaf.base.key) + return(2); + } + + if (leaf->base.create_tid == 0) { + if (rec->leaf.base.create_tid == 0) + return(0); + return(1); + } + if (rec->leaf.base.create_tid == 0) + return(-1); + if (leaf->base.create_tid < rec->leaf.base.create_tid) + return(-1); + if (leaf->base.create_tid > rec->leaf.base.create_tid) return(1); return(0); } @@ -117,7 +164,7 @@ hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec) * is reversed so the comparison result has to be negated. key_beg and * key_end are both range-inclusive. * - * The creation timestamp can cause hammer_rec_compare() to return -1 or +1. + * The creation timestamp can cause hammer_rec_cmp() to return -1 or +1. * These do not stop the scan. * * Localized deletions are not cached in-memory. 
@@ -129,10 +176,10 @@ hammer_rec_scan_cmp(hammer_record_t rec, void *data) hammer_cursor_t cursor = data; int r; - r = hammer_rec_compare(&cursor->key_beg, rec); + r = hammer_rec_cmp(&cursor->key_beg, rec); if (r > 1) return(-1); - r = hammer_rec_compare(&cursor->key_end, rec); + r = hammer_rec_cmp(&cursor->key_end, rec); if (r < -1) return(1); return(0); @@ -148,7 +195,7 @@ hammer_rec_find_cmp(hammer_record_t rec, void *data) hammer_cursor_t cursor = data; int r; - r = hammer_rec_compare(&cursor->key_beg, rec); + r = hammer_rec_cmp(&cursor->key_beg, rec); if (r > 1) return(-1); if (r < -1) @@ -199,7 +246,7 @@ hammer_rec_trunc_cmp(hammer_record_t rec, void *data) RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node, - hammer_rec_compare, hammer_base_elm_t); + hammer_rec_overlap_compare, hammer_btree_leaf_elm_t); /* * Allocate a record for the caller to finish filling in. The record is @@ -297,11 +344,26 @@ hammer_rel_mem_record(struct hammer_record *record) hammer_unref(&record->lock); - if (record->flags & HAMMER_RECF_DELETED_FE) { - if (record->lock.refs == 0) { + if (record->lock.refs == 0) { + /* + * Upon release of the last reference wakeup any waiters. + * The record structure may get destroyed so callers will + * loop up and do a relookup. + */ + ip = record->ip; + if (record->flags & HAMMER_RECF_WANTIDLE) { + record->flags &= ~HAMMER_RECF_WANTIDLE; + ++ip->idle_wakeup; + wakeup(&ip->idle_wakeup); + } + + /* + * Upon release of the last reference a record marked deleted + * is destroyed. + */ + if (record->flags & HAMMER_RECF_DELETED_FE) { KKASSERT(record->flush_state != HAMMER_FST_FLUSH); - ip = record->ip; if ((target_ip = record->target_ip) != NULL) { TAILQ_REMOVE(&target_ip->target_list, record, target_entry); @@ -320,6 +382,7 @@ hammer_rel_mem_record(struct hammer_record *record) record->flags &= ~HAMMER_RECF_ONRBTREE; if (RB_EMPTY(&record->ip->rec_tree)) { record->ip->flags &= ~HAMMER_INODE_XDIRTY; + record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY; hammer_test_inode(record->ip); } } @@ -328,12 +391,14 @@ hammer_rel_mem_record(struct hammer_record *record) kfree(record->data, M_HAMMER); record->flags &= ~HAMMER_RECF_ALLOCDATA; } + if (record->resv) { + hammer_blockmap_reserve_complete(ip->hmp, + record->resv); + record->resv = NULL; + } record->data = NULL; --hammer_count_records; - if (record->type == HAMMER_MEM_RECORD_DATA) - hammer_cleanup_write_io(record->ip); kfree(record, M_HAMMER); - return; } } } @@ -687,19 +752,20 @@ static hammer_record_t hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes) { hammer_record_t record; - struct hammer_base_elm elm; - - bzero(&elm, sizeof(elm)); - elm.obj_id = ip->obj_id; - elm.key = file_offset + bytes; - elm.create_tid = 0; - elm.delete_tid = 0; - elm.rec_type = HAMMER_RECTYPE_DATA; - elm.obj_type = 0; /* unused */ - elm.btype = HAMMER_BTREE_TYPE_RECORD; /* unused */ - elm.localization = HAMMER_LOCALIZE_MISC; - - record = hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &elm); + struct hammer_btree_leaf_elm leaf; + + bzero(&leaf, sizeof(leaf)); + leaf.base.obj_id = ip->obj_id; + leaf.base.key = file_offset + bytes; + leaf.base.create_tid = 0; + leaf.base.delete_tid = 0; + leaf.base.rec_type = HAMMER_RECTYPE_DATA; + leaf.base.obj_type = 0; /* unused */ + leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; /* unused */ + leaf.base.localization = HAMMER_LOCALIZE_MISC; + leaf.data_len = bytes; + + record = 
hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &leaf); if (record) hammer_ref(&record->lock); return(record); @@ -712,60 +778,53 @@ hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes) * flush a buffer cache buffer. */ hammer_record_t -hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, - void *data, int bytes, int *force_altp) +hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, + int *errorp) { hammer_record_t record; hammer_record_t conflict; - int error; + int zone; + int save_wakeup; /* - * If the record already exists just return it. If it exists but - * is being flushed we can't reuse the conflict record and we can't - * create a new one (unlike directories data records have no iterator - * so we would be creating a duplicate). In that case return NULL - * to force the front-end to queue the buffer. + * Deal with conflicting in-memory records. * - * This is kinda messy. We can't have an in-memory record AND its - * buffer cache buffer queued to the same flush cycle at the same - * time as that would result in a [delete-]create-delete-create - * sequence with the same transaction id. Set *force_altp to 1 - * to deal with the situation. + * We must wait for the record to become idle so we can ensure + * its deletion. */ - *force_altp = 0; - conflict = hammer_ip_get_bulk(ip, file_offset, bytes); - if (conflict) { - /* - * We can't reuse the record if it is owned by the backend - * or has been deleted. - */ - if (conflict->flush_state == HAMMER_FST_FLUSH) { + while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) { + if (conflict->lock.refs != 1) { + conflict->flags |= HAMMER_RECF_WANTIDLE; + save_wakeup = ip->idle_wakeup; hammer_rel_mem_record(conflict); - *force_altp = 1; - kprintf("a"); - return(NULL); - } - if (conflict->flags & HAMMER_RECF_DELETED_FE) { + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + if (save_wakeup == ip->idle_wakeup) + tsleep(&ip->idle_wakeup, 0, "hmrrc3", 0); + } else { + /* flush state adds a ref, shouldn't be posible */ + KKASSERT(conflict->flush_state != HAMMER_FST_FLUSH); + conflict->flags |= HAMMER_RECF_DELETED_FE; hammer_rel_mem_record(conflict); - *force_altp = 1; - kprintf("b"); - return(NULL); } - KKASSERT(conflict->leaf.data_len == bytes); - conflict->leaf.data_crc = crc32(data, bytes); - - /* reusing conflict, remove extra rsv stats */ - hammer_cleanup_write_io(ip); - return(conflict); } /* - * Otherwise create it. This is called with the related BIO locked - * so there should be no possible conflict. + * Create a record to cover the direct write. This is called with + * the related BIO locked so there should be no possible conflict. + * + * The backend is responsible for finalizing the space reserved in + * this record. + * + * XXX bytes not aligned, depend on the reservation code to + * align the reservation. */ record = hammer_alloc_mem_record(ip, 0); - record->leaf.data_offset = hammer_blockmap_reserve(ip->hmp, HAMMER_ZONE_LARGE_DATA_INDEX, bytes, &error); - if (record->leaf.data_offset == 0) { + zone = (bytes >= HAMMER_BUFSIZE) ? 
HAMMER_ZONE_LARGE_DATA_INDEX : + HAMMER_ZONE_SMALL_DATA_INDEX; + record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, + &record->leaf.data_offset, + errorp); + if (record->resv == NULL) { hammer_rel_mem_record(record); return(NULL); } @@ -779,8 +838,8 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, record->leaf.data_crc = crc32(data, bytes); hammer_ref(&record->lock); /* mem_add eats a reference */ - error = hammer_mem_add(record); - KKASSERT(error == 0); + *errorp = hammer_mem_add(record); + return (record); } @@ -791,20 +850,6 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, * * Partial blocks are not deleted. */ -static int -hammer_rec_trunc_callback(hammer_record_t record, void *data __unused) -{ - if (record->flags & HAMMER_RECF_DELETED_FE) - return(0); - if (record->flush_state == HAMMER_FST_FLUSH) - return(0); - KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); - hammer_ref(&record->lock); - record->flags |= HAMMER_RECF_DELETED_FE; - hammer_rel_mem_record(record); - return(0); -} - int hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) { @@ -822,10 +867,25 @@ hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) } info.trunc_off = file_size; hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp, - hammer_rec_trunc_callback, &file_size); + hammer_rec_trunc_callback, &info); return(0); } +static int +hammer_rec_trunc_callback(hammer_record_t record, void *data __unused) +{ + if (record->flags & HAMMER_RECF_DELETED_FE) + return(0); + if (record->flush_state == HAMMER_FST_FLUSH) + return(0); + KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); + hammer_ref(&record->lock); + record->flags |= HAMMER_RECF_DELETED_FE; + hammer_rel_mem_record(record); + return(0); +} + + /* * Backend code * @@ -859,7 +919,7 @@ hammer_ip_sync_data(hammer_cursor_t cursor, hammer_inode_t ip, * align data allocations to 64-byte boundaries for future * expansion. */ - aligned_bytes = (bytes + 63) & ~63; + aligned_bytes = (bytes + 15) & ~15; retry: hammer_normalize_cursor(cursor); cursor->key_beg.localization = HAMMER_LOCALIZE_MISC; @@ -916,7 +976,6 @@ retry: elm.atime = 0; elm.data_offset = data_offset; elm.data_len = aligned_bytes; - elm.data_crc = crc32(data, aligned_bytes); /* * Copy the data to the allocated buffer. Since we are aligning @@ -928,6 +987,7 @@ retry: if (aligned_bytes > bytes) bzero((char *)bdata + bytes, aligned_bytes - bytes); hammer_modify_buffer_done(cursor->data_buffer); + elm.data_crc = crc32(bdata, aligned_bytes); /* * Data records can wind up on-disk before the inode itself is @@ -1048,10 +1108,11 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) * It is ok for the lookup to return ENOENT. */ if (record->type == HAMMER_MEM_RECORD_DATA) { + KKASSERT(((record->leaf.base.key - record->leaf.data_len) & HAMMER_BUFMASK) == 0); error = hammer_ip_delete_range( cursor, record->ip, record->leaf.base.key - record->leaf.data_len, - record->leaf.base.key - 1, 1); + HAMMER_BUFSIZE - 1, 1); if (error && error != ENOENT) goto done; } @@ -1477,10 +1538,9 @@ next_memory: * Special case. If the entries only differ by their * create_tid, assume they are equal and fall through. * - * This case can occur for memory-data records because - * their initial create_tid is 0 (infinity). + * This case can occur for memory-data records. 
XXX */ - if (r == -1) + if (r == -1 || r == 1) r = 0; if (r < 0) { error = hammer_btree_extract(cursor, diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 2cd4fe5fb0..c8e5729693 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.50 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.51 2008/06/08 18:16:26 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -94,9 +94,9 @@ hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2) static int hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2) { - if (buf1->zone2_offset < buf2->zone2_offset) + if (buf1->zoneX_offset < buf2->zoneX_offset) return(-1); - if (buf1->zone2_offset > buf2->zone2_offset) + if (buf1->zoneX_offset > buf2->zoneX_offset) return(1); return(0); } @@ -122,7 +122,7 @@ RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node, RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node, hammer_vol_rb_compare, int32_t, vol_no); RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node, - hammer_buf_rb_compare, hammer_off_t, zone2_offset); + hammer_buf_rb_compare, hammer_off_t, zoneX_offset); RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node, hammer_nod_rb_compare, hammer_off_t, node_offset); @@ -216,7 +216,6 @@ hammer_install_volume(struct hammer_mount *hmp, const char *volname) volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no, ondisk->vol_buf_end - ondisk->vol_buf_beg); volume->maxraw_off = ondisk->vol_buf_end; - RB_INIT(&volume->rb_bufs_root); if (RB_EMPTY(&hmp->rb_vols_root)) { hmp->fsid = ondisk->vol_fsid; @@ -305,12 +304,6 @@ hammer_unload_volume(hammer_volume_t volume, void *data __unused) if (hmp->rootvol == volume) hmp->rootvol = NULL; - /* - * Unload buffers. - */ - RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL, - hammer_unload_buffer, NULL); - /* * Release our buffer and flush anything left in the buffer cache. */ @@ -322,7 +315,6 @@ hammer_unload_volume(hammer_volume_t volume, void *data __unused) * no super-clusters. */ KKASSERT(volume->io.lock.refs == 0); - KKASSERT(RB_EMPTY(&volume->rb_bufs_root)); volume->ondisk = NULL; if (volume->devvp) { @@ -506,17 +498,43 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, { hammer_buffer_t buffer; hammer_volume_t volume; - hammer_off_t zoneX_offset; + hammer_off_t zone2_offset; hammer_io_type_t iotype; int vol_no; int zone; - zoneX_offset = buf_offset; - zone = HAMMER_ZONE_DECODE(buf_offset); +again: + /* + * Shortcut if the buffer is already cached + */ + buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, + buf_offset & ~HAMMER_BUFMASK64); + if (buffer) { + hammer_ref(&buffer->io.lock); + if (buffer->ondisk && buffer->io.loading == 0) { + *errorp = 0; + return(buffer); + } + + /* + * The buffer is no longer loose if it has a ref. Loose + * buffers will never be in a modified state. This should + * only occur on the 0->1 transition of refs. + */ + if (buffer->io.mod_list == &hmp->lose_list) { + TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, + mod_entry); + buffer->io.mod_list = NULL; + KKASSERT(buffer->io.modified == 0); + } + goto found; + } /* * What is the buffer class? 
*/ + zone = HAMMER_ZONE_DECODE(buf_offset); + switch(zone) { case HAMMER_ZONE_LARGE_DATA_INDEX: case HAMMER_ZONE_SMALL_DATA_INDEX: @@ -534,83 +552,61 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset, * Handle blockmap offset translations */ if (zone >= HAMMER_ZONE_BTREE_INDEX) { - buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp); - KKASSERT(*errorp == 0); + zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp); } else if (zone == HAMMER_ZONE_UNDO_INDEX) { - buf_offset = hammer_undo_lookup(hmp, buf_offset, errorp); - KKASSERT(*errorp == 0); + zone2_offset = hammer_undo_lookup(hmp, buf_offset, errorp); + } else { + KKASSERT(zone == HAMMER_ZONE_RAW_BUFFER_INDEX); + zone2_offset = buf_offset; + *errorp = 0; } + if (*errorp) + return(NULL); /* - * Locate the buffer given its zone-2 offset. + * Calculate the base zone2-offset and acquire the volume + * + * NOTE: zone2_offset and maxbuf_off are both full zone-2 offset + * specifications. */ - buf_offset &= ~HAMMER_BUFMASK64; - KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER); - vol_no = HAMMER_VOL_DECODE(buf_offset); + zone2_offset &= ~HAMMER_BUFMASK64; + KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) == + HAMMER_ZONE_RAW_BUFFER); + vol_no = HAMMER_VOL_DECODE(zone2_offset); volume = hammer_get_volume(hmp, vol_no, errorp); if (volume == NULL) return(NULL); - /* - * NOTE: buf_offset and maxbuf_off are both full zone-2 offset - * specifications. - */ - KKASSERT(buf_offset < volume->maxbuf_off); + KKASSERT(zone2_offset < volume->maxbuf_off); /* - * Locate and lock the buffer structure, creating one if necessary. + * Allocate a new buffer structure. We will check for races later. */ -again: - buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root, - buf_offset); - if (buffer == NULL) { - ++hammer_count_buffers; - buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO); - buffer->zone2_offset = buf_offset; - buffer->volume = volume; - - hammer_io_init(&buffer->io, hmp, iotype); - buffer->io.offset = volume->ondisk->vol_buf_beg + - (buf_offset & HAMMER_OFF_SHORT_MASK); - TAILQ_INIT(&buffer->clist); - hammer_ref(&buffer->io.lock); - - /* - * Insert the buffer into the RB tree and handle late - * collisions. - */ - if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) { - hammer_unref(&buffer->io.lock); - --hammer_count_buffers; - kfree(buffer, M_HAMMER); - goto again; - } - hammer_ref(&volume->io.lock); - } else { - hammer_ref(&buffer->io.lock); - - /* - * The buffer is no longer loose if it has a ref. - */ - if (buffer->io.mod_list == &hmp->lose_list) { - TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, - mod_entry); - buffer->io.mod_list = NULL; - } - if (buffer->io.lock.refs == 1) - hammer_io_reinit(&buffer->io, iotype); - else - KKASSERT(buffer->io.type == iotype); - } + ++hammer_count_buffers; + buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO); + buffer->zone2_offset = zone2_offset; + buffer->zoneX_offset = buf_offset; + buffer->volume = volume; + + hammer_io_init(&buffer->io, hmp, iotype); + buffer->io.offset = volume->ondisk->vol_buf_beg + + (zone2_offset & HAMMER_OFF_SHORT_MASK); + TAILQ_INIT(&buffer->clist); + hammer_ref(&buffer->io.lock); /* - * Cache the blockmap translation + * Insert the buffer into the RB tree and handle late collisions. 
*/ - if ((zoneX_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_RAW_BUFFER) - buffer->zoneX_offset = zoneX_offset; + if (RB_INSERT(hammer_buf_rb_tree, &hmp->rb_bufs_root, buffer)) { + hammer_unref(&buffer->io.lock); + --hammer_count_buffers; + kfree(buffer, M_HAMMER); + goto again; + } +found: /* - * Deal with on-disk info + * Deal with on-disk info and loading races. */ if (buffer->ondisk == NULL || buffer->io.loading) { *errorp = hammer_load_buffer(buffer, isnew); @@ -621,31 +617,42 @@ again: } else { *errorp = 0; } - hammer_rel_volume(volume, 0); return(buffer); } /* - * Clear the cached zone-X translation for a buffer. + * Destroy all buffers covering the specified zoneX offset range. This + * is called when the related blockmap layer2 entry is freed. The buffers + * must not be in use or modified. */ void -hammer_clrxlate_buffer(hammer_mount_t hmp, hammer_off_t buf_offset) +hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset, + hammer_off_t zone2_offset, int bytes) { hammer_buffer_t buffer; hammer_volume_t volume; int vol_no; int error; - buf_offset &= ~HAMMER_BUFMASK64; - KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER); - vol_no = HAMMER_VOL_DECODE(buf_offset); + vol_no = HAMMER_VOL_DECODE(zone2_offset); volume = hammer_get_volume(hmp, vol_no, &error); - if (volume == NULL) - return; - buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root, - buf_offset); - if (buffer) - buffer->zoneX_offset = 0; + KKASSERT(error == 0); + + while (bytes > 0) { + buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, + base_offset); + if (buffer) { + KKASSERT(buffer->io.lock.refs == 0); + KKASSERT(buffer->io.modified == 0); + KKASSERT(buffer->zone2_offset == zone2_offset); + KKASSERT(buffer->volume == volume); + hammer_unload_buffer(buffer, NULL); + } + hammer_io_inval(volume, zone2_offset); + base_offset += HAMMER_BUFSIZE; + zone2_offset += HAMMER_BUFSIZE; + bytes -= HAMMER_BUFSIZE; + } hammer_rel_volume(volume, 0); } @@ -762,9 +769,10 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) /* * Final cleanup */ - volume = buffer->volume; RB_REMOVE(hammer_buf_rb_tree, - &volume->rb_bufs_root, buffer); + &buffer->io.hmp->rb_bufs_root, + buffer); + volume = buffer->volume; buffer->volume = NULL; /* sanity */ hammer_rel_volume(volume, 0); freeme = 1; @@ -782,31 +790,6 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush) } } -/* - * Remove the zoneX translation cache for a buffer given its zone-2 offset. - */ -void -hammer_uncache_buffer(hammer_mount_t hmp, hammer_off_t buf_offset) -{ - hammer_volume_t volume; - hammer_buffer_t buffer; - int vol_no; - int error; - - buf_offset &= ~HAMMER_BUFMASK64; - KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER); - vol_no = HAMMER_VOL_DECODE(buf_offset); - volume = hammer_get_volume(hmp, vol_no, &error); - KKASSERT(volume != 0); - KKASSERT(buf_offset < volume->maxbuf_off); - - buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root, - buf_offset); - if (buffer) - buffer->zoneX_offset = 0; - hammer_rel_volume(volume, 0); -} - /* * Access the filesystem buffer containing the specified hammer offset. * buf_offset is a conglomeration of the volume number and vol_buf_beg @@ -885,29 +868,6 @@ hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, return((char *)buffer->ondisk + xoff); } -/* - * Invalidate HAMMER_BUFSIZE bytes at zone2_offset. 
This is used to - * make sure that we do not have the related buffer cache buffer at - * the device layer because it is going to be aliased in a high level - * vnode layer. - */ -void -hammer_binval(hammer_mount_t hmp, hammer_off_t zone2_offset) -{ - hammer_volume_t volume; - int vol_no; - int error; - - KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) == - HAMMER_ZONE_RAW_BUFFER); - vol_no = HAMMER_VOL_DECODE(zone2_offset); - volume = hammer_get_volume(hmp, vol_no, &error); - if (volume) { - hammer_io_inval(volume, zone2_offset); - hammer_rel_volume(volume, 0); - } -} - /************************************************************************ * NODES * ************************************************************************ @@ -1268,126 +1228,6 @@ hammer_alloc_btree(hammer_transaction_t trans, int *errorp) return(node); } -#if 0 - -/* - * The returned buffers are already appropriately marked as being modified. - * If the caller marks them again unnecessary undo records may be generated. - * - * In-band data is indicated by data_bufferp == NULL. Pass a data_len of 0 - * for zero-fill (caller modifies data_len afterwords). - * - * If the caller is responsible for calling hammer_modify_*() prior to making - * any additional modifications to either the returned record buffer or the - * returned data buffer. - */ -void * -hammer_alloc_record(hammer_transaction_t trans, - hammer_off_t *rec_offp, u_int16_t rec_type, - struct hammer_buffer **rec_bufferp, - int32_t data_len, void **datap, - hammer_off_t *data_offp, - struct hammer_buffer **data_bufferp, int *errorp) -{ - hammer_record_ondisk_t rec; - hammer_off_t rec_offset; - hammer_off_t data_offset; - int32_t reclen; - - if (datap) - *datap = NULL; - - /* - * Allocate the record - */ - rec_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_RECORD_INDEX, - HAMMER_RECORD_SIZE, errorp); - if (*errorp) - return(NULL); - if (data_offp) - *data_offp = 0; - - /* - * Allocate data - */ - if (data_len) { - if (data_bufferp == NULL) { - switch(rec_type) { - case HAMMER_RECTYPE_DATA: - reclen = offsetof(struct hammer_data_record, - data[0]); - break; - case HAMMER_RECTYPE_DIRENTRY: - reclen = offsetof(struct hammer_entry_record, - name[0]); - break; - default: - panic("hammer_alloc_record: illegal " - "in-band data"); - /* NOT REACHED */ - reclen = 0; - break; - } - KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE); - data_offset = rec_offset + reclen; - } else if (data_len < HAMMER_BUFSIZE) { - data_offset = hammer_blockmap_alloc(trans, - HAMMER_ZONE_SMALL_DATA_INDEX, - data_len, errorp); - *data_offp = data_offset; - } else { - data_offset = hammer_blockmap_alloc(trans, - HAMMER_ZONE_LARGE_DATA_INDEX, - data_len, errorp); - *data_offp = data_offset; - } - } else { - data_offset = 0; - } - if (*errorp) { - hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE); - return(NULL); - } - - /* - * Basic return values. - * - * Note that because this is a 'new' buffer, there is no need to - * generate UNDO records for it. 
- */ - *rec_offp = rec_offset; - rec = hammer_bread(trans->hmp, rec_offset, errorp, rec_bufferp); - hammer_modify_buffer(trans, *rec_bufferp, NULL, 0); - bzero(rec, sizeof(*rec)); - KKASSERT(*errorp == 0); - rec->base.data_off = data_offset; - rec->base.data_len = data_len; - hammer_modify_buffer_done(*rec_bufferp); - - if (data_bufferp) { - if (data_len) { - *datap = hammer_bread(trans->hmp, data_offset, errorp, - data_bufferp); - KKASSERT(*errorp == 0); - } else { - *datap = NULL; - } - } else if (data_len) { - KKASSERT(data_offset + data_len - rec_offset <= - HAMMER_RECORD_SIZE); - if (datap) { - *datap = (void *)((char *)rec + - (int32_t)(data_offset - rec_offset)); - } - } else { - KKASSERT(datap == NULL); - } - KKASSERT(*errorp == 0); - return(rec); -} - -#endif - /* * Allocate data. If the address of a data buffer is supplied then * any prior non-NULL *data_bufferp will be released and *data_bufferp diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index f73f62855b..74f2c2feb8 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.20 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.21 2008/06/08 18:16:26 dillon Exp $ */ #include "hammer.h" @@ -413,6 +413,8 @@ static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *); void hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume) { + RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, + hammer_recover_flush_buffer_callback, NULL); RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, hammer_recover_flush_volume_callback, root_volume); if (root_volume->io.recovered) { @@ -432,8 +434,6 @@ hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data) { hammer_volume_t root_volume = data; - RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL, - hammer_recover_flush_buffer_callback, NULL); if (volume->io.recovered && volume != root_volume) { volume->io.recovered = 0; hammer_io_flush(&volume->io); @@ -449,7 +449,7 @@ hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) if (buffer->io.recovered) { buffer->io.recovered = 0; hammer_io_flush(&buffer->io); - hammer_rel_buffer(buffer, 0); + hammer_rel_buffer(buffer, 2); } return(0); } diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index f2b6827d39..c4dc7b5840 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.39 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.40 2008/06/08 18:16:26 dillon Exp $ */ #include @@ -63,11 +63,12 @@ int hammer_count_volumes; int hammer_count_buffers; int hammer_count_nodes; int hammer_count_dirtybufs; /* global */ +int hammer_count_reservations; int hammer_stats_btree_iterations; int hammer_stats_record_iterations; int hammer_limit_dirtybufs = 100; /* per-mount */ -int hammer_limit_irecs = 8192; /* per-inode */ -int hammer_limit_recs = 16384; /* as a whole */ +int hammer_limit_irecs; /* per-inode */ +int hammer_limit_recs; /* as a whole XXX */ int hammer_bio_count; int64_t hammer_contention_count; int64_t hammer_zone_limit; @@ -113,6 +114,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, count_nodes, CTLFLAG_RD, &hammer_count_nodes, 0, ""); SYSCTL_INT(_vfs_hammer, OID_AUTO, count_dirtybufs, CTLFLAG_RD, &hammer_count_dirtybufs, 0, ""); +SYSCTL_INT(_vfs_hammer, OID_AUTO, count_reservations, CTLFLAG_RD, + &hammer_count_reservations, 0, ""); SYSCTL_QUAD(_vfs_hammer, OID_AUTO, zone_limit, CTLFLAG_RW, &hammer_zone_limit, 0, ""); SYSCTL_QUAD(_vfs_hammer, OID_AUTO, contention_count, CTLFLAG_RW, @@ -164,7 +167,10 @@ MODULE_VERSION(hammer, 1); static int hammer_vfs_init(struct vfsconf *conf) { - /*hammer_init_alist_config();*/ + if (hammer_limit_irecs == 0) + hammer_limit_irecs = nbuf; + if (hammer_limit_recs == 0) /* XXX TODO */ + hammer_limit_recs = hammer_limit_irecs * 4; return(0); } @@ -292,6 +298,9 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, RB_INIT(&hmp->rb_inos_root); RB_INIT(&hmp->rb_nods_root); RB_INIT(&hmp->rb_undo_root); + RB_INIT(&hmp->rb_resv_root); + RB_INIT(&hmp->rb_bufs_root); + hmp->ronly = ((mp->mnt_flag & MNT_RDONLY) != 0); TAILQ_INIT(&hmp->volu_list); @@ -513,8 +522,10 @@ hammer_free_hmp(struct mount *mp) */ #endif /* - * Unload the volumes + * Unload buffers and then volumes */ + RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, + hammer_unload_buffer, NULL); RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, hammer_unload_volume, NULL); diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 3e6110bcad..ba56abd269 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.59 2008/06/07 07:41:51 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.60 2008/06/08 18:16:26 dillon Exp $ */ #include @@ -149,10 +149,16 @@ struct vop_ops hammer_fifo_vops = { .vop_setattr = hammer_vop_setattr }; +#ifdef DEBUG_TRUNCATE +struct hammer_inode *HammerTruncIp; +#endif + static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, struct vnode *dvp, struct ucred *cred, int flags); static int hammer_vop_strategy_read(struct vop_strategy_args *ap); static int hammer_vop_strategy_write(struct vop_strategy_args *ap); +static void hammer_cleanup_write_io(hammer_inode_t ip); +static void hammer_update_rsv_databufs(hammer_inode_t ip); #if 0 static @@ -307,16 +313,23 @@ hammer_vop_write(struct vop_write_args *ap) * * Always check at the beginning so separate writes are * not able to bypass this code. + * + * WARNING: Cannot unlock vp when doing a NOCOPY write as + * part of a putpages operation. Doing so could cause us + * to deadlock against the VM system when we try to re-lock. 
*/ if ((count++ & 15) == 0) { - vn_unlock(ap->a_vp); - if ((ap->a_ioflag & IO_NOBWILL) == 0) - bwillwrite(); + if (uio->uio_segflg != UIO_NOCOPY) { + vn_unlock(ap->a_vp); + if ((ap->a_ioflag & IO_NOBWILL) == 0) + bwillwrite(); + } if (ip->rsv_recs > hammer_limit_irecs) { hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); hammer_wait_inode(ip); } - vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY); + if (uio->uio_segflg != UIO_NOCOPY) + vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY); } rel_offset = (int)(uio->uio_offset & HAMMER_BUFMASK); @@ -402,6 +415,9 @@ hammer_vop_write(struct vop_write_args *ap) flags |= HAMMER_INODE_DDIRTY; /* XXX mtime */ hammer_modify_inode(ip, flags); + /* + * Try to keep track of cached dirty data. + */ if ((bp->b_flags & B_DIRTY) == 0) { ++ip->rsv_databufs; ++ip->hmp->rsv_databufs; @@ -424,7 +440,6 @@ hammer_vop_write(struct vop_write_args *ap) * too full, which would trigger non-optimal * flushes. */ - bp->b_flags |= B_NOCACHE; bawrite(bp); #endif } else { @@ -1509,11 +1524,29 @@ hammer_vop_setattr(struct vop_setattr_args *ap) */ if (truncating) { hammer_ip_frontend_trunc(ip, vap->va_size); + hammer_update_rsv_databufs(ip); +#ifdef DEBUG_TRUNCATE + if (HammerTruncIp == NULL) + HammerTruncIp = ip; +#endif if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { ip->flags |= HAMMER_INODE_TRUNCATED; ip->trunc_off = vap->va_size; +#ifdef DEBUG_TRUNCATE + if (ip == HammerTruncIp) + kprintf("truncate1 %016llx\n", ip->trunc_off); +#endif } else if (ip->trunc_off > vap->va_size) { ip->trunc_off = vap->va_size; +#ifdef DEBUG_TRUNCATE + if (ip == HammerTruncIp) + kprintf("truncate2 %016llx\n", ip->trunc_off); +#endif + } else { +#ifdef DEBUG_TRUNCATE + if (ip == HammerTruncIp) + kprintf("truncate3 %016llx (ignored)\n", vap->va_size); +#endif } } @@ -1998,7 +2031,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) hammer_inode_t ip; struct bio *bio; struct buf *bp; - int force_alt = 0; + int bytes; + int error; bio = ap->a_bio; bp = bio->bio_buf; @@ -2028,53 +2062,46 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) /* * Attempt to reserve space and issue a direct-write from the * front-end. If we can't we will queue the BIO to the flusher. + * The bulk/direct-write code will still bcopy if writing less + * then full-sized blocks (at the end of a file). * * If we can the I/O can be issued and an in-memory record will - * be installed to reference the stroage until the flusher can get to + * be installed to reference the storage until the flusher can get to * it. * * Since we own the high level bio the front-end will not try to - * do a read until the write completes. + * do a direct-read until the write completes. */ - if ((bp->b_bufsize & HAMMER_BUFMASK) == 0 && - bio->bio_offset + bp->b_bufsize <= ip->ino_data.size) { - record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, - bp->b_bufsize, &force_alt); - if (record) { - hammer_io_direct_write(ip->hmp, &record->leaf, bio); - hammer_rel_mem_record(record); - if (ip->rsv_recs > hammer_limit_irecs / 2) - hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); - else - hammer_flush_inode(ip, 0); - return(0); - } - } - - /* - * Queue the bio to the flusher and let it deal with it. - * - * If the inode is being flushed we cannot re-queue buffers - * it may have already flushed, or it could result in duplicate - * records in the database. 
- */ - BUF_KERNPROC(bp); - if (ip->flush_state == HAMMER_FST_FLUSH || force_alt) - TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act); + KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); + KKASSERT(bio->bio_offset < ip->ino_data.size); + if (bio->bio_offset + bp->b_bufsize <= ip->ino_data.size) + bytes = bp->b_bufsize; else - TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); - ++hammer_bio_count; - hammer_modify_inode(ip, HAMMER_INODE_BUFS); - hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); - - return(0); + bytes = (int)(ip->ino_data.size - bio->bio_offset); + + record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, + bytes, &error); + if (record) { + hammer_io_direct_write(ip->hmp, &record->leaf, bio); + hammer_rel_mem_record(record); + if (ip->rsv_recs > hammer_limit_irecs / 2) + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + else + hammer_flush_inode(ip, 0); + } else { + bp->b_error = error; + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); + } + hammer_cleanup_write_io(ip); + return(error); } /* * Clean-up after disposing of a dirty frontend buffer's data. * This is somewhat heuristical so try to be robust. */ -void +static void hammer_cleanup_write_io(hammer_inode_t ip) { if (ip->rsv_databufs) { @@ -2083,6 +2110,31 @@ hammer_cleanup_write_io(hammer_inode_t ip) } } +/* + * We can lose track of dirty buffer cache buffers if we truncate, this + * routine will resynchronize the count. + */ +static +void +hammer_update_rsv_databufs(hammer_inode_t ip) +{ + struct buf *bp; + int delta; + int n; + + if (ip->vp) { + n = 0; + RB_FOREACH(bp, buf_rb_tree, &ip->vp->v_rbdirty_tree) { + ++n; + } + } else { + n = 0; + } + delta = n - ip->rsv_databufs; + ip->rsv_databufs += delta; + ip->hmp->rsv_databufs += delta; +} + /* * dounlink - disconnect a directory entry * -- 2.41.0
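
A minimal user-space sketch of the reservation tracking introduced by this
patch: a refcounted RB tree keyed on the zone offset stands in for
hmp->rb_resv_root, and the function names (resv_reserve, resv_release,
resv_is_reserved) are hypothetical stand-ins for the real
hammer_blockmap_reserve() path, not the HAMMER API.  It assumes a BSD
<sys/tree.h> (as on DragonFly) and is a model of the idea only, not code
from the tree.

#include <sys/tree.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t hammer_off_t;

struct resv {
	RB_ENTRY(resv) rb_node;
	hammer_off_t zone_offset;	/* key: reserved big-block base */
	int refs;			/* in-memory users of the space */
};

static int
resv_cmp(struct resv *r1, struct resv *r2)
{
	if (r1->zone_offset < r2->zone_offset)
		return(-1);
	if (r1->zone_offset > r2->zone_offset)
		return(1);
	return(0);
}

RB_HEAD(resv_tree, resv);
RB_GENERATE(resv_tree, resv, rb_node, resv_cmp)

static struct resv_tree ResvRoot = RB_INITIALIZER(&ResvRoot);

/*
 * Frontend: reserve, or add a reference to, the block at base_offset.
 */
static struct resv *
resv_reserve(hammer_off_t base_offset)
{
	struct resv key, *res;

	key.zone_offset = base_offset;
	res = RB_FIND(resv_tree, &ResvRoot, &key);
	if (res == NULL) {
		res = calloc(1, sizeof(*res));
		res->zone_offset = base_offset;
		RB_INSERT(resv_tree, &ResvRoot, res);
	}
	++res->refs;
	return(res);
}

/*
 * Drop a reference; only a zero-ref block may be freed or reused.
 */
static void
resv_release(struct resv *res)
{
	if (--res->refs == 0) {
		RB_REMOVE(resv_tree, &ResvRoot, res);
		free(res);
	}
}

/*
 * Backend: skip any block that still carries a live reservation.
 */
static int
resv_is_reserved(hammer_off_t base_offset)
{
	struct resv key;

	key.zone_offset = base_offset;
	return(RB_FIND(resv_tree, &ResvRoot, &key) != NULL);
}

int
main(void)
{
	struct resv *res = resv_reserve(0x100000);

	printf("reserved: %d\n", resv_is_reserved(0x100000));
	resv_release(res);
	printf("reserved: %d\n", resv_is_reserved(0x100000));
	return(0);
}

Refcounting is the point of the structure: several in-memory records (and
cached holes) can point at space inside the same reserved block, and the
space only becomes reusable once the last of them lets go.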
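
The CRC fix for short data buffers comes down to ordering: compute the CRC
over the aligned, zero-padded image that actually goes to disk, not over
the caller's short buffer with the aligned length.  The sketch below shows
that ordering in isolation; it assumes nothing about the kernel crc32()
beyond it being a byte-stream CRC and substitutes a generic bit-reflected
CRC-32, and bdata is just a malloc'd stand-in for the allocated data
buffer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Generic CRC-32 (IEEE, bit-reflected); stands in for the kernel crc32(). */
static uint32_t
crc32_simple(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~0U;
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; ++i)
			crc = (crc >> 1) ^ (0xEDB88320U & -(crc & 1U));
	}
	return(~crc);
}

int
main(void)
{
	char data[10] = "shortdata";		/* caller's short buffer */
	int bytes = sizeof(data);
	int aligned_bytes = (bytes + 15) & ~15;	/* 16-byte on-disk alignment */
	char *bdata = malloc(aligned_bytes);	/* stand-in for the allocated
						 * on-disk data buffer */

	/*
	 * BROKEN (old order): CRC the caller's buffer using the aligned
	 * length -- reads past the end of 'data' and cannot match the
	 * zero-filled tail that is written to disk.
	 *
	 *	crc = crc32_simple(data, aligned_bytes);
	 */

	/*
	 * FIXED (new order): copy, zero the padding, then CRC the exact
	 * image that goes to disk.
	 */
	memcpy(bdata, data, bytes);
	if (aligned_bytes > bytes)
		memset(bdata + bytes, 0, aligned_bytes - bytes);
	printf("data_crc = %08x\n", crc32_simple(bdata, aligned_bytes));

	free(bdata);
	return(0);
}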
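
The conflict handling in hammer_ip_add_bulk() uses a small pattern worth
calling out: sample ip->idle_wakeup before kicking the flusher, then sleep
only while the counter is unchanged, so a wakeup that races in is not
lost.  Below is a user-space analogue of that generation-counter wait,
with pthreads in place of tsleep()/wakeup(); all names here are invented
for the sketch (build with -pthread).

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct obj {
	pthread_mutex_t lk;
	pthread_cond_t cv;
	int idle_wakeup;	/* bumped when the object goes idle */
	int busy_refs;		/* stands in for extra record references */
};

/*
 * Frontend side: sample the generation counter, request the (hypothetical)
 * flush, and block only while the counter is still unchanged.
 */
static void
wait_until_idle(struct obj *o)
{
	int save_wakeup;

	pthread_mutex_lock(&o->lk);
	while (o->busy_refs != 0) {
		save_wakeup = o->idle_wakeup;
		/* request_flush(o) would be issued here */
		while (o->idle_wakeup == save_wakeup)
			pthread_cond_wait(&o->cv, &o->lk);
	}
	pthread_mutex_unlock(&o->lk);
}

/*
 * Backend side: drop the last busy reference, bump the generation and
 * wake waiters (the analogue of ++ip->idle_wakeup; wakeup()).
 */
static void
release_busy(struct obj *o)
{
	pthread_mutex_lock(&o->lk);
	if (--o->busy_refs == 0) {
		++o->idle_wakeup;
		pthread_cond_broadcast(&o->cv);
	}
	pthread_mutex_unlock(&o->lk);
}

static struct obj O = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 1
};

static void *
backend(void *arg)
{
	sleep(1);		/* pretend the flusher is finishing up */
	release_busy(&O);
	return(NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, backend, NULL);
	wait_until_idle(&O);
	printf("idle\n");
	pthread_join(td, NULL);
	return(0);
}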
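
Truncation can discard dirty buffers without the write path seeing them,
so the cached-dirty-data estimate is resynchronized by recounting.  The
heart of hammer_update_rsv_databufs() is a single delta applied to both
the per-inode and per-mount counters; a stripped-down sketch follows, with
a plain integer argument standing in for the walk of the vnode's
v_rbdirty_tree (the struct and names here are simplified, not the kernel
types).

#include <stdio.h>

struct inode_stats {
	int rsv_databufs;		/* per-inode estimate */
	int *mount_rsv_databufs;	/* per-mount total */
};

/*
 * Apply the same correction to both counters so they stay in sync.
 */
static void
update_rsv_databufs(struct inode_stats *ip, int actual_dirty_bufs)
{
	int delta = actual_dirty_bufs - ip->rsv_databufs;

	ip->rsv_databufs += delta;
	*ip->mount_rsv_databufs += delta;
}

int
main(void)
{
	int mount_total = 10;
	struct inode_stats ip = { 7, &mount_total };

	update_rsv_databufs(&ip, 3);	/* truncate left only 3 dirty bufs */
	printf("inode=%d mount=%d\n", ip.rsv_databufs, mount_total);
	return(0);
}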