From 059819e3ed7c6f24ca4eda1cf9f8b1f9051977e8 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 22 Apr 2008 19:00:15 +0000 Subject: [PATCH] HAMMER 37/Many: Add a flush helper thread, clean up some inconsistencies. This patch generally cleans up transaction id inconsistencies between data and meta-data by syncing the data and inode in a single transaction. * Change the I/O path for write strategy calls. Queue the BIO's to the flush helper thread instead of directly updating the media. * We no longer try to sync individual buffers. If the kernel wants to sync a dirty buffer, we sync the whole file to minimize inode updates. --- sys/vfs/hammer/Makefile | 4 +- sys/vfs/hammer/hammer.h | 26 ++- sys/vfs/hammer/hammer_disk.h | 19 ++- sys/vfs/hammer/hammer_flusher.c | 292 ++++++++++++++++++++++++++++++++ sys/vfs/hammer/hammer_inode.c | 20 ++- sys/vfs/hammer/hammer_io.c | 6 +- sys/vfs/hammer/hammer_object.c | 5 +- sys/vfs/hammer/hammer_undo.c | 13 +- sys/vfs/hammer/hammer_vfsops.c | 8 +- sys/vfs/hammer/hammer_vnops.c | 147 +++++++++------- 10 files changed, 453 insertions(+), 87 deletions(-) create mode 100644 sys/vfs/hammer/hammer_flusher.c diff --git a/sys/vfs/hammer/Makefile b/sys/vfs/hammer/Makefile index 98ab82042b..1bf6d435d2 100644 --- a/sys/vfs/hammer/Makefile +++ b/sys/vfs/hammer/Makefile @@ -1,5 +1,5 @@ # -# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.10 2008/03/20 06:08:40 dillon Exp $ +# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.11 2008/04/22 19:00:14 dillon Exp $ KMOD= hammer SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \ @@ -7,7 +7,7 @@ SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \ hammer_cursor.c hammer_btree.c hammer_transaction.c \ hammer_object.c hammer_recover.c hammer_ioctl.c \ hammer_blockmap.c hammer_freemap.c hammer_undo.c \ - hammer_reblock.c + hammer_reblock.c hammer_flusher.c NOMAN= diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index cc23480465..d0dac9d29e 100644 --- a/sys/vfs/hammer/hammer.h +++ 
b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.46 2008/03/30 21:33:42 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.47 2008/04/22 19:00:14 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -82,6 +82,7 @@ typedef struct hammer_inode_info { struct hammer_transaction { struct hammer_mount *hmp; hammer_tid_t tid; + hammer_seq_t seq; struct hammer_volume *rootvol; /* TAILQ_HEAD(, hammer_io) recycle_list;*/ }; @@ -159,12 +160,15 @@ TAILQ_HEAD(hammer_node_list, hammer_node); struct hammer_inode { RB_ENTRY(hammer_inode) rb_node; + TAILQ_HEAD(, bio) bio_list; /* BIOs to flush out */ + TAILQ_ENTRY(hammer_inode) flush_entry; u_int64_t obj_id; /* (key) object identifier */ hammer_tid_t obj_asof; /* (key) snapshot transid or 0 */ hammer_tid_t last_tid; /* last modified tid (for fsync) */ hammer_tid_t sync_tid; /* last inode tid synced to disk */ struct hammer_mount *hmp; int flags; + int error; /* flush error */ int cursor_ip_refs; /* sanity */ struct vnode *vp; struct lockf advlock; @@ -182,7 +186,7 @@ typedef struct hammer_inode *hammer_inode_t; #define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */ #define HAMMER_INODE_RDIRTY 0x0002 /* in-memory ino_rec is dirty */ #define HAMMER_INODE_ITIMES 0x0004 /* in-memory mtime/atime modified */ -#define HAMMER_INODE_XDIRTY 0x0008 /* in-memory records present */ +#define HAMMER_INODE_XDIRTY 0x0008 /* in-memory records/flsbufs present */ #define HAMMER_INODE_ONDISK 0x0010 /* inode is on-disk (else not yet) */ #define HAMMER_INODE_FLUSH 0x0020 /* flush on last ref */ #define HAMMER_INODE_DELETED 0x0080 /* inode ready for deletion */ @@ -192,6 +196,8 @@ typedef struct hammer_inode *hammer_inode_t; #define HAMMER_INODE_DONDISK 0x0800 /* data records may be on disk */ #define HAMMER_INODE_BUFS 0x1000 /* dirty high level bps present */ 
#define HAMMER_INODE_TIDLOCKED 0x2000 /* tid locked until inode synced */ +#define HAMMER_INODE_FLUSHQ 0x4000 /* On flush queue */ +#define HAMMER_INODE_FLUSHW 0x8000 /* Someone waiting for flush */ #define HAMMER_INODE_MODMASK (HAMMER_INODE_DDIRTY|HAMMER_INODE_RDIRTY| \ HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS| \ @@ -415,6 +421,10 @@ struct hammer_mount { int ronly; int nvolumes; int volume_iterator; + int flusher_seq; + int flusher_act; + int flusher_exiting; + thread_t flusher_td; u_int check_interrupt; uuid_t fsid; udev_t fsid_udev; @@ -424,6 +434,7 @@ struct hammer_mount { struct netexport export; struct lock blockmap_lock; struct hammer_holes holes[HAMMER_MAX_ZONES]; + TAILQ_HEAD(, hammer_inode) flush_list; }; typedef struct hammer_mount *hammer_mount_t; @@ -549,6 +560,8 @@ void *hammer_bnew(struct hammer_mount *hmp, hammer_off_t off, int *errorp, struct hammer_buffer **bufferp); hammer_volume_t hammer_get_root_volume(hammer_mount_t hmp, int *errorp); +int hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, + struct bio *bio); hammer_volume_t hammer_get_volume(hammer_mount_t hmp, int32_t vol_no, int *errorp); @@ -590,8 +603,8 @@ void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len, hammer_off_t *data_offsetp, struct hammer_buffer **data_bufferp, int *errorp); -int hammer_generate_undo(hammer_transaction_t trans, hammer_off_t zone1_offset, - void *base, int len); +int hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, + hammer_off_t zone1_offset, void *base, int len); void hammer_put_volume(struct hammer_volume *volume, int flush); void hammer_put_buffer(struct hammer_buffer *buffer, int flush); @@ -669,6 +682,11 @@ void hammer_init_holes(hammer_mount_t hmp, hammer_holes_t holes); void hammer_free_holes(hammer_mount_t hmp, hammer_holes_t holes); int hammer_signal_check(hammer_mount_t hmp); +void hammer_flusher_create(hammer_mount_t hmp); +void hammer_flusher_destroy(hammer_mount_t hmp); +void 
hammer_flusher_sync(hammer_mount_t hmp); +void hammer_flusher_async(hammer_mount_t hmp); + #endif static __inline void diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 99faf11efc..8a4bdde98a 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.27 2008/03/19 20:18:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.28 2008/04/22 19:00:15 dillon Exp $ */ #ifndef VFS_HAMMER_DISK_H_ @@ -94,6 +94,8 @@ */ typedef u_int64_t hammer_tid_t; typedef u_int64_t hammer_off_t; +typedef u_int32_t hammer_seq_t; +typedef u_int32_t hammer_crc_t; #define HAMMER_MIN_TID 0ULL /* unsigned */ #define HAMMER_MAX_TID 0xFFFFFFFFFFFFFFFFULL /* unsigned */ @@ -219,7 +221,7 @@ struct hammer_blockmap { hammer_off_t first_offset; /* zone-X logical offset (zone 3) */ hammer_off_t next_offset; /* zone-X logical offset */ hammer_off_t alloc_offset; /* zone-X logical offset */ - u_int32_t entry_crc; + hammer_crc_t entry_crc; u_int32_t reserved01; }; @@ -242,13 +244,13 @@ typedef struct hammer_blockmap *hammer_blockmap_t; struct hammer_blockmap_layer1 { hammer_off_t blocks_free; /* big-blocks free */ hammer_off_t phys_offset; /* UNAVAIL or zone-2 */ - u_int32_t layer1_crc; /* crc of this entry */ - u_int32_t layer2_crc; /* xor'd crc's of HAMMER_BLOCKSIZE */ + hammer_crc_t layer1_crc; /* crc of this entry */ + hammer_crc_t layer2_crc; /* xor'd crc's of HAMMER_BLOCKSIZE */ hammer_off_t reserved01; }; struct hammer_blockmap_layer2 { - u_int32_t entry_crc; + hammer_crc_t entry_crc; u_int32_t bytes_free; /* bytes free within this bigblock */ union { hammer_off_t owner; /* used by freemap */ @@ -330,9 +332,8 @@ struct hammer_fifo_head { u_int16_t hdr_signature; u_int16_t hdr_type; u_int32_t hdr_size; /* aligned size of the whole mess */ - u_int32_t hdr_crc; - u_int32_t hdr_seq; - 
u_int64_t hdr_tid; /* related TID */ + u_int32_t reserved01; /* (0) reserved for future use */ + hammer_crc_t hdr_crc; }; struct hammer_fifo_tail { @@ -497,7 +498,7 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; struct hammer_base_record { u_int32_t signature; /* record signature */ - u_int32_t data_crc; /* data crc */ + hammer_crc_t data_crc; /* data crc */ struct hammer_base_elm base; /* 40 byte base element */ hammer_off_t data_off; /* in-band or out-of-band */ int32_t data_len; /* size of data in bytes */ diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c new file mode 100644 index 0000000000..806b7e2d78 --- /dev/null +++ b/sys/vfs/hammer/hammer_flusher.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.1 2008/04/22 19:00:15 dillon Exp $ + */ +/* + * HAMMER dependancy flusher thread + * + * Meta data updates create buffer dependancies which are arranged as a + * hierarchy of lists. + */ + +#include "hammer.h" + +static void hammer_flusher_thread(void *arg); +static void hammer_flusher_flush(hammer_mount_t hmp); + +void +hammer_flusher_sync(hammer_mount_t hmp) +{ + int seq; + + seq = ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); + while ((int)(seq - hmp->flusher_act) > 0) + tsleep(&hmp->flusher_act, 0, "hmrfls", 0); +} + +void +hammer_flusher_async(hammer_mount_t hmp) +{ + ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); +} + +void +hammer_flusher_create(hammer_mount_t hmp) +{ + lwkt_create(hammer_flusher_thread, hmp, &hmp->flusher_td, NULL, + 0, -1, "hammer"); +} + +void +hammer_flusher_destroy(hammer_mount_t hmp) +{ + hmp->flusher_exiting = 1; + ++hmp->flusher_seq; + wakeup(&hmp->flusher_seq); + while (hmp->flusher_td) + tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0); +} + +static void +hammer_flusher_thread(void *arg) +{ + hammer_mount_t hmp = arg; + int seq; + + for (;;) { + seq = hmp->flusher_seq; + while (TAILQ_FIRST(&hmp->flush_list) != NULL) + hammer_flusher_flush(hmp); + hmp->flusher_act = seq; + wakeup(&hmp->flusher_act); + if (hmp->flusher_exiting) + break; + while (hmp->flusher_seq == hmp->flusher_act) + tsleep(&hmp->flusher_seq, 0, "hmrflt", 0); + } + 
hmp->flusher_td = NULL; + wakeup(&hmp->flusher_exiting); + lwkt_exit(); +} + +/* + * Flush stuff + */ +static void +hammer_flusher_flush(hammer_mount_t hmp) +{ + hammer_inode_t ip; + + while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) { + TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry); + ip->flags &= ~HAMMER_INODE_FLUSHQ; + + /* + * We inherit the inode ref from the flush list + */ + ip->error = hammer_sync_inode(ip, MNT_WAIT, (ip->vp ? 0 : 1)); + if (ip->flags & HAMMER_INODE_FLUSHW) { + ip->flags &= ~HAMMER_INODE_FLUSHW; + wakeup(ip); + } + hammer_rel_inode(ip, 0); + } +} + +#if 0 + +static __inline +int +undo_seq_cmp(hammer_mount_t hmp, hammer_off_t seq1, hammer_off_t seq2) +{ + int delta; + + delta = (int)(seq1 - seq2) & hmp->undo_mask; + if (delta == 0) + return(0); + if (delta > (hmp->undo_mask >> 1)) + return(-1); + return(1); +} + +static void +hammer_flusher_flush(hammer_mount_t hmp) +{ + hammer_off_t undo_seq; + hammer_buffer_t buffer; + hammer_volume_t root_volume; + hammer_blockmap_t rootmap; + int count; + int error; + + /* + * The undo space is sequenced via the undo zone. + */ + root_volume = hammer_get_root_volume(hmp, &error); + if (root_volume == NULL) { + panic("hammer: can't get root volume"); + return; + } + + /* + * Flush pending undo buffers. The kernel may also flush these + * asynchronously. This list may also contain pure data buffers + * (which do not overwrite pre-existing data). + * + * The flush can occur simultaniously with new appends, only flush + * through undo_seq. If an I/O is already in progress the call to + * hammer_ref_buffer() will wait for it to complete. + * + * Note that buffers undergoing I/O not initiated by us are not + * removed from the list until the I/O is complete, so they are + * still visible to us to block on. 
+ */ + + /* + * Lock the meta-data buffers + */ + undo_seq = hmp->undo_zone.next_offset; + TAILQ_FOREACH(buffer, &hmp->undo_dep_list, undo_entry) { + KKASSERT(buffer->io.undo_type == HAMMER_UNDO_TYPE_DEP); + buffer->io.undo_type = HAMMER_UNDO_TYPE_DEP_LOCKED; + if (undo_seq_cmp(hmp, buffer->io.undo_off, undo_seq) >= 0) + break; + } + + /* + * Initiate I/O for the undo fifo buffers + */ + count = 0; + while ((buffer = TAILQ_FIRST(&hmp->undo_buf_list)) != NULL) { + if (undo_seq_cmp(hmp, buffer->io.undo_off, undo_seq) >= 0) { + break; + } + hammer_ref_buffer(buffer); + + if (buffer != (void *)TAILQ_FIRST(&hmp->undo_buf_list)) { + hammer_rel_buffer(buffer, 0); + } else { + TAILQ_REMOVE(&hmp->undo_buf_list, buffer, undo_entry); + buffer->io.undo_type = HAMMER_UNDO_TYPE_NONE; + if (buffer->io.modified) { + buffer->io.decount = &count; + ++count; + } + hammer_rel_buffer(buffer, 1); + } + } + + /* + * Wait for completion + */ + crit_enter(); + while (count) + tsleep(&count, 0, "hmrfwt", 0); + crit_exit(); + + /* + * The root volume must be updated. The previous push is now fully + * synchronized. { first_offset, next_offset } tell the mount code + * what needs to be undone. + */ + hammer_modify_volume(NULL, root_volume, NULL, 0); + rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; + rootmap->first_offset = rootmap->next_offset; + rootmap->next_offset = undo_seq; + hammer_modify_volume_done(NULL, root_volume); + + /* + * cache the first_offset update. the cached next_offset is the + * current next_offset, not the undo_seq that we synchronized to disk. + * XXX + */ + hmp->undo_zone.first_offset = rootmap->first_offset; + + ++count; + root_volume->io.decount = &count; + hammer_rel_volume(root_volume, 2); + + /* + * Wait for completion + */ + crit_enter(); + while (count) + tsleep(&count, 0, "hmrfwt", 0); + crit_exit(); + + /* + * Now we can safely push out buffers containing meta-data + * modifications. 
If we crash while doing this, the changes will + * be undone on mount. + */ + while ((buffer = TAILQ_FIRST(&hmp->undo_dep_list)) != NULL) { + if (buffer->io.undo_type != HAMMER_UNDO_TYPE_DEP_LOCKED) + break; + hammer_ref_buffer(buffer); + + if (buffer != TAILQ_FIRST(&hmp->undo_dep_list)) { + hammer_rel_buffer(buffer, 0); + } else { + TAILQ_REMOVE(&hmp->undo_dep_list, buffer, undo_entry); + if (buffer->io.modified) { + buffer->io.decount = &count; + ++count; + hammer_rel_buffer(buffer, 2); + } else { + hammer_rel_buffer(buffer, 0); + } + } + } + + /* + * Wait for completion + */ + crit_enter(); + while (count) + tsleep(&count, 0, "hmrfwt", 0); + crit_exit(); + + /* + * The undo bit is only cleared if no new undo's were entered into + * the cache, and first_offset == next_offset. + */ + if (hmp->undo_zone.next_offset == undo_seq && + rootmap->first_offset == rootmap->next_offset) { + hmp->hflags &= ~HMNT_UDIRTY; + } +} + +#endif diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index b7f9aa01bc..1744de4bc6 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.33 2008/03/29 20:12:54 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.34 2008/04/22 19:00:15 dillon Exp $ */ #include "hammer.h" @@ -208,6 +208,7 @@ loop: if (hmp->ronly) ip->flags |= HAMMER_INODE_RO; RB_INIT(&ip->rec_tree); + TAILQ_INIT(&ip->bio_list); /* * Locate the on-disk inode. 
@@ -302,6 +303,7 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->last_tid = trans->tid; RB_INIT(&ip->rec_tree); + TAILQ_INIT(&ip->bio_list); ip->ino_rec.ino_atime = trans->tid; ip->ino_rec.ino_mtime = trans->tid; @@ -530,6 +532,7 @@ hammer_unload_inode(struct hammer_inode *ip, void *data) kprintf("hammer_sync_inode failed error %d\n", error); if (ip->lock.refs == 1) { KKASSERT(RB_EMPTY(&ip->rec_tree)); + KKASSERT(TAILQ_EMPTY(&ip->bio_list)); RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); hammer_uncache_node(&ip->cache[0]); @@ -618,6 +621,7 @@ int hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete) { struct hammer_transaction trans; + struct bio *bio; int error; if ((ip->flags & HAMMER_INODE_MODMASK) == 0) { @@ -662,16 +666,24 @@ hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete) } /* - * Sync the buffer cache. + * Sync the buffer cache. This will queue the BIOs */ if (ip->vp != NULL) { - error = vfsync(ip->vp, waitfor, 1, NULL, NULL); + error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL); if (RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL) ip->flags &= ~HAMMER_INODE_BUFS; } else { error = 0; } + /* + * Flush the queued BIOs + */ + while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) { + TAILQ_REMOVE(&ip->bio_list, bio, bio_act); + hammer_dowrite(&trans, ip, bio); + } + /* * Now sync related records @@ -684,7 +696,7 @@ hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete) error = -error; break; } - if (RB_EMPTY(&ip->rec_tree)) + if (RB_EMPTY(&ip->rec_tree) && TAILQ_EMPTY(&ip->bio_list)) ip->flags &= ~HAMMER_INODE_XDIRTY; /* diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 4b073ea801..4e5a412460 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.23 2008/03/24 23:50:23 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.24 2008/04/22 19:00:15 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -353,7 +353,7 @@ hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume, if (len) { intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk; KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0); - hammer_generate_undo(trans, + hammer_generate_undo(trans, &volume->io, HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset), base, len); } @@ -372,7 +372,7 @@ hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer, if (len) { intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk; KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0); - hammer_generate_undo(trans, + hammer_generate_undo(trans, &buffer->io, buffer->zone2_offset + rel_offset, base, len); } diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index d79bfd70fc..f395c91832 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.39 2008/03/29 20:12:54 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.40 2008/04/22 19:00:15 dillon Exp $ */ #include "hammer.h" @@ -438,7 +438,8 @@ hammer_ip_del_directory(struct hammer_transaction *trans, */ if (error == 0) { --ip->ino_rec.ino_nlinks; - hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); + hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY | + HAMMER_INODE_TIDLOCKED); if (ip->ino_rec.ino_nlinks == 0 && (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { hammer_done_cursor(cursor); diff --git a/sys/vfs/hammer/hammer_undo.c b/sys/vfs/hammer/hammer_undo.c index ca79243e55..1213eba035 100644 --- a/sys/vfs/hammer/hammer_undo.c +++ b/sys/vfs/hammer/hammer_undo.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.3 2008/03/24 23:50:23 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.4 2008/04/22 19:00:15 dillon Exp $ */ /* @@ -74,8 +74,8 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp) * offset. 
*/ int -hammer_generate_undo(hammer_transaction_t trans, hammer_off_t zone1_off, - void *base, int len) +hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, + hammer_off_t zone1_off, void *base, int len) { hammer_volume_t root_volume; hammer_volume_ondisk_t ondisk; @@ -160,11 +160,18 @@ again: undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE; undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD; undo->head.hdr_size = bytes; + undo->head.reserved01 = 0; + undo->head.hdr_crc = 0; undo->undo_offset = zone1_off; undo->undo_data_bytes = len; bcopy(base, undo + 1, len); undo->head.hdr_crc = crc32(undo, bytes); + /* + * Update the undo offset space in the IO XXX + */ + + undomap->next_offset += bytes; if (buffer) diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index 4235001d82..da8a7a3183 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.24 2008/03/30 21:33:42 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.25 2008/04/22 19:00:15 dillon Exp $ */ #include @@ -186,6 +186,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, hmp->root_btree_end.obj_type = 0; lockinit(&hmp->blockmap_lock, "blkmap", 0, 0); + TAILQ_INIT(&hmp->flush_list); + for (i = 0; i < HAMMER_MAX_ZONES; ++i) { hmp->zone_limits[i] = HAMMER_ZONE_ENCODE(i, HAMMER_ZONE_LIMIT); @@ -291,6 +293,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, crc32((char *)&rootvol->ondisk->vol_fsid + 8, 8); hammer_rel_volume(rootvol, 0); + hammer_flusher_create(hmp); + /* * Locate the root directory using the root cluster's B-Tree as a * starting point. The root directory uses an obj_id of 1. 
@@ -358,6 +362,8 @@ hammer_free_hmp(struct mount *mp) hmp->rootvp = NULL; } #endif + hammer_flusher_sync(hmp); + hammer_flusher_destroy(hmp); /* * Unload & flush inodes diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 6bcf8a5476..0f4c7da571 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.36 2008/03/19 20:18:17 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.37 2008/04/22 19:00:15 dillon Exp $ */ #include @@ -171,11 +171,21 @@ int hammer_vop_fsync(struct vop_fsync_args *ap) { hammer_inode_t ip; - int error; ip = VTOI(ap->a_vp); - error = hammer_sync_inode(ip, ap->a_waitfor, 0); - return (error); + if ((ip->flags & HAMMER_INODE_FLUSHQ) == 0) { + ++ip->lock.refs; + ip->flags |= HAMMER_INODE_FLUSHQ; + TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); + hammer_flusher_async(ip->hmp); + } + if (ap->a_waitfor == MNT_WAIT) { + while (ip->flags & HAMMER_INODE_FLUSHQ) { + ip->flags |= HAMMER_INODE_FLUSHW; + tsleep(ip, 0, "hmrifl", 0); + } + } + return (ip->error); } /* @@ -256,6 +266,7 @@ hammer_vop_write(struct vop_write_args *ap) int error; int n; int flags; + int count; if (ap->a_vp->v_type != VREG) return (EINVAL); @@ -288,9 +299,20 @@ hammer_vop_write(struct vop_write_args *ap) /* * Access the data in HAMMER_BUFSIZE blocks via the buffer cache. 
*/ + count = 0; while (uio->uio_resid > 0) { int fixsize = 0; + /* + * Do not allow huge writes to deadlock the buffer cache + */ + if ((++count & 15) == 0) { + vn_unlock(ap->a_vp); + if ((ap->a_ioflag & IO_NOBWILL) == 0) + bwillwrite(); + vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY); + } + offset = uio->uio_offset & HAMMER_BUFMASK; n = HAMMER_BUFSIZE - offset; if (n > uio->uio_resid) @@ -366,33 +388,17 @@ hammer_vop_write(struct vop_write_args *ap) flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS; hammer_modify_inode(&trans, ip, flags); -#if 0 - /* - * The file write must be tagged with the same TID as the - * inode, for consistency in case the inode changed size. - * This guarantees the on-disk data records will have a - * TID <= the inode TID representing the size change. - * - * If a prior write has not yet flushed, retain its TID. - */ - if (bp->b_tid == 0) - bp->b_tid = ip->last_tid; -#endif - /* - * For now we can't use ip->last_tid because we may wind - * up trying to flush the same buffer with the same TID - * (but different data) multiple times, which will cause - * a panic. - */ - if (bp->b_tid == 0) - bp->b_tid = trans.tid; - if (ap->a_ioflag & IO_SYNC) { bwrite(bp); } else if (ap->a_ioflag & IO_DIRECT) { bawrite(bp); - } else if ((ap->a_ioflag >> 16) > 1 && +#if 0 + } else if ((ap->a_ioflag >> 16) == IO_SEQMAX && (uio->uio_offset & HAMMER_BUFMASK) == 0) { + /* + * XXX HAMMER can only fsync the whole inode, + * doing it on every buffer would be a bad idea. + */ /* * If seqcount indicates sequential operation and * we just finished filling a buffer, push it out @@ -400,7 +406,8 @@ hammer_vop_write(struct vop_write_args *ap) * too full, which would trigger non-optimal * flushes. 
*/ - bawrite(bp); + bdwrite(bp); +#endif } else { bdwrite(bp); } @@ -1418,8 +1425,6 @@ hammer_vop_setattr(struct vop_setattr_args *ap) if (error == 0) { bzero(bp->b_data + offset, HAMMER_BUFSIZE - offset); - if (bp->b_tid == 0) - bp->b_tid = trans.tid; bdwrite(bp); } else { brelse(bp); @@ -1627,13 +1632,11 @@ hammer_vop_strategy(struct vop_strategy_args *ap) error = hammer_vop_strategy_write(ap); break; default: - error = EINVAL; + bp->b_error = error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); break; } - bp->b_error = error; - if (error) - bp->b_flags |= B_ERROR; - biodone(ap->a_bio); return (error); } @@ -1738,6 +1741,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) KKASSERT(n > 0); if (n > bp->b_bufsize - boff) n = bp->b_bufsize - boff; + bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n); boff += n; @@ -1759,18 +1763,20 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) /* boff = bp->b_bufsize; */ } bp->b_resid = 0; + bp->b_error = error; + if (error) + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); return(error); } /* - * Write to a regular file. Iterate the related records and mark for - * deletion. If existing edge records (left and right side) overlap our - * write they have to be marked deleted and new records created, usually - * referencing a portion of the original data. Then add a record to - * represent the buffer. + * Write to a regular file. Because this is a strategy call the OS is + * trying to actually sync data to the media. HAMMER can only flush + * the entire inode (so the TID remains properly synchronized). * - * The support code in hammer_object.c should be used to deal with mixed - * in-memory and on-disk records. + * Basically all we do here is place the bio on the inode's flush queue + * and activate the flusher. 
*/ static int @@ -1780,30 +1786,57 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) hammer_inode_t ip; struct bio *bio; struct buf *bp; - int error; bio = ap->a_bio; bp = bio->bio_buf; ip = ap->a_vp->v_data; - if (ip->flags & HAMMER_INODE_RO) - return (EROFS); + if (ip->flags & HAMMER_INODE_RO) { + bp->b_error = EROFS; + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); + return(EROFS); + } + BUF_KERNPROC(bp); + TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); + hammer_start_transaction(&trans, ip->hmp); /* XXX */ + hammer_modify_inode(&trans, ip, HAMMER_INODE_XDIRTY); + hammer_commit_transaction(&trans); - /* - * Start a transaction using the TID stored with the bp. - */ - KKASSERT(bp->b_tid != 0); - hammer_start_transaction_tid(&trans, ip->hmp, bp->b_tid); + if ((ip->flags & HAMMER_INODE_FLUSHQ) == 0) { + ++ip->lock.refs; + ip->flags |= HAMMER_INODE_FLUSHQ; + TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); + hammer_flusher_async(ip->hmp); + } + return(0); +} + +/* + * Back-end code which actually performs the write to the media. This + * routine is typically called from the flusher. The bio will be disposed + * of (biodone'd) by this routine. + * + * Iterate the related records and mark for deletion. If existing edge + * records (left and right side) overlap our write they have to be marked + * deleted and new records created, usually referencing a portion of the + * original data. Then add a record to represent the buffer. + */ +int +hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio) +{ + struct buf *bp = bio->bio_buf; + int error; /* * Delete any records overlapping our range. This function will * (eventually) properly truncate partial overlaps. 
*/ if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) { - error = hammer_ip_delete_range(&trans, ip, bio->bio_offset, + error = hammer_ip_delete_range(trans, ip, bio->bio_offset, bio->bio_offset); } else { - error = hammer_ip_delete_range(&trans, ip, bio->bio_offset, + error = hammer_ip_delete_range(trans, ip, bio->bio_offset, bio->bio_offset + bp->b_bufsize - 1); } @@ -1827,22 +1860,18 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) KKASSERT(limit_size >= 0); limit_size = (limit_size + 63) & ~63; } - error = hammer_ip_sync_data(&trans, ip, bio->bio_offset, + error = hammer_ip_sync_data(trans, ip, bio->bio_offset, bp->b_data, limit_size); } - /* - * If an error occured abort the transaction - */ if (error) { - /* XXX undo deletion */ - hammer_abort_transaction(&trans); bp->b_resid = bp->b_bufsize; + bp->b_error = error; + bp->b_flags |= B_ERROR; } else { - hammer_commit_transaction(&trans); bp->b_resid = 0; - bp->b_tid = 0; } + biodone(bio); return(error); } -- 2.41.0