From 6362a262e633b546c510362dad6fd7adfea9f32e Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 28 Jan 2010 09:04:34 -0800 Subject: [PATCH] kernel - Even more buffer cache / VM coherency work * nvtruncbuf/nvextendbuf now clear the cached layer 2 disk offset from the buffer cache buffer being zero-extended or zero-truncated. This is required by HAMMER since HAMMER never overwrites data in the same media block. * Convert HAMMER over to the new nvtruncbuf/nvextendbuf API. The new API automatically handles zero-truncations and zero-extensions within the buffer straddling the file EOF and also changes the way backing VM pages are handled. Instead of cutting the VM pages off at the nearest boundary past file EOF any pages in the straddling buffer are left fully valid and intact, which avoids numerous pitfalls the old API had in dealing with VM page valid/dirty bits during file truncations and extensions. * Make sure the PG_ZERO flag in the VM page is cleared in allocbuf(). * Refactor HAMMER's strategy code to close two small windows of opportunity where stale data might be read from the media. In particular, refactor hammer_ip_*_bulk(), hammer_frontend_trunc*(), and hammer_io_direct_write(). These were detected by the fsx test program on a heavily paging system with physical memory set artificially low. Data flows through three stages in HAMMER: (1) Buffer cache. (2) In-memory records referencing the direct-write data offset on the media until the actual B-Tree is updated on-media at a later time. (3) Media B-Tree lookups referencing the committed data offset on the media. HAMMER must perform a careful, fragile dance to ensure that access to the data from userland doesn't slip through any cracks while the data is transitioning between stages. Two cracks were found and fixed: (A) The direct-write code was allowing the BUF/BIO in the strategy call to complete before adding the in-memory record to the index for the stage 1->2 transition. Now fixed. (B) The HAMMER truncation code was skipping in-memory records queued to the backend flusher under the assumption that the backend flusher would deal with them, which it will eventually, but there was a small window where the data was still accessible by userland after the truncation if userland did a truncation followed by an extension. Now fixed. --- sys/kern/vfs_bio.c | 1 + sys/kern/vfs_vm.c | 10 +++ sys/vfs/hammer/hammer.h | 6 +- sys/vfs/hammer/hammer_inode.c | 9 +-- sys/vfs/hammer/hammer_io.c | 25 +++---- sys/vfs/hammer/hammer_object.c | 119 ++++++++++++++++++++------------- sys/vfs/hammer/hammer_subs.c | 9 +++ sys/vfs/hammer/hammer_vnops.c | 87 ++++++++++++------------ 8 files changed, 156 insertions(+), 110 deletions(-) diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 07679e8a43..6a4cfba638 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -3163,6 +3163,7 @@ allocbuf(struct buf *bp, int size) if (m) { vm_page_wire(m); vm_page_wakeup(m); + vm_page_flag_clear(m, PG_ZERO); bp->b_flags &= ~B_CACHE; bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; ++bp->b_xio.xio_npages; diff --git a/sys/kern/vfs_vm.c b/sys/kern/vfs_vm.c index 02e9f8d51e..36c3955ee2 100644 --- a/sys/kern/vfs_vm.c +++ b/sys/kern/vfs_vm.c @@ -116,6 +116,10 @@ static int nvtruncbuf_bp_metasync(struct buf *bp, void *data); * any remainder when writing to the media in the strategy function when * it is able to do so without the page being mapped. The page may still * be mapped by userland here. + * + * When modifying a buffer we must clear any cached raw disk offset. 
+ * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER, + * never overwrite existing data blocks. */ int nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff) @@ -174,6 +178,7 @@ nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff) if (bp->b_dirtyend > boff) bp->b_dirtyend = boff; } + bp->b_bio2.bio_offset = NOOFFSET; bdwrite(bp); } } else { @@ -329,6 +334,10 @@ nvtruncbuf_bp_metasync(struct buf *bp, void *data) * any remainder when writing to the media in the strategy function when * it is able to do so without the page being mapped. The page may still * be mapped by userland here. + * + * When modifying a buffer we must clear any cached raw disk offset. + * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER, + * never overwrite existing data blocks. */ int nvextendbuf(struct vnode *vp, off_t olength, off_t nlength, @@ -349,6 +358,7 @@ nvextendbuf(struct vnode *vp, off_t olength, off_t nlength, error = bread(vp, truncboffset, oblksize, &bp); if (error == 0) { bzero(bp->b_data + oboff, oblksize - oboff); + bp->b_bio2.bio_offset = NOOFFSET; bdwrite(bp); } } diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index b3cae002a0..99a41d4524 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -1236,6 +1236,7 @@ int hammer_ip_add_directory(struct hammer_transaction *trans, int hammer_ip_del_directory(struct hammer_transaction *trans, hammer_cursor_t cursor, hammer_inode_t dip, hammer_inode_t ip); +void hammer_ip_replace_bulk(hammer_mount_t hmp, hammer_record_t record); hammer_record_t hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, int *errorp); int hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size); @@ -1274,8 +1275,8 @@ void hammer_io_waitdep(struct hammer_io *io); void hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush); int hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio, hammer_btree_leaf_elm_t leaf); -int hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, - struct bio *bio); +int hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, + hammer_record_t record); void hammer_io_direct_wait(hammer_record_t record); void hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf); void hammer_io_write_interlock(hammer_io_t io); @@ -1354,6 +1355,7 @@ udev_t hammer_fsid_to_udev(uuid_t *uuid); int hammer_blocksize(int64_t file_offset); +int hammer_blockoff(int64_t file_offset); int64_t hammer_blockdemarc(int64_t file_offset1, int64_t file_offset2); /* diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index ed706b3a29..0904eda1e5 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -3045,13 +3045,10 @@ hammer_inode_unloadable_check(hammer_inode_t ip, int getvp) /* * Final cleanup */ - if (ip->vp) { - vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE); - vnode_pager_setsize(ip->vp, 0); - } - if (getvp) { + if (ip->vp) + nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0); + if (getvp) vput(vp); - } } } diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 40f6ef7bd6..d679e645f8 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -1242,8 +1242,8 @@ hammer_io_direct_read_complete(struct bio *nbio) * is set. The recorded is added to its object. 
*/ int -hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, - struct bio *bio) +hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, + hammer_record_t record) { hammer_btree_leaf_elm_t leaf = &record->leaf; hammer_off_t buf_offset; @@ -1261,6 +1261,11 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, KKASSERT(buf_offset > HAMMER_ZONE_BTREE); KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE); + /* + * Issue or execute the I/O. The new memory record must replace + * the old one before the I/O completes, otherwise a reaquisition of + * the buffer will load the old media data instead of the new. + */ if ((buf_offset & HAMMER_BUFMASK) == 0 && leaf->data_len >= HAMMER_BUFSIZE) { /* @@ -1305,6 +1310,7 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, nbio->bio_offset = volume->ondisk->vol_buf_beg + zone2_offset; hammer_stats_disk_write += bp->b_bufsize; + hammer_ip_replace_bulk(hmp, record); vn_strategy(volume->devvp, nbio); hammer_io_flush_mark(volume); } @@ -1326,20 +1332,15 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, hammer_io_modify_done(&buffer->io); hammer_rel_buffer(buffer, 0); bp->b_resid = 0; + hammer_ip_replace_bulk(hmp, record); biodone(bio); } } - if (error == 0) { - /* - * The record is all setup now, add it. Potential conflics - * have already been dealt with. - */ - error = hammer_mem_add(record); - KKASSERT(error == 0); - } else { + if (error) { /* - * Major suckage occured. Also note: The record was never added - * to the tree so we do not have to worry about the backend. + * Major suckage occured. Also note: The record was + * never added to the tree so we do not have to worry + * about the backend. */ kprintf("hammer_direct_write: failed @ %016llx\n", (long long)leaf->data_offset); diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index b947084766..6f578db67a 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -54,7 +54,7 @@ struct rec_trunc_info { struct hammer_bulk_info { hammer_record_t record; - struct hammer_btree_leaf_elm leaf; + hammer_record_t conflict; }; /* @@ -140,7 +140,7 @@ static int hammer_rec_overlap_cmp(hammer_record_t rec, void *data) { struct hammer_bulk_info *info = data; - hammer_btree_leaf_elm_t leaf = &info->leaf; + hammer_btree_leaf_elm_t leaf = &info->record->leaf; if (rec->leaf.base.rec_type < leaf->base.rec_type) return(-3); @@ -893,33 +893,23 @@ hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) } /* - * Locate a bulk record in-memory. Bulk records allow disk space to be - * reserved so the front-end can flush large data writes without having - * to queue the BIO to the flusher. Only the related record gets queued - * to the flusher. + * Locate a pre-existing bulk record in memory. The caller wishes to + * replace the record with a new one. The existing record may have a + * different length (and thus a different key) so we have to use an + * overlap check function. 
*/ - static hammer_record_t -hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes) +hammer_ip_get_bulk(hammer_record_t record) { struct hammer_bulk_info info; - - bzero(&info, sizeof(info)); - info.leaf.base.obj_id = ip->obj_id; - info.leaf.base.key = file_offset + bytes; - info.leaf.base.create_tid = 0; - info.leaf.base.delete_tid = 0; - info.leaf.base.rec_type = HAMMER_RECTYPE_DATA; - info.leaf.base.obj_type = 0; /* unused */ - info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; /* unused */ - info.leaf.base.localization = ip->obj_localization + /* unused */ - HAMMER_LOCALIZE_MISC; - info.leaf.data_len = bytes; + hammer_inode_t ip = record->ip; + info.record = record; + info.conflict = NULL; hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp, hammer_bulk_scan_callback, &info); - return(info.record); /* may be NULL */ + return(info.conflict); /* may be NULL */ } /* @@ -936,7 +926,7 @@ hammer_bulk_scan_callback(hammer_record_t record, void *data) return(0); } hammer_ref(&record->lock); - info->record = record; + info->conflict = record; return(-1); /* stop scan */ } @@ -948,39 +938,21 @@ hammer_bulk_scan_callback(hammer_record_t record, void *data) * cache buffers and we should be able to manipulate any overlapping * in-memory records. * - * The caller is responsible for adding the returned record. + * The caller is responsible for adding the returned record and deleting + * the returned conflicting record (if any), typically by calling + * hammer_ip_replace_bulk() (via hammer_io_direct_write()). */ hammer_record_t hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, int *errorp) { hammer_record_t record; - hammer_record_t conflict; int zone; /* - * Deal with conflicting in-memory records. We cannot have multiple - * in-memory records for the same base offset without seriously - * confusing the backend, including but not limited to the backend - * issuing delete-create-delete or create-delete-create sequences - * and asserting on the delete_tid being the same as the create_tid. - * - * If we encounter a record with the backend interlock set we cannot - * immediately delete it without confusing the backend. - */ - while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) { - if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) { - conflict->flags |= HAMMER_RECF_WANTED; - tsleep(conflict, 0, "hmrrc3", 0); - } else { - conflict->flags |= HAMMER_RECF_DELETED_FE; - } - hammer_rel_mem_record(conflict); - } - - /* - * Create a record to cover the direct write. This is called with - * the related BIO locked so there should be no possible conflict. + * Create a record to cover the direct write. The record cannot + * be added to the in-memory RB tree here as it might conflict + * with an existing memory record. See hammer_io_direct_write(). * * The backend is responsible for finalizing the space reserved in * this record. @@ -1009,15 +981,53 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, record->leaf.data_len = bytes; hammer_crc_set_leaf(data, &record->leaf); KKASSERT(*errorp == 0); + return(record); } +/* + * Called by hammer_io_direct_write() prior to any possible completion + * of the BIO to emplace the memory record associated with the I/O and + * to replace any prior memory record which might still be active. + * + * Setting the FE deleted flag on the old record (if any) avoids any RB + * tree insertion conflict, amoung other things. 
+ * + * This has to be done prior to the caller completing any related buffer + * cache I/O or a reinstantiation of the buffer may load data from the + * old media location instead of the new media location. The holding + * of the locked buffer cache buffer serves to interlock the record + * replacement operation. + */ +void +hammer_ip_replace_bulk(hammer_mount_t hmp, hammer_record_t record) +{ + hammer_record_t conflict; + int error; + + while ((conflict = hammer_ip_get_bulk(record)) != NULL) { + if ((conflict->flags & HAMMER_RECF_INTERLOCK_BE) == 0) { + conflict->flags |= HAMMER_RECF_DELETED_FE; + break; + } + conflict->flags |= HAMMER_RECF_WANTED; + tsleep(conflict, 0, "hmrrc3", 0); + hammer_rel_mem_record(conflict); + } + error = hammer_mem_add(record); + if (conflict) + hammer_rel_mem_record(conflict); + KKASSERT(error == 0); +} + /* * Frontend truncation code. Scan in-memory records only. On-disk records * and records in a flushing state are handled by the backend. The vnops * setattr code will handle the block containing the truncation point. * * Partial blocks are not deleted. + * + * This code is only called on regular files. */ int hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) @@ -1040,15 +1050,30 @@ hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) return(0); } +/* + * Scan callback for frontend records to destroy during a truncation. + * We must ensure that DELETED_FE is set on the record or the frontend + * will get confused in future read() calls. + * + * NOTE: DELETED_FE cannot be set while the record interlock (BE) is held. + * In this rare case we must wait for the interlock to be cleared. + * + * NOTE: This function is only called on regular files. There are further + * restrictions to the setting of DELETED_FE on directory records + * undergoing a flush due to sensitive inode link count calculations. + */ static int hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused) { if (record->flags & HAMMER_RECF_DELETED_FE) return(0); +#if 0 if (record->flush_state == HAMMER_FST_FLUSH) return(0); - KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); +#endif hammer_ref(&record->lock); + while (record->flags & HAMMER_RECF_INTERLOCK_BE) + hammer_wait_mem_record_ident(record, "hmmtrr"); record->flags |= HAMMER_RECF_DELETED_FE; hammer_rel_mem_record(record); return(0); diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 83d844021f..267159a8fa 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -773,6 +773,15 @@ hammer_blocksize(int64_t file_offset) return(HAMMER_XBUFSIZE); } +int +hammer_blockoff(int64_t file_offset) +{ + if (file_offset < HAMMER_XDEMARC) + return((int)file_offset & HAMMER_BUFMASK); + else + return((int)file_offset & HAMMER_XBUFMASK); +} + /* * Return the demarkation point between the two offsets where * the block size changes. 
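
To make the stage 1 -> 2 ordering (crack A in the commit message) concrete, here is a minimal user-space sketch. It is not HAMMER code; media_block, btree_offset, mem_record_offset, file_read() and the rest are invented stand-ins. It only models the visibility rule: because HAMMER never overwrites a committed block, the new data lives at a new media offset that only the in-memory record knows about, so a read that runs after the write's BIO completes but before the record is indexed still follows the committed offset and sees stale data.

/*
 * Toy model of the stage 1 -> 2 transition.  All names are invented
 * for illustration; this is not kernel code.
 */
#include <stdio.h>
#include <string.h>

static char media_block[2][8] = { "OLD", "" };  /* simulated media      */
static int  btree_offset = 0;          /* stage 3: committed offset     */
static int  mem_record_offset = -1;    /* stage 2: -1 = no record yet   */

/* Read path: an in-memory record overrides the on-media B-Tree.       */
static const char *file_read(void)
{
    int off = (mem_record_offset >= 0) ? mem_record_offset : btree_offset;
    return media_block[off];
}

/* The two halves of a direct write, split so the ordering is visible. */
static void direct_write_io(const char *data, int off)
{
    strcpy(media_block[off], data);    /* new data, NEW media offset    */
}
static void emplace_mem_record(int off)
{
    mem_record_offset = off;           /* stage 1 -> 2 transition       */
}

int main(void)
{
    /* Wrong order: the BIO completes (readable window) before the
     * record is indexed, so a read in the window returns stale data.  */
    direct_write_io("NEW", 1);
    printf("window after I/O, before record: read() = %s (stale!)\n",
           file_read());
    emplace_mem_record(1);

    /* Right order: index the record first, then let the BIO complete. */
    mem_record_offset = -1;            /* reset the toy state           */
    media_block[1][0] = '\0';
    emplace_mem_record(1);
    direct_write_io("NEW", 1);
    printf("record emplaced before I/O completes: read() = %s\n",
           file_read());
    return 0;
}

In the real code the window is interlocked by the still-locked buffer cache buffer: hammer_io_direct_write() now calls hammer_ip_replace_bulk() before vn_strategy()/biodone(), so by the time the buffer can be reacquired the new record is already indexed in the inode's record tree.
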
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index c7582eb36e..c12fc97692 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -533,6 +533,8 @@ hammer_vop_write(struct vop_write_args *ap) int fixsize = 0; int blksize; int blkmask; + int trivial; + off_t nsize; if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) break; @@ -626,8 +628,20 @@ hammer_vop_write(struct vop_write_args *ap) n = blksize - offset; if (n > uio->uio_resid) n = uio->uio_resid; - if (uio->uio_offset + n > ip->ino_data.size) { - vnode_pager_setsize(ap->a_vp, uio->uio_offset + n); + nsize = uio->uio_offset + n; + if (nsize > ip->ino_data.size) { + if (uio->uio_offset > ip->ino_data.size) + trivial = 0; + else + trivial = 1; + nvextendbuf(ap->a_vp, + ip->ino_data.size, + nsize, + hammer_blocksize(ip->ino_data.size), + hammer_blocksize(nsize), + hammer_blockoff(ip->ino_data.size), + hammer_blockoff(nsize), + trivial); fixsize = 1; kflags |= NOTE_EXTEND; } @@ -711,8 +725,9 @@ hammer_vop_write(struct vop_write_args *ap) if (error) { brelse(bp); if (fixsize) { - vtruncbuf(ap->a_vp, ip->ino_data.size, - hammer_blocksize(ip->ino_data.size)); + nvtruncbuf(ap->a_vp, ip->ino_data.size, + hammer_blocksize(ip->ino_data.size), + hammer_blockoff(ip->ino_data.size)); } break; } @@ -722,7 +737,6 @@ hammer_vop_write(struct vop_write_args *ap) if (ip->ino_data.size < uio->uio_offset) { ip->ino_data.size = uio->uio_offset; flags = HAMMER_INODE_SDIRTY; - vnode_pager_setsize(ap->a_vp, ip->ino_data.size); } else { flags = 0; } @@ -2037,7 +2051,9 @@ hammer_vop_setattr(struct vop_setattr_args *ap) int truncating; int blksize; int kflags; +#if 0 int64_t aligned_size; +#endif u_int32_t flags; vap = ap->a_vap; @@ -2132,11 +2148,20 @@ hammer_vop_setattr(struct vop_setattr_args *ap) * big deal here. */ if (vap->va_size < ip->ino_data.size) { - vtruncbuf(ap->a_vp, vap->va_size, blksize); + nvtruncbuf(ap->a_vp, vap->va_size, + blksize, + hammer_blockoff(vap->va_size)); truncating = 1; kflags |= NOTE_WRITE; } else { - vnode_pager_setsize(ap->a_vp, vap->va_size); + nvextendbuf(ap->a_vp, + ip->ino_data.size, + vap->va_size, + hammer_blocksize(ip->ino_data.size), + hammer_blocksize(vap->va_size), + hammer_blockoff(ip->ino_data.size), + hammer_blockoff(vap->va_size), + 0); truncating = 0; kflags |= NOTE_WRITE | NOTE_EXTEND; } @@ -2146,8 +2171,9 @@ hammer_vop_setattr(struct vop_setattr_args *ap) modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; /* - * on-media truncation is cached in the inode until - * the inode is synchronized. + * On-media truncation is cached in the inode until + * the inode is synchronized. We must immediately + * handle any frontend records. */ if (truncating) { hammer_ip_frontend_trunc(ip, vap->va_size); @@ -2179,45 +2205,20 @@ hammer_vop_setattr(struct vop_setattr_args *ap) } } +#if 0 /* - * If truncating we have to clean out a portion of - * the last block on-disk. We do this in the - * front-end buffer cache. - * - * NOTE: Calling bdwrite() (or bwrite/bawrite) on - * the buffer will clean its pages. This - * is necessary to set the valid bits on - * pages which vtruncbuf() may have cleared - * via vnode_pager_setsize(). - * - * If we don't do this the bp can be left - * with invalid pages and B_CACHE set, - * creating a situation where getpages can - * fail. + * When truncating, nvtruncbuf() may have cleaned out + * a portion of the last block on-disk in the buffer + * cache. We must clean out any frontend records + * for blocks beyond the new last block. 
*/ aligned_size = (vap->va_size + (blksize - 1)) & ~(int64_t)(blksize - 1); if (truncating && vap->va_size < aligned_size) { - struct buf *bp; - int offset; - aligned_size -= blksize; - - offset = (int)vap->va_size & (blksize - 1); - error = bread(ap->a_vp, aligned_size, - blksize, &bp); hammer_ip_frontend_trunc(ip, aligned_size); - if (error == 0) { - bzero(bp->b_data + offset, - blksize - offset); - /* must de-cache direct-io offset */ - bp->b_bio2.bio_offset = NOOFFSET; - bdwrite(bp); - } else { - kprintf("ERROR %d\n", error); - brelse(bp); - } } +#endif break; case VDATABASE: if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { @@ -2657,8 +2658,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) * buffers and frontend-owned in-memory records synchronously. */ if (ip->flags & HAMMER_INODE_TRUNCATED) { - if (hammer_cursor_ondisk(&cursor) || - cursor.iprec->flush_state == HAMMER_FST_FLUSH) { + if (hammer_cursor_ondisk(&cursor)/* || + cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { if (ip->trunc_off <= rec_offset) n = 0; else if (ip->trunc_off < rec_offset + n) @@ -3082,7 +3083,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) record->flags |= HAMMER_RECF_REDO; bp->b_flags &= ~B_VFSFLAG1; } - hammer_io_direct_write(hmp, record, bio); + hammer_io_direct_write(hmp, bio, record); if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) hammer_flush_inode(ip, 0); } else { -- 2.41.0
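
The first bullet of the commit message (clearing the cached layer-2 disk offset in nvtruncbuf()/nvextendbuf()) can also be illustrated with a small user-space sketch. Again this is not kernel code; bmap(), NOOFF, struct buf and the toy allocator are invented for illustration. A buffer cache buffer remembers its logical-to-media translation in b_bio2.bio_offset so repeated writes can skip BMAP; on a filesystem that never overwrites committed blocks, a re-dirtied (zero-truncated or zero-extended) buffer must get a fresh translation, otherwise the delayed write lands on the block the previous version of the data already occupies.

/*
 * Toy model of the cached layer-2 offset problem.  All names are
 * invented; this is not kernel code.
 */
#include <stdio.h>

#define NOOFF (-1LL)

static long long next_free_block = 100;     /* toy block allocator      */

struct buf {
    long long bio2_offset;                  /* cached media translation */
};

/* Toy BMAP: honour a cached translation, else reserve a new block.    */
static long long bmap(struct buf *bp)
{
    if (bp->bio2_offset != NOOFF)
        return bp->bio2_offset;
    bp->bio2_offset = next_free_block++;
    return bp->bio2_offset;
}

int main(void)
{
    struct buf bp = { .bio2_offset = NOOFF };

    printf("committed data written to block %lld\n", bmap(&bp));

    /* The EOF buffer is zero-truncated and re-dirtied (bdwrite()).     */

    /* Without de-caching, the rewrite would hit the committed block,
     * overwriting data a snapshot or the B-Tree may still reference.   */
    printf("stale translation would rewrite block %lld\n", bmap(&bp));

    /* With the fix: clear the cached offset so BMAP reserves anew.     */
    bp.bio2_offset = NOOFF;
    printf("after clearing, rewrite goes to block %lld\n", bmap(&bp));
    return 0;
}

This is also why the hand-rolled bread()/bzero()/bdwrite() sequence, with its "must de-cache direct-io offset" comment, could be removed from hammer_vop_setattr() above: nvtruncbuf() now performs the equivalent zeroing and clears b_bio2.bio_offset itself.
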