X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/394f8c5ca4d1524adc80cef95994527807e78563..507df98a152612f739140d9f1ac5b30cd022eea2:/sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index 3e4625b6e1..8ddb6a218d 100644
--- a/sys/vfs/hammer/hammer_vnops.c
+++ b/sys/vfs/hammer/hammer_vnops.c
@@ -47,6 +47,9 @@
 #include
 #include
 #include
+
+#include
+
 #include "hammer.h"
 
 /*
@@ -160,7 +163,7 @@ void
 hammer_knote(struct vnode *vp, int flags)
 {
 	if (flags)
-		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
+		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
 }
 
 #ifdef DEBUG_TRUNCATE
@@ -203,37 +206,89 @@ int
 hammer_vop_fsync(struct vop_fsync_args *ap)
 {
 	hammer_inode_t ip = VTOI(ap->a_vp);
+	hammer_mount_t hmp = ip->hmp;
 	int waitfor = ap->a_waitfor;
+	int mode;
+
+	lwkt_gettoken(&hmp->fs_token);
 
 	/*
-	 * Fsync rule relaxation (default disabled)
+	 * Fsync rule relaxation (default is either full synchronous flush
+	 * or REDO semantics with synchronous flush).
 	 */
 	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
 		switch(hammer_fsync_mode) {
 		case 0:
-			/* full semantics */
-			break;
+mode0:
+			/* no REDO, full synchronous flush */
+			goto skip;
 		case 1:
-			/* asynchronous */
+mode1:
+			/* no REDO, full asynchronous flush */
 			if (waitfor == MNT_WAIT)
 				waitfor = MNT_NOWAIT;
-			break;
+			goto skip;
 		case 2:
-			/* synchronous fsync on close */
-			ip->flags |= HAMMER_INODE_CLOSESYNC;
-			return(0);
+			/* REDO semantics, synchronous flush */
+			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+				goto mode0;
+			mode = HAMMER_FLUSH_UNDOS_AUTO;
+			break;
 		case 3:
-			/* asynchronous fsync on close */
-			ip->flags |= HAMMER_INODE_CLOSEASYNC;
+			/* REDO semantics, relaxed asynchronous flush */
+			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+				goto mode1;
+			mode = HAMMER_FLUSH_UNDOS_RELAXED;
+			if (waitfor == MNT_WAIT)
+				waitfor = MNT_NOWAIT;
+			break;
+		case 4:
+			/* ignore the fsync() system call */
+			lwkt_reltoken(&hmp->fs_token);
 			return(0);
 		default:
-			/* ignore the fsync() system call */
+			/* we have to do something */
+			mode = HAMMER_FLUSH_UNDOS_RELAXED;
+			if (waitfor == MNT_WAIT)
+				waitfor = MNT_NOWAIT;
+			break;
+		}
+
+		/*
+		 * Fast fsync only needs to flush the UNDO/REDO fifo if
+		 * HAMMER_INODE_REDO is non-zero and the only modifications
+		 * made to the file are write or write-extends.
+		 */
+		if ((ip->flags & HAMMER_INODE_REDO) &&
+		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
+		) {
+			++hammer_count_fsyncs;
+			hammer_flusher_flush_undos(hmp, mode);
+			ip->redo_count = 0;
+			lwkt_reltoken(&hmp->fs_token);
 			return(0);
 		}
+
+		/*
+		 * REDO is enabled by fsync(), the idea being we really only
+		 * want to lay down REDO records when programs are using
+		 * fsync() heavily.  The first fsync() on the file starts
+		 * the gravy train going and later fsync()s keep it hot by
+		 * resetting the redo_count.
+		 *
+		 * We weren't running REDOs before now so we have to fall
+		 * through and do a full fsync of what we have.
+		 */
+		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
+			ip->flags |= HAMMER_INODE_REDO;
+			ip->redo_count = 0;
+		}
 	}
+skip:
 
 	/*
-	 * Go do it
+	 * Do a full flush sequence.
 	 */
 	++hammer_count_fsyncs;
 	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
@@ -243,13 +298,14 @@ hammer_vop_fsync(struct vop_fsync_args *ap)
 		hammer_wait_inode(ip);
 		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 	}
+	lwkt_reltoken(&hmp->fs_token);
 	return (ip->error);
 }
 
 /*
  * hammer_vop_read { vp, uio, ioflag, cred }
  *
- * MPALMOSTSAFE
+ * MPSAFE (for the cache safe does not require fs_token)
 */
 static
 int
@@ -257,6 +313,7 @@ hammer_vop_read(struct vop_read_args *ap)
 {
 	struct hammer_transaction trans;
 	hammer_inode_t ip;
+	hammer_mount_t hmp;
 	off_t offset;
 	struct buf *bp;
 	struct uio *uio;
@@ -265,12 +322,13 @@ hammer_vop_read(struct vop_read_args *ap)
 	int seqcount;
 	int ioseqcount;
 	int blksize;
-	int got_mplock;
 	int bigread;
+	int got_fstoken;
 
 	if (ap->a_vp->v_type != VREG)
 		return (EINVAL);
 	ip = VTOI(ap->a_vp);
+	hmp = ip->hmp;
 	error = 0;
 	uio = ap->a_uio;
 
@@ -278,32 +336,18 @@
 	 * Allow the UIO's size to override the sequential heuristic.
 	 */
 	blksize = hammer_blocksize(uio->uio_offset);
-	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
-	ioseqcount = ap->a_ioflag >> 16;
+	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
+	ioseqcount = (ap->a_ioflag >> 16);
 	if (seqcount < ioseqcount)
 		seqcount = ioseqcount;
 
-	/*
-	 * Temporary hack until more of HAMMER can be made MPSAFE.
-	 */
-#ifdef SMP
-	if (curthread->td_mpcount) {
-		got_mplock = -1;
-		hammer_start_transaction(&trans, ip->hmp);
-	} else {
-		got_mplock = 0;
-	}
-#else
-	hammer_start_transaction(&trans, ip->hmp);
-	got_mplock = -1;
-#endif
-
 	/*
 	 * If reading or writing a huge amount of data we have to break
 	 * atomicy and allow the operation to be interrupted by a signal
 	 * or it can DOS the machine.
 	 */
 	bigread = (uio->uio_resid > 100 * 1024 * 1024);
+	got_fstoken = 0;
 
 	/*
 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -337,9 +381,9 @@
 		/*
		 * MPUNSAFE
		 */
-		if (got_mplock == 0) {
-			got_mplock = 1;
-			get_mplock();
+		if (got_fstoken == 0) {
+			lwkt_gettoken(&hmp->fs_token);
+			got_fstoken = 1;
 			hammer_start_transaction(&trans, ip->hmp);
 		}
 
@@ -356,17 +400,23 @@
 			}
 			error = cluster_read(ap->a_vp, file_limit, base_offset,
-					     blksize, MAXPHYS,
-					     seqcount, &bp);
+					     blksize, uio->uio_resid,
+					     seqcount * BKVASIZE, &bp);
 		} else {
 			error = bread(ap->a_vp, base_offset, blksize, &bp);
 		}
 		if (error) {
-			kprintf("error %d\n", error);
 			brelse(bp);
 			break;
 		}
 skip:
+		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
+			kprintf("doff %016jx read file %016jx@%016jx\n",
+				(intmax_t)bp->b_bio2.bio_offset,
+				(intmax_t)ip->obj_id,
+				(intmax_t)bp->b_loffset);
+		}
+		bp->b_flags &= ~B_IODEBUG;
 
 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
 		n = blksize - offset;
@@ -388,15 +438,14 @@ skip:
 	 * XXX only update the atime if we had to get the MP lock.
 	 * XXX hack hack hack, fixme.
 	 */
-	if (got_mplock) {
+	if (got_fstoken) {
 		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
 		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
 			ip->ino_data.atime = trans.time;
-			hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
 		}
 		hammer_done_transaction(&trans);
-		if (got_mplock > 0)
-			rel_mplock();
+		lwkt_reltoken(&hmp->fs_token);
 	}
 	return (error);
 }
@@ -436,6 +485,7 @@ hammer_vop_write(struct vop_write_args *ap)
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
+	lwkt_gettoken(&hmp->fs_token);
 	hammer_start_transaction(&trans, hmp);
 	uio = ap->a_uio;
 
@@ -453,11 +503,13 @@ hammer_vop_write(struct vop_write_args *ap)
 	 */
 	if (uio->uio_offset < 0) {
 		hammer_done_transaction(&trans);
+		lwkt_reltoken(&hmp->fs_token);
 		return (EFBIG);
 	}
 	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
 	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
 		hammer_done_transaction(&trans);
+		lwkt_reltoken(&hmp->fs_token);
 		return (EFBIG);
 	}
 
@@ -465,8 +517,15 @@
 	 * If reading or writing a huge amount of data we have to break
 	 * atomicy and allow the operation to be interrupted by a signal
 	 * or it can DOS the machine.
+	 *
+	 * Preset redo_count so we stop generating REDOs earlier if the
+	 * limit is exceeded.
 	 */
 	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
+	if ((ip->flags & HAMMER_INODE_REDO) &&
+	    ip->redo_count < hammer_limit_redo) {
+		ip->redo_count += uio->uio_resid;
+	}
 
 	/*
 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -477,6 +536,9 @@
 		int fixsize = 0;
 		int blksize;
 		int blkmask;
+		int trivial;
+		int endofblk;
+		off_t nsize;
 
 		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
 			break;
@@ -568,10 +630,26 @@
 		offset = (int)uio->uio_offset & blkmask;
 		base_offset = uio->uio_offset & ~(int64_t)blkmask;
 		n = blksize - offset;
-		if (n > uio->uio_resid)
+		if (n > uio->uio_resid) {
 			n = uio->uio_resid;
-		if (uio->uio_offset + n > ip->ino_data.size) {
-			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
+			endofblk = 0;
+		} else {
+			endofblk = 1;
+		}
+		nsize = uio->uio_offset + n;
+		if (nsize > ip->ino_data.size) {
+			if (uio->uio_offset > ip->ino_data.size)
+				trivial = 0;
+			else
+				trivial = 1;
+			nvextendbuf(ap->a_vp,
+				    ip->ino_data.size,
+				    nsize,
+				    hammer_blocksize(ip->ino_data.size),
+				    hammer_blocksize(nsize),
+				    hammer_blockoff(ip->ino_data.size),
+				    hammer_blockoff(nsize),
+				    trivial);
 			fixsize = 1;
 			kflags |= NOTE_EXTEND;
 		}
@@ -617,9 +695,35 @@
 			if (error == 0)
 				bheavy(bp);
 		}
-		if (error == 0) {
-			error = uiomove((char *)bp->b_data + offset,
-					n, uio);
+		if (error == 0)
+			error = uiomove(bp->b_data + offset, n, uio);
+
+		/*
+		 * Generate REDO records if enabled and redo_count will not
+		 * exceed the limit.
+		 *
+		 * If redo_count exceeds the limit we stop generating records
+		 * and clear HAMMER_INODE_REDO.  This will cause the next
+		 * fsync() to do a full meta-data sync instead of just an
+		 * UNDO/REDO fifo update.
+		 *
+		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
+		 * will still be tracked.  The tracks will be terminated
+		 * when the related meta-data (including possible data
+		 * modifications which are not tracked via REDO) is
+		 * flushed.
+		 */
+		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
+			if (ip->redo_count < hammer_limit_redo) {
+				bp->b_flags |= B_VFSFLAG1;
+				error = hammer_generate_redo(&trans, ip,
+						     base_offset + offset,
+						     HAMMER_REDO_WRITE,
+						     bp->b_data + offset,
+						     (size_t)n);
+			} else {
+				ip->flags &= ~HAMMER_INODE_REDO;
+			}
 		}
 
 		/*
@@ -629,8 +733,9 @@
 		if (error) {
 			brelse(bp);
 			if (fixsize) {
-				vtruncbuf(ap->a_vp, ip->ino_data.size,
-					  hammer_blocksize(ip->ino_data.size));
+				nvtruncbuf(ap->a_vp, ip->ino_data.size,
+					   hammer_blocksize(ip->ino_data.size),
+					   hammer_blockoff(ip->ino_data.size));
 			}
 			break;
 		}
@@ -639,14 +744,13 @@
 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
 		if (ip->ino_data.size < uio->uio_offset) {
 			ip->ino_data.size = uio->uio_offset;
-			flags = HAMMER_INODE_DDIRTY;
-			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
+			flags = HAMMER_INODE_SDIRTY;
 		} else {
 			flags = 0;
 		}
 		ip->ino_data.mtime = trans.time;
 		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
-		hammer_modify_inode(ip, flags);
+		hammer_modify_inode(&trans, ip, flags);
 
 		/*
 		 * Once we dirty the buffer any cached zone-X offset
@@ -665,18 +769,25 @@
 		 *  buffer cache simply cannot keep up.
 		 *
 		 * WARNING!  blksize is variable.  cluster_write() is
-		 * expected to not blow up if it encounters buffers that
-		 * do not match the passed blksize.
+		 *	     expected to not blow up if it encounters
+		 *	     buffers that do not match the passed blksize.
 		 *
 		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
 		 *	  The ip->rsv_recs check should burst-flush the data.
 		 *	  If we queue it immediately the buf could be left
 		 *	  locked on the device queue for a very long time.
+		 *
+		 * NOTE!  To avoid degenerate stalls due to mismatched block
+		 *	  sizes we only honor IO_DIRECT on the write which
+		 *	  abuts the end of the buffer.  However, we must
+		 *	  honor IO_SYNC in case someone is silly enough to
+		 *	  configure a HAMMER file as swap, or when HAMMER
+		 *	  is serving NFS (for commits).  Ick ick.
 		 */
 		bp->b_flags |= B_AGE;
 		if (ap->a_ioflag & IO_SYNC) {
 			bwrite(bp);
-		} else if (ap->a_ioflag & IO_DIRECT) {
+		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
 			bawrite(bp);
 		} else {
 #if 0
@@ -695,11 +806,14 @@
 	}
 	hammer_done_transaction(&trans);
 	hammer_knote(ap->a_vp, kflags);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
 /*
  * hammer_vop_access { vp, mode, cred }
+ *
+ * MPSAFE - does not require fs_token
 */
 static
 int
@@ -721,6 +835,8 @@ hammer_vop_access(struct vop_access_args *ap)
 
 /*
  * hammer_vop_advlock { vp, id, op, fl, flags }
+ *
+ * MPSAFE - does not require fs_token
 */
 static
 int
@@ -734,16 +850,16 @@ hammer_vop_advlock(struct vop_advlock_args *ap)
 
 /*
  * hammer_vop_close { vp, fflag }
 *
- * We can only sync-on-close for normal closes.
+ * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
 static
 int
 hammer_vop_close(struct vop_close_args *ap)
 {
+#if 0
 	struct vnode *vp = ap->a_vp;
 	hammer_inode_t ip = VTOI(vp);
 	int waitfor;
-
 	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
 		if (vn_islocked(vp) == LK_EXCLUSIVE &&
 		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
@@ -756,6 +872,7 @@ hammer_vop_close(struct vop_close_args *ap)
 			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
 		}
 	}
+#endif
 	return (vop_stdclose(ap));
 }
 
@@ -773,20 +890,23 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
 	struct hammer_inode *dip;
 	struct hammer_inode *nip;
 	struct nchandle *nch;
+	hammer_mount_t hmp;
 	int error;
 
 	nch = ap->a_nch;
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (dip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -801,6 +921,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
 		hkprintf("hammer_create_inode error %d\n", error);
 		hammer_done_transaction(&trans);
 		*ap->a_vpp = NULL;
+		lwkt_reltoken(&hmp->fs_token);
 		return (error);
 	}
 
@@ -831,6 +952,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
 		}
 		hammer_knote(ap->a_dvp, NOTE_WRITE);
 	}
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -842,7 +964,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
  * The atime field is stored in the B-Tree element and allowed to be
  * updated without cycling the element.
 *
- * MPSAFE
+ * MPSAFE - does not require fs_token
 */
 static
 int
@@ -923,8 +1045,6 @@ hammer_vop_getattr(struct vop_getattr_args *ap)
 	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
 	vap->va_filerev = 0;	/* XXX */
-	/* mtime uniquely identifies any adjustments made to the file XXX */
-	vap->va_fsmid = ip->ino_data.mtime;
 	vap->va_uid_uuid = ip->ino_data.uid;
 	vap->va_gid_uuid = ip->ino_data.gid;
 	vap->va_fsid_uuid = ip->hmp->fsid;
@@ -955,6 +1075,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
 {
 	struct hammer_transaction trans;
 	struct namecache *ncp;
+	hammer_mount_t hmp;
 	hammer_inode_t dip;
 	hammer_inode_t ip;
 	hammer_tid_t asof;
@@ -982,8 +1103,10 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
 	nlen = ncp->nc_nlen;
 	flags = dip->flags & HAMMER_INODE_RO;
 	ispfs = 0;
+	hmp = dip->hmp;
 
-	hammer_simple_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_simple_transaction(&trans, hmp);
 	++hammer_stats_file_iopsr;
 
 	for (i = 0; i < nlen; ++i) {
@@ -1138,6 +1261,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
 	}
 done:
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1164,6 +1288,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 	struct hammer_transaction trans;
 	struct hammer_inode *dip;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 	int64_t parent_obj_id;
 	u_int32_t parent_obj_localization;
 	hammer_tid_t asof;
@@ -1171,11 +1296,13 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 
 	dip = VTOI(ap->a_dvp);
 	asof = dip->obj_asof;
+	hmp = dip->hmp;
 
 	/*
 	 * Whos are parent?  This could be the root of a pseudo-filesystem
 	 * whos parent is in another localization domain.
 	 */
+	lwkt_gettoken(&hmp->fs_token);
 	parent_obj_id = dip->ino_data.parent_obj_id;
 	if (dip->obj_id == HAMMER_OBJID_ROOT)
 		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
@@ -1184,19 +1311,20 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 
 	if (parent_obj_id == 0) {
 		if (dip->obj_id == HAMMER_OBJID_ROOT &&
-		   asof != dip->hmp->asof) {
+		   asof != hmp->asof) {
 			parent_obj_id = dip->obj_id;
-			asof = dip->hmp->asof;
+			asof = hmp->asof;
 			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
 			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
 				  (long long)dip->obj_asof);
 		} else {
 			*ap->a_vpp = NULL;
+			lwkt_reltoken(&hmp->fs_token);
 			return ENOENT;
 		}
 	}
 
-	hammer_simple_transaction(&trans, dip->hmp);
+	hammer_simple_transaction(&trans, hmp);
 	++hammer_stats_file_iopsr;
 
 	ip = hammer_get_inode(&trans, dip, parent_obj_id,
@@ -1209,6 +1337,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 		*ap->a_vpp = NULL;
 	}
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1223,6 +1352,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
 	struct hammer_inode *dip;
 	struct hammer_inode *ip;
 	struct nchandle *nch;
+	hammer_mount_t hmp;
 	int error;
 
 	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
@@ -1231,6 +1361,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
 	nch = ap->a_nch;
 	dip = VTOI(ap->a_dvp);
 	ip = VTOI(ap->a_vp);
+	hmp = dip->hmp;
 
 	if (dip->obj_localization != ip->obj_localization)
 		return(EXDEV);
@@ -1239,13 +1370,14 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
 		return (EROFS);
 	if (ip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -1267,6 +1399,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
 	hammer_done_transaction(&trans);
 	hammer_knote(ap->a_vp, NOTE_LINK);
 	hammer_knote(ap->a_dvp, NOTE_WRITE);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1284,20 +1417,23 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
 	struct hammer_inode *dip;
 	struct hammer_inode *nip;
 	struct nchandle *nch;
+	hammer_mount_t hmp;
 	int error;
 
 	nch = ap->a_nch;
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (dip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -1311,6 +1447,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
 		hkprintf("hammer_mkdir error %d\n", error);
 		hammer_done_transaction(&trans);
 		*ap->a_vpp = NULL;
+		lwkt_reltoken(&hmp->fs_token);
 		return (error);
 	}
 	/*
@@ -1340,6 +1477,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
 	hammer_done_transaction(&trans);
 	if (error == 0)
 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1357,20 +1495,23 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
 	struct hammer_inode *dip;
 	struct hammer_inode *nip;
 	struct nchandle *nch;
+	hammer_mount_t hmp;
 	int error;
 
 	nch = ap->a_nch;
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (dip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -1385,6 +1526,7 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
 	if (error) {
 		hammer_done_transaction(&trans);
 		*ap->a_vpp = NULL;
+		lwkt_reltoken(&hmp->fs_token);
 		return (error);
 	}
 
@@ -1413,11 +1555,14 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
 	hammer_done_transaction(&trans);
 	if (error == 0)
 		hammer_knote(ap->a_dvp, NOTE_WRITE);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
 /*
  * hammer_vop_open { vp, mode, cred, fp }
+ *
+ * MPSAFE (does not require fs_token)
 */
 static
 int
@@ -1453,6 +1598,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
 	struct hammer_transaction trans;
 	struct hammer_cursor cursor;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 	struct uio *uio;
 	hammer_base_elm_t base;
 	int error;
@@ -1467,6 +1613,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
 	ip = VTOI(ap->a_vp);
 	uio = ap->a_uio;
 	saveoff = uio->uio_offset;
+	hmp = ip->hmp;
 
 	if (ap->a_ncookies) {
 		ncookies = uio->uio_resid / 16 + 1;
@@ -1480,7 +1627,8 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
 		cookie_index = 0;
 	}
 
-	hammer_simple_transaction(&trans, ip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_simple_transaction(&trans, hmp);
 
 	/*
 	 * Handle artificial entries
@@ -1595,6 +1743,7 @@ done:
 			*ap->a_cookies = cookies;
 		}
 	}
+	lwkt_reltoken(&hmp->fs_token);
 	return(error);
 }
 
@@ -1608,12 +1757,16 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
 	struct hammer_transaction trans;
 	struct hammer_cursor cursor;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 	char buf[32];
 	u_int32_t localization;
 	hammer_pseudofs_inmem_t pfsm;
 	int error;
 
 	ip = VTOI(ap->a_vp);
+	hmp = ip->hmp;
+
+	lwkt_gettoken(&hmp->fs_token);
 
 	/*
 	 * Shortcut if the symlink data was stuffed into ino_data.
@@ -1632,7 +1785,7 @@
 		    ip->obj_asof == HAMMER_MAX_TID &&
 		    ip->obj_localization == 0 &&
 		    strncmp(ptr, "@@PFS", 5) == 0) {
-			hammer_simple_transaction(&trans, ip->hmp);
+			hammer_simple_transaction(&trans, hmp);
 			bcopy(ptr + 5, buf, 5);
 			buf[5] = 0;
 			localization = strtoul(buf, NULL, 10) << 16;
@@ -1662,17 +1815,18 @@
 				bytes = strlen(buf);
 			}
 			if (pfsm)
-				hammer_rel_pseudofs(trans.hmp, pfsm);
+				hammer_rel_pseudofs(hmp, pfsm);
 			hammer_done_transaction(&trans);
 		}
 		error = uiomove(ptr, bytes, ap->a_uio);
+		lwkt_reltoken(&hmp->fs_token);
 		return(error);
 	}
 
 	/*
 	 * Long version
 	 */
-	hammer_simple_transaction(&trans, ip->hmp);
+	hammer_simple_transaction(&trans, hmp);
 	++hammer_stats_file_iopsr;
 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 
@@ -1705,6 +1859,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
 	}
 	hammer_done_cursor(&cursor);
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return(error);
 }
 
@@ -1717,21 +1872,25 @@
 {
 	struct hammer_transaction trans;
 	struct hammer_inode *dip;
+	hammer_mount_t hmp;
 	int error;
 
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (hammer_nohistory(dip) == 0 &&
-	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 		return (error);
 	}
 
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
 	hammer_done_transaction(&trans);
 	if (error == 0)
 		hammer_knote(ap->a_dvp, NOTE_WRITE);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1748,6 +1907,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
 	struct hammer_inode *fdip;
 	struct hammer_inode *tdip;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 	struct hammer_cursor cursor;
 	int64_t namekey;
 	u_int32_t max_iterations;
@@ -1765,6 +1925,8 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
 	ip = VTOI(fncp->nc_vp);
 	KKASSERT(ip != NULL);
 
+	hmp = ip->hmp;
+
 	if (fdip->obj_localization != tdip->obj_localization)
 		return(EXDEV);
 	if (fdip->obj_localization != ip->obj_localization)
@@ -1776,10 +1938,11 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
 		return (EROFS);
 	if (ip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
-	hammer_start_transaction(&trans, fdip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -1798,7 +1961,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
 		if (error == 0) {
 			ip->ino_data.parent_obj_id = tdip->obj_id;
 			ip->ino_data.ctime = trans.time;
-			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
 		}
 	}
 	if (error)
@@ -1872,18 +2035,34 @@ retry:
 
 	/*
 	 * Cleanup and tell the kernel that the rename succeeded.
+	 *
+	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
+	 *	 without formally acquiring the vp since the vp might
+	 *	 have zero refs on it, or in the middle of a reclaim,
+	 *	 etc.
 	 */
 	hammer_done_cursor(&cursor);
 	if (error == 0) {
 		cache_rename(ap->a_fnch, ap->a_tnch);
 		hammer_knote(ap->a_fdvp, NOTE_WRITE);
 		hammer_knote(ap->a_tdvp, NOTE_WRITE);
-		if (ip->vp)
-			hammer_knote(ip->vp, NOTE_RENAME);
+		while (ip->vp) {
+			struct vnode *vp;
+
+			error = hammer_get_vnode(ip, &vp);
+			if (error == 0 && vp) {
+				vn_unlock(vp);
+				hammer_knote(ip->vp, NOTE_RENAME);
+				vrele(vp);
+				break;
+			}
+			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
+		}
 	}
 failed:
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1896,21 +2075,25 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
 {
 	struct hammer_transaction trans;
 	struct hammer_inode *dip;
+	hammer_mount_t hmp;
 	int error;
 
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (hammer_nohistory(dip) == 0 &&
-	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 		return (error);
 	}
 
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
 	hammer_done_transaction(&trans);
 	if (error == 0)
 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -1923,21 +2106,25 @@ hammer_vop_markatime(struct vop_markatime_args *ap)
 {
 	struct hammer_transaction trans;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 
 	ip = VTOI(ap->a_vp);
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 	if (ip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
+	hmp = ip->hmp;
+	if (hmp->mp->mnt_flag & MNT_NOATIME)
 		return (0);
-	hammer_start_transaction(&trans, ip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	ip->ino_data.atime = trans.time;
-	hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
 	hammer_done_transaction(&trans);
 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
+	lwkt_reltoken(&hmp->fs_token);
 	return (0);
 }
 
@@ -1949,31 +2136,36 @@ int
 hammer_vop_setattr(struct vop_setattr_args *ap)
 {
 	struct hammer_transaction trans;
-	struct vattr *vap;
 	struct hammer_inode *ip;
+	struct vattr *vap;
+	hammer_mount_t hmp;
 	int modflags;
 	int error;
 	int truncating;
 	int blksize;
 	int kflags;
+#if 0
 	int64_t aligned_size;
+#endif
 	u_int32_t flags;
 
 	vap = ap->a_vap;
 	ip = ap->a_vp->v_data;
 	modflags = 0;
 	kflags = 0;
+	hmp = ip->hmp;
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return(EROFS);
 	if (ip->flags & HAMMER_INODE_RO)
 		return (EROFS);
 	if (hammer_nohistory(ip) == 0 &&
-	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 		return (error);
 	}
 
-	hammer_start_transaction(&trans, ip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 	error = 0;
 
@@ -2033,28 +2225,58 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
 	case VREG:
 		if (vap->va_size == ip->ino_data.size)
 			break;
+
+		/*
+		 * Log the operation if in fast-fsync mode or if
+		 * there are unterminated redo write records present.
+		 *
+		 * The second check is needed so the recovery code
+		 * properly truncates write redos even if nominal
+		 * REDO operations is turned off due to excessive
+		 * writes, because the related records might be
+		 * destroyed and never lay down a TERM_WRITE.
+		 */
+		if ((ip->flags & HAMMER_INODE_REDO) ||
+		    (ip->flags & HAMMER_INODE_RDIRTY)) {
+			error = hammer_generate_redo(&trans, ip,
+						     vap->va_size,
+						     HAMMER_REDO_TRUNC,
+						     NULL, 0);
+		}
+		blksize = hammer_blocksize(vap->va_size);
+
 		/*
 		 * XXX break atomicy, we can deadlock the backend
 		 * if we do not release the lock.  Probably not a
 		 * big deal here.
 		 */
-		blksize = hammer_blocksize(vap->va_size);
 		if (vap->va_size < ip->ino_data.size) {
-			vtruncbuf(ap->a_vp, vap->va_size, blksize);
+			nvtruncbuf(ap->a_vp, vap->va_size,
+				   blksize,
+				   hammer_blockoff(vap->va_size));
 			truncating = 1;
 			kflags |= NOTE_WRITE;
 		} else {
-			vnode_pager_setsize(ap->a_vp, vap->va_size);
+			nvextendbuf(ap->a_vp,
+				    ip->ino_data.size,
+				    vap->va_size,
+				    hammer_blocksize(ip->ino_data.size),
+				    hammer_blocksize(vap->va_size),
+				    hammer_blockoff(ip->ino_data.size),
+				    hammer_blockoff(vap->va_size),
+				    0);
 			truncating = 0;
 			kflags |= NOTE_WRITE | NOTE_EXTEND;
 		}
 		ip->ino_data.size = vap->va_size;
 		ip->ino_data.mtime = trans.time;
+		/* XXX safe to use SDIRTY instead of DDIRTY here? */
 		modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
 
 		/*
-		 * on-media truncation is cached in the inode until
-		 * the inode is synchronized.
+		 * On-media truncation is cached in the inode until
+		 * the inode is synchronized.  We must immediately
+		 * handle any frontend records.
 		 */
 		if (truncating) {
 			hammer_ip_frontend_trunc(ip, vap->va_size);
@@ -2086,34 +2308,20 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
 			}
 		}
 
+#if 0
 		/*
-		 * If truncating we have to clean out a portion of
-		 * the last block on-disk.  We do this in the
-		 * front-end buffer cache.
+		 * When truncating, nvtruncbuf() may have cleaned out
+		 * a portion of the last block on-disk in the buffer
+		 * cache.  We must clean out any frontend records
+		 * for blocks beyond the new last block.
 		 */
 		aligned_size = (vap->va_size + (blksize - 1)) &
 			       ~(int64_t)(blksize - 1);
 		if (truncating && vap->va_size < aligned_size) {
-			struct buf *bp;
-			int offset;
-
 			aligned_size -= blksize;
-
-			offset = (int)vap->va_size & (blksize - 1);
-			error = bread(ap->a_vp, aligned_size,
-				      blksize, &bp);
 			hammer_ip_frontend_trunc(ip, aligned_size);
-			if (error == 0) {
-				bzero(bp->b_data + offset,
-				      blksize - offset);
-				/* must de-cache direct-io offset */
-				bp->b_bio2.bio_offset = NOOFFSET;
-				bdwrite(bp);
-			} else {
-				kprintf("ERROR %d\n", error);
-				brelse(bp);
-			}
 		}
+#endif
 		break;
 	case VDATABASE:
 		if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
@@ -2160,9 +2368,10 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
 	}
 done:
 	if (error == 0)
-		hammer_modify_inode(ip, modflags);
+		hammer_modify_inode(&trans, ip, modflags);
 	hammer_done_transaction(&trans);
 	hammer_knote(ap->a_vp, kflags);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -2176,8 +2385,9 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 	struct hammer_transaction trans;
 	struct hammer_inode *dip;
 	struct hammer_inode *nip;
-	struct nchandle *nch;
 	hammer_record_t record;
+	struct nchandle *nch;
+	hammer_mount_t hmp;
 	int error;
 	int bytes;
 
@@ -2185,16 +2395,18 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 
 	nch = ap->a_nch;
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (dip->flags & HAMMER_INODE_RO)
 		return (EROFS);
-	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 		return (error);
 
 	/*
 	 * Create a transaction to cover the operations we perform.
 	 */
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 
 	/*
@@ -2208,6 +2420,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 	if (error) {
 		hammer_done_transaction(&trans);
 		*ap->a_vpp = NULL;
+		lwkt_reltoken(&hmp->fs_token);
 		return (error);
 	}
 
@@ -2239,7 +2452,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 		 */
 		if (error == 0) {
 			nip->ino_data.size = bytes;
-			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
+			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
 		}
 	}
 	if (error == 0)
@@ -2262,6 +2475,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 		}
 	}
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return (error);
 }
 
@@ -2274,20 +2488,24 @@ hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
 {
 	struct hammer_transaction trans;
 	struct hammer_inode *dip;
+	hammer_mount_t hmp;
 	int error;
 
 	dip = VTOI(ap->a_dvp);
+	hmp = dip->hmp;
 
 	if (hammer_nohistory(dip) == 0 &&
-	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
+	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
 		return (error);
 	}
 
-	hammer_start_transaction(&trans, dip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_start_transaction(&trans, hmp);
 	++hammer_stats_file_iopsw;
 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
 				ap->a_cred, ap->a_flags, -1);
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 
 	return (error);
 }
 
@@ -2300,10 +2518,15 @@ int
 hammer_vop_ioctl(struct vop_ioctl_args *ap)
 {
 	struct hammer_inode *ip = ap->a_vp->v_data;
+	hammer_mount_t hmp = ip->hmp;
+	int error;
 
 	++hammer_stats_file_iopsr;
-	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
-			    ap->a_fflag, ap->a_cred));
+	lwkt_gettoken(&hmp->fs_token);
+	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
+			     ap->a_fflag, ap->a_cred);
+	lwkt_reltoken(&hmp->fs_token);
+	return (error);
 }
 
 static
@@ -2327,8 +2550,9 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
 	KKASSERT(mp->mnt_data != NULL);
 	hmp = (struct hammer_mount *)mp->mnt_data;
 
-	switch(ap->a_op) {
+	lwkt_gettoken(&hmp->fs_token);
 
+	switch(ap->a_op) {
 	case MOUNTCTL_SET_EXPORT:
 		if (ap->a_ctllen != sizeof(struct export_args))
 			error = EINVAL;
@@ -2349,7 +2573,8 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
 		usedbytes = *ap->a_res;
 
 		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
-			usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
+			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
+						    ap->a_buf,
 						    ap->a_buflen - usedbytes,
 						    &error);
 		}
@@ -2361,6 +2586,7 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
 		error = vop_stdmountctl(ap);
 		break;
 	}
+	lwkt_reltoken(&hmp->fs_token);
 	return(error);
 }
 
@@ -2396,6 +2622,9 @@ hammer_vop_strategy(struct vop_strategy_args *ap)
 		biodone(ap->a_bio);
 		break;
 	}
+
+	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
+
 	return (error);
 }
 
@@ -2417,6 +2646,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 	struct hammer_transaction trans;
 	struct hammer_inode *ip;
 	struct hammer_inode *dip;
+	hammer_mount_t hmp;
 	struct hammer_cursor cursor;
 	hammer_base_elm_t base;
 	hammer_off_t disk_offset;
@@ -2434,6 +2664,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 	bio = ap->a_bio;
 	bp = bio->bio_buf;
 	ip = ap->a_vp->v_data;
+	hmp = ip->hmp;
 
 	/*
 	 * The zone-2 disk offset may have been set by the cluster code via
@@ -2444,7 +2675,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 	nbio = push_bio(bio);
 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
 	    HAMMER_ZONE_LARGE_DATA) {
-		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
+		lwkt_gettoken(&hmp->fs_token);
+		error = hammer_io_direct_read(hmp, nbio, NULL);
+		lwkt_reltoken(&hmp->fs_token);
 		return (error);
 	}
 
@@ -2452,7 +2685,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 	 * Well, that sucked.  Do it the hard way.  If all the stars are
 	 * aligned we may still be able to issue a direct-read.
 	 */
-	hammer_simple_transaction(&trans, ip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_simple_transaction(&trans, hmp);
 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 
 	/*
@@ -2553,8 +2787,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 	 * buffers and frontend-owned in-memory records synchronously.
 	 */
 	if (ip->flags & HAMMER_INODE_TRUNCATED) {
-		if (hammer_cursor_ondisk(&cursor) ||
-		    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
+		if (hammer_cursor_ondisk(&cursor)/* ||
+		    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
 			if (ip->trunc_off <= rec_offset)
 				n = 0;
 			else if (ip->trunc_off < rec_offset + n)
@@ -2586,8 +2820,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
 				 HAMMER_ZONE_LARGE_DATA);
 			nbio->bio_offset = disk_offset;
-			error = hammer_io_direct_read(trans.hmp, nbio,
-						      cursor.leaf);
+			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
+			if (hammer_live_dedup)
+				hammer_dedup_cache_add(ip, cursor.leaf);
 			goto done;
 		} else if (n) {
 			error = hammer_ip_resolve_data(&cursor);
@@ -2599,6 +2834,13 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
 		if (error)
 			break;
 
+		/*
+		 * We have to be sure that the only elements added to the
+		 * dedup cache are those which are already on-media.
+		 */
+		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
+			hammer_dedup_cache_add(ip, cursor.leaf);
+
 		/*
 		 * Iterate until we have filled the request.
 		 */
@@ -2648,6 +2890,7 @@ done:
 	}
 	hammer_done_cursor(&cursor);
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 	return(error);
}
 
@@ -2672,6 +2915,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 {
 	struct hammer_transaction trans;
 	struct hammer_inode *ip;
+	hammer_mount_t hmp;
 	struct hammer_cursor cursor;
 	hammer_base_elm_t base;
 	int64_t rec_offset;
@@ -2688,6 +2932,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 
 	++hammer_stats_file_iopsr;
 	ip = ap->a_vp->v_data;
+	hmp = ip->hmp;
 
 	/*
 	 * We can only BMAP regular files.  We can't BMAP database files,
@@ -2707,7 +2952,8 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 	 * Scan the B-Tree to acquire blockmap addresses, then translate
 	 * to raw addresses.
 	 */
-	hammer_simple_transaction(&trans, ip->hmp);
+	lwkt_gettoken(&hmp->fs_token);
+	hammer_simple_transaction(&trans, hmp);
 #if 0
 	kprintf("bmap_beg %016llx ip->cache %p\n",
 		(long long)ap->a_loffset, ip->cache[1]);
@@ -2809,7 +3055,11 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 		}
 		last_offset = rec_offset + rec_len;
 		last_disk_offset = disk_offset + rec_len;
+
+		if (hammer_live_dedup)
+			hammer_dedup_cache_add(ip, cursor.leaf);
+
 		error = hammer_ip_next(&cursor);
 	}
 
@@ -2832,6 +3082,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 	}
 	hammer_done_cursor(&cursor);
 	hammer_done_transaction(&trans);
+	lwkt_reltoken(&hmp->fs_token);
 
 	/*
 	 * If we couldn't find any records or the records we did find were
@@ -2924,6 +3175,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 		return(EROFS);
 	}
 
+	lwkt_gettoken(&hmp->fs_token);
+
 	/*
 	 * Interlock with inode destruction (no in-kernel or directory
 	 * topology visibility).  If we queue new IO while trying to
@@ -2938,6 +3191,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
 		bp->b_resid = 0;
 		biodone(ap->a_bio);
+		lwkt_reltoken(&hmp->fs_token);
 		return(0);
 	}
 
@@ -2967,8 +3221,24 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
 				    bytes, &error);
+
+	/*
+	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
+	 * in hammer_vop_write().  We must flag the record so the proper
+	 * REDO_TERM_WRITE entry is generated during the flush.
+	 */
 	if (record) {
-		hammer_io_direct_write(hmp, record, bio);
+		if (bp->b_flags & B_VFSFLAG1) {
+			record->flags |= HAMMER_RECF_REDO;
+			bp->b_flags &= ~B_VFSFLAG1;
+		}
+		if (record->flags & HAMMER_RECF_DEDUPED) {
+			bp->b_resid = 0;
+			hammer_ip_replace_bulk(hmp, record);
+			biodone(ap->a_bio);
+		} else {
+			hammer_io_direct_write(hmp, bio, record);
+		}
 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
 			hammer_flush_inode(ip, 0);
 	} else {
@@ -2977,6 +3247,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 		bp->b_flags |= B_ERROR;
 		biodone(ap->a_bio);
 	}
+	lwkt_reltoken(&hmp->fs_token);
 	return(error);
 }
 
@@ -2993,6 +3264,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
 	struct namecache *ncp;
 	hammer_inode_t dip;
 	hammer_inode_t ip;
+	hammer_mount_t hmp;
 	struct hammer_cursor cursor;
 	int64_t namekey;
 	u_int32_t max_iterations;
@@ -3007,6 +3279,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
 	 */
 	dip = VTOI(dvp);
 	ncp = nch->ncp;
+	hmp = dip->hmp;
 
 	if (dip->flags & HAMMER_INODE_RO)
 		return (EROFS);
@@ -3063,7 +3336,7 @@ retry:
 	if (error == 0) {
 		hammer_unlock(&cursor.ip->lock);
 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
-				      dip->hmp->asof,
+				      hmp->asof,
 				      cursor.data->entry.localization,
 				      0, &error);
 		hammer_lock_sh(&cursor.ip->lock);
@@ -3133,10 +3406,30 @@ retry:
 	if (error == 0) {
 		cache_setunresolved(nch);
 		cache_setvp(nch, NULL);
-		/* XXX locking */
-		if (ip && ip->vp) {
-			hammer_knote(ip->vp, NOTE_DELETE);
-			cache_inval_vp(ip->vp, CINV_DESTROY);
+
+		/*
+		 * NOTE: ip->vp, if non-NULL, cannot be directly
+		 *	 referenced without formally acquiring the
+		 *	 vp since the vp might have zero refs on it,
+		 *	 or in the middle of a reclaim, etc.
+		 *
+		 * NOTE: The cache_setunresolved() can rip the vp
+		 *	 out from under us since the vp may not have
+		 *	 any refs, in which case ip->vp will be NULL
+		 *	 from the outset.
+		 */
+		while (ip && ip->vp) {
+			struct vnode *vp;
+
+			error = hammer_get_vnode(ip, &vp);
+			if (error == 0 && vp) {
+				vn_unlock(vp);
+				hammer_knote(ip->vp, NOTE_DELETE);
+				cache_inval_vp(ip->vp, CINV_DESTROY);
+				vrele(vp);
+				break;
+			}
+			kprintf("Debug: HAMMER ip/vp race1 avoided\n");
 		}
 	}
 	if (ip)
@@ -3155,7 +3448,6 @@ retry:
  ************************************************************************
 *
 */
-
 static int
 hammer_vop_fifoclose (struct vop_close_args *ap)
 {
@@ -3206,11 +3498,11 @@ static int filt_hammerwrite(struct knote *kn, long hint);
 static int filt_hammervnode(struct knote *kn, long hint);
 
 static struct filterops hammerread_filtops =
-	{ 1, NULL, filt_hammerdetach, filt_hammerread };
+	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
 static struct filterops hammerwrite_filtops =
-	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
+	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
 static struct filterops hammervnode_filtops =
-	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
+	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
 
 static
 int
@@ -3218,7 +3510,6 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
-	lwkt_tokref vlock;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -3231,14 +3522,12 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
 		kn->kn_fop = &hammervnode_filtops;
 		break;
 	default:
-		return (1);
+		return (EOPNOTSUPP);
 	}
 
 	kn->kn_hook = (caddr_t)vp;
 
-	lwkt_gettoken(&vlock, &vp->v_token);
-	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
-	lwkt_reltoken(&vlock);
+	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 
 	return(0);
 }
 
@@ -3247,12 +3536,8 @@ static void
 filt_hammerdetach(struct knote *kn)
 {
 	struct vnode *vp = (void *)kn->kn_hook;
-	lwkt_tokref vlock;
 
-	lwkt_gettoken(&vlock, &vp->v_token);
-	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
-		     kn, knote, kn_selnext);
-	lwkt_reltoken(&vlock);
+	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 }
 
 static int
@@ -3260,12 +3545,19 @@ filt_hammerread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (void *)kn->kn_hook;
 	hammer_inode_t ip = VTOI(vp);
+	hammer_mount_t hmp = ip->hmp;
+	off_t off;
 
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		return(1);
 	}
-	kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
+	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
+	off = ip->ino_data.size - kn->kn_fp->f_offset;
+	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
+	lwkt_reltoken(&hmp->fs_token);
+	if (kn->kn_sfflags & NOTE_OLDAPI)
+		return(1);
 	return (kn->kn_data != 0);
 }
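
The bulk of this diff applies one locking conversion over and over: the old giant-lock dance (get_mplock()/rel_mplock(), or the vnode's v_token in the kqueue filters) is replaced by serializing on the per-mount LWKT token hmp->fs_token. The sketch below distills that discipline into a single hypothetical operation; hammer_example_op() and its error path are illustrative only and not part of the patch, but the token and transaction calls are exactly the ones used in the hunks above:

/*
 * Minimal sketch of the fs_token discipline used by the converted
 * VOPs above.  Assumes the DragonFly LWKT token API and the HAMMER
 * types visible in hammer.h; hammer_example_op() itself is a
 * hypothetical illustration, not code from this commit.
 */
static int
hammer_example_op(hammer_inode_t ip)
{
	hammer_mount_t hmp = ip->hmp;
	struct hammer_transaction trans;
	int error;

	/*
	 * Serialize all B-Tree/metadata work on this mount.  Unlike
	 * the old MP lock, an LWKT token is released automatically
	 * while the thread blocks, so the flusher can make progress.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);

	error = 0;		/* ... modify the inode here ... */
	if (error) {
		/*
		 * Every early return must drop the token, which is
		 * why so many hunks above add lwkt_reltoken() calls
		 * in front of pre-existing return statements.
		 */
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}

Read paths that can be satisfied without touching mount metadata (hammer_vop_access(), hammer_vop_advlock(), and the buffer-cache hit path of hammer_vop_read()) deliberately skip the token, which is what the new "MPSAFE - does not require fs_token" annotations in the diff are recording.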
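The other recurring theme is the REDO fast path: hammer_vop_write() lays down HAMMER_REDO_WRITE records (tagging the buffer with B_VFSFLAG1 so the strategy code can generate the matching REDO_TERM_WRITE), and hammer_vop_fsync() can then satisfy fsync(2) with a cheap UNDO/REDO FIFO flush instead of a full meta-data sync. A reduced sketch of the eligibility test, reusing the flag tests from the diff (the helper name hammer_fast_fsync_ok() is hypothetical):

/*
 * Sketch of the fast-fsync eligibility test from hammer_vop_fsync()
 * above.  The helper name is hypothetical; the flags and fields are
 * the ones this diff introduces.
 */
static int
hammer_fast_fsync_ok(hammer_inode_t ip)
{
	/*
	 * A FIFO-only flush is sufficient when REDO logging is active
	 * on this inode (armed by a previous fsync() on a version-4+
	 * volume) and nothing but writes or write-extends dirtied it.
	 */
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) {
		return (1);
	}
	return (0);
}

hammer_vop_write() backs out of the scheme once redo_count crosses hammer_limit_redo, clearing HAMMER_INODE_REDO so the next fsync() falls through to the full flush sequence.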