X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/eac446c50abe1405a40b6ff5eb347bcae6464101..507df98a152612f739140d9f1ac5b30cd022eea2:/sys/vfs/hammer/hammer_vnops.c diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index a92e9c4276..8ddb6a218d 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -47,6 +47,9 @@ #include #include #include + +#include + #include "hammer.h" /* @@ -160,7 +163,7 @@ void hammer_knote(struct vnode *vp, int flags) { if (flags) - KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags); + KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); } #ifdef DEBUG_TRUNCATE @@ -188,28 +191,121 @@ hammer_vop_vnoperate(struct vop_generic_args *) * fsync() an inode to disk and wait for it to be completely committed * such that the information would not be undone if a crash occured after * return. + * + * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement + * a REDO log. A sysctl is provided to relax HAMMER's fsync() + * operation. + * + * Ultimately the combination of a REDO log and use of fast storage + * to front-end cluster caches will make fsync fast, but it aint + * here yet. And, in anycase, we need real transactional + * all-or-nothing features which are not restricted to a single file. */ static int hammer_vop_fsync(struct vop_fsync_args *ap) { hammer_inode_t ip = VTOI(ap->a_vp); + hammer_mount_t hmp = ip->hmp; + int waitfor = ap->a_waitfor; + int mode; + + lwkt_gettoken(&hmp->fs_token); + + /* + * Fsync rule relaxation (default is either full synchronous flush + * or REDO semantics with synchronous flush). + */ + if (ap->a_flags & VOP_FSYNC_SYSCALL) { + switch(hammer_fsync_mode) { + case 0: +mode0: + /* no REDO, full synchronous flush */ + goto skip; + case 1: +mode1: + /* no REDO, full asynchronous flush */ + if (waitfor == MNT_WAIT) + waitfor = MNT_NOWAIT; + goto skip; + case 2: + /* REDO semantics, synchronous flush */ + if (hmp->version < HAMMER_VOL_VERSION_FOUR) + goto mode0; + mode = HAMMER_FLUSH_UNDOS_AUTO; + break; + case 3: + /* REDO semantics, relaxed asynchronous flush */ + if (hmp->version < HAMMER_VOL_VERSION_FOUR) + goto mode1; + mode = HAMMER_FLUSH_UNDOS_RELAXED; + if (waitfor == MNT_WAIT) + waitfor = MNT_NOWAIT; + break; + case 4: + /* ignore the fsync() system call */ + lwkt_reltoken(&hmp->fs_token); + return(0); + default: + /* we have to do something */ + mode = HAMMER_FLUSH_UNDOS_RELAXED; + if (waitfor == MNT_WAIT) + waitfor = MNT_NOWAIT; + break; + } + /* + * Fast fsync only needs to flush the UNDO/REDO fifo if + * HAMMER_INODE_REDO is non-zero and the only modifications + * made to the file are write or write-extends. + */ + if ((ip->flags & HAMMER_INODE_REDO) && + (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 + ) { + ++hammer_count_fsyncs; + hammer_flusher_flush_undos(hmp, mode); + ip->redo_count = 0; + lwkt_reltoken(&hmp->fs_token); + return(0); + } + + /* + * REDO is enabled by fsync(), the idea being we really only + * want to lay down REDO records when programs are using + * fsync() heavily. The first fsync() on the file starts + * the gravy train going and later fsync()s keep it hot by + * resetting the redo_count. + * + * We weren't running REDOs before now so we have to fall + * through and do a full fsync of what we have. + */ + if (hmp->version >= HAMMER_VOL_VERSION_FOUR && + (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { + ip->flags |= HAMMER_INODE_REDO; + ip->redo_count = 0; + } + } +skip: + + /* + * Do a full flush sequence. + */ ++hammer_count_fsyncs; - vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL); + vfsync(ap->a_vp, waitfor, 1, NULL, NULL); hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); - if (ap->a_waitfor == MNT_WAIT) { + if (waitfor == MNT_WAIT) { vn_unlock(ap->a_vp); hammer_wait_inode(ip); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); } + lwkt_reltoken(&hmp->fs_token); return (ip->error); } /* * hammer_vop_read { vp, uio, ioflag, cred } * - * MPALMOSTSAFE + * MPSAFE (for the cache safe does not require fs_token) */ static int @@ -217,6 +313,7 @@ hammer_vop_read(struct vop_read_args *ap) { struct hammer_transaction trans; hammer_inode_t ip; + hammer_mount_t hmp; off_t offset; struct buf *bp; struct uio *uio; @@ -225,12 +322,13 @@ hammer_vop_read(struct vop_read_args *ap) int seqcount; int ioseqcount; int blksize; - int got_mplock; int bigread; + int got_fstoken; if (ap->a_vp->v_type != VREG) return (EINVAL); ip = VTOI(ap->a_vp); + hmp = ip->hmp; error = 0; uio = ap->a_uio; @@ -238,32 +336,18 @@ hammer_vop_read(struct vop_read_args *ap) * Allow the UIO's size to override the sequential heuristic. */ blksize = hammer_blocksize(uio->uio_offset); - seqcount = (uio->uio_resid + (blksize - 1)) / blksize; - ioseqcount = ap->a_ioflag >> 16; + seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; + ioseqcount = (ap->a_ioflag >> 16); if (seqcount < ioseqcount) seqcount = ioseqcount; - /* - * Temporary hack until more of HAMMER can be made MPSAFE. - */ -#ifdef SMP - if (curthread->td_mpcount) { - got_mplock = -1; - hammer_start_transaction(&trans, ip->hmp); - } else { - got_mplock = 0; - } -#else - hammer_start_transaction(&trans, ip->hmp); - got_mplock = -1; -#endif - /* * If reading or writing a huge amount of data we have to break * atomicy and allow the operation to be interrupted by a signal * or it can DOS the machine. */ bigread = (uio->uio_resid > 100 * 1024 * 1024); + got_fstoken = 0; /* * Access the data typically in HAMMER_BUFSIZE blocks via the @@ -297,9 +381,9 @@ hammer_vop_read(struct vop_read_args *ap) /* * MPUNSAFE */ - if (got_mplock == 0) { - got_mplock = 1; - get_mplock(); + if (got_fstoken == 0) { + lwkt_gettoken(&hmp->fs_token); + got_fstoken = 1; hammer_start_transaction(&trans, ip->hmp); } @@ -316,17 +400,23 @@ hammer_vop_read(struct vop_read_args *ap) } error = cluster_read(ap->a_vp, file_limit, base_offset, - blksize, MAXPHYS, - seqcount, &bp); + blksize, uio->uio_resid, + seqcount * BKVASIZE, &bp); } else { error = bread(ap->a_vp, base_offset, blksize, &bp); } if (error) { - kprintf("error %d\n", error); brelse(bp); break; } skip: + if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { + kprintf("doff %016jx read file %016jx@%016jx\n", + (intmax_t)bp->b_bio2.bio_offset, + (intmax_t)ip->obj_id, + (intmax_t)bp->b_loffset); + } + bp->b_flags &= ~B_IODEBUG; /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ n = blksize - offset; @@ -348,15 +438,14 @@ skip: * XXX only update the atime if we had to get the MP lock. * XXX hack hack hack, fixme. */ - if (got_mplock) { + if (got_fstoken) { if ((ip->flags & HAMMER_INODE_RO) == 0 && (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { ip->ino_data.atime = trans.time; - hammer_modify_inode(ip, HAMMER_INODE_ATIME); + hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); } hammer_done_transaction(&trans); - if (got_mplock > 0) - rel_mplock(); + lwkt_reltoken(&hmp->fs_token); } return (error); } @@ -379,7 +468,6 @@ hammer_vop_write(struct vop_write_args *ap) int error; int n; int flags; - int delta; int seqcount; int bigwrite; @@ -397,6 +485,7 @@ hammer_vop_write(struct vop_write_args *ap) /* * Create a transaction to cover the operations we perform. */ + lwkt_gettoken(&hmp->fs_token); hammer_start_transaction(&trans, hmp); uio = ap->a_uio; @@ -414,11 +503,13 @@ hammer_vop_write(struct vop_write_args *ap) */ if (uio->uio_offset < 0) { hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (EFBIG); } base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (EFBIG); } @@ -426,8 +517,15 @@ hammer_vop_write(struct vop_write_args *ap) * If reading or writing a huge amount of data we have to break * atomicy and allow the operation to be interrupted by a signal * or it can DOS the machine. + * + * Preset redo_count so we stop generating REDOs earlier if the + * limit is exceeded. */ bigwrite = (uio->uio_resid > 100 * 1024 * 1024); + if ((ip->flags & HAMMER_INODE_REDO) && + ip->redo_count < hammer_limit_redo) { + ip->redo_count += uio->uio_resid; + } /* * Access the data typically in HAMMER_BUFSIZE blocks via the @@ -438,6 +536,9 @@ hammer_vop_write(struct vop_write_args *ap) int fixsize = 0; int blksize; int blkmask; + int trivial; + int endofblk; + off_t nsize; if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) break; @@ -464,6 +565,22 @@ hammer_vop_write(struct vop_write_args *ap) if ((ap->a_ioflag & IO_RECURSE) == 0) bwillwrite(blksize); + /* + * Control the number of pending records associated with + * this inode. If too many have accumulated start a + * flush. Try to maintain a pipeline with the flusher. + */ + if (ip->rsv_recs >= hammer_limit_inode_recs) { + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + } + if (ip->rsv_recs >= hammer_limit_inode_recs * 2) { + while (ip->rsv_recs >= hammer_limit_inode_recs) { + tsleep(&ip->rsv_recs, 0, "hmrwww", hz); + } + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + } + +#if 0 /* * Do not allow HAMMER to blow out system memory by * accumulating too many records. Records are so well @@ -503,6 +620,7 @@ hammer_vop_write(struct vop_write_args *ap) if (delta > 0) tsleep(&trans, 0, "hmrslo", delta); } +#endif /* * Calculate the blocksize at the current offset and figure @@ -512,10 +630,26 @@ hammer_vop_write(struct vop_write_args *ap) offset = (int)uio->uio_offset & blkmask; base_offset = uio->uio_offset & ~(int64_t)blkmask; n = blksize - offset; - if (n > uio->uio_resid) + if (n > uio->uio_resid) { n = uio->uio_resid; - if (uio->uio_offset + n > ip->ino_data.size) { - vnode_pager_setsize(ap->a_vp, uio->uio_offset + n); + endofblk = 0; + } else { + endofblk = 1; + } + nsize = uio->uio_offset + n; + if (nsize > ip->ino_data.size) { + if (uio->uio_offset > ip->ino_data.size) + trivial = 0; + else + trivial = 1; + nvextendbuf(ap->a_vp, + ip->ino_data.size, + nsize, + hammer_blocksize(ip->ino_data.size), + hammer_blocksize(nsize), + hammer_blockoff(ip->ino_data.size), + hammer_blockoff(nsize), + trivial); fixsize = 1; kflags |= NOTE_EXTEND; } @@ -561,9 +695,35 @@ hammer_vop_write(struct vop_write_args *ap) if (error == 0) bheavy(bp); } - if (error == 0) { - error = uiomove((char *)bp->b_data + offset, - n, uio); + if (error == 0) + error = uiomove(bp->b_data + offset, n, uio); + + /* + * Generate REDO records if enabled and redo_count will not + * exceeded the limit. + * + * If redo_count exceeds the limit we stop generating records + * and clear HAMMER_INODE_REDO. This will cause the next + * fsync() to do a full meta-data sync instead of just an + * UNDO/REDO fifo update. + * + * When clearing HAMMER_INODE_REDO any pre-existing REDOs + * will still be tracked. The tracks will be terminated + * when the related meta-data (including possible data + * modifications which are not tracked via REDO) is + * flushed. + */ + if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { + if (ip->redo_count < hammer_limit_redo) { + bp->b_flags |= B_VFSFLAG1; + error = hammer_generate_redo(&trans, ip, + base_offset + offset, + HAMMER_REDO_WRITE, + bp->b_data + offset, + (size_t)n); + } else { + ip->flags &= ~HAMMER_INODE_REDO; + } } /* @@ -573,8 +733,9 @@ hammer_vop_write(struct vop_write_args *ap) if (error) { brelse(bp); if (fixsize) { - vtruncbuf(ap->a_vp, ip->ino_data.size, - hammer_blocksize(ip->ino_data.size)); + nvtruncbuf(ap->a_vp, ip->ino_data.size, + hammer_blocksize(ip->ino_data.size), + hammer_blockoff(ip->ino_data.size)); } break; } @@ -583,14 +744,13 @@ hammer_vop_write(struct vop_write_args *ap) /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ if (ip->ino_data.size < uio->uio_offset) { ip->ino_data.size = uio->uio_offset; - flags = HAMMER_INODE_DDIRTY; - vnode_pager_setsize(ap->a_vp, ip->ino_data.size); + flags = HAMMER_INODE_SDIRTY; } else { flags = 0; } ip->ino_data.mtime = trans.time; flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; - hammer_modify_inode(ip, flags); + hammer_modify_inode(&trans, ip, flags); /* * Once we dirty the buffer any cached zone-X offset @@ -602,23 +762,58 @@ hammer_vop_write(struct vop_write_args *ap) /* * Final buffer disposition. + * + * Because meta-data updates are deferred, HAMMER is + * especially sensitive to excessive bdwrite()s because + * the I/O stream is not broken up by disk reads. So the + * buffer cache simply cannot keep up. + * + * WARNING! blksize is variable. cluster_write() is + * expected to not blow up if it encounters + * buffers that do not match the passed blksize. + * + * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). + * The ip->rsv_recs check should burst-flush the data. + * If we queue it immediately the buf could be left + * locked on the device queue for a very long time. + * + * NOTE! To avoid degenerate stalls due to mismatched block + * sizes we only honor IO_DIRECT on the write which + * abuts the end of the buffer. However, we must + * honor IO_SYNC in case someone is silly enough to + * configure a HAMMER file as swap, or when HAMMER + * is serving NFS (for commits). Ick ick. */ bp->b_flags |= B_AGE; if (ap->a_ioflag & IO_SYNC) { bwrite(bp); - } else if (ap->a_ioflag & IO_DIRECT) { + } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { bawrite(bp); } else { +#if 0 + if (offset + n == blksize) { + if (hammer_cluster_enable == 0 || + (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { + bawrite(bp); + } else { + cluster_write(bp, ip->ino_data.size, + blksize, seqcount); + } + } else { +#endif bdwrite(bp); } } hammer_done_transaction(&trans); hammer_knote(ap->a_vp, kflags); + lwkt_reltoken(&hmp->fs_token); return (error); } /* * hammer_vop_access { vp, mode, cred } + * + * MPSAFE - does not require fs_token */ static int @@ -640,6 +835,8 @@ hammer_vop_access(struct vop_access_args *ap) /* * hammer_vop_advlock { vp, id, op, fl, flags } + * + * MPSAFE - does not require fs_token */ static int @@ -652,12 +849,30 @@ hammer_vop_advlock(struct vop_advlock_args *ap) /* * hammer_vop_close { vp, fflag } + * + * We can only sync-on-close for normal closes. XXX disabled for now. */ static int hammer_vop_close(struct vop_close_args *ap) { - /*hammer_inode_t ip = VTOI(ap->a_vp);*/ +#if 0 + struct vnode *vp = ap->a_vp; + hammer_inode_t ip = VTOI(vp); + int waitfor; + if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { + if (vn_islocked(vp) == LK_EXCLUSIVE && + (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { + if (ip->flags & HAMMER_INODE_CLOSESYNC) + waitfor = MNT_WAIT; + else + waitfor = MNT_NOWAIT; + ip->flags &= ~(HAMMER_INODE_CLOSESYNC | + HAMMER_INODE_CLOSEASYNC); + VOP_FSYNC(vp, MNT_NOWAIT, waitfor); + } + } +#endif return (vop_stdclose(ap)); } @@ -675,20 +890,23 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -703,6 +921,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) hkprintf("hammer_create_inode error %d\n", error); hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -733,6 +952,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) } hammer_knote(ap->a_dvp, NOTE_WRITE); } + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -744,7 +964,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) * The atime field is stored in the B-Tree element and allowed to be * updated without cycling the element. * - * MPSAFE + * MPSAFE - does not require fs_token */ static int @@ -825,8 +1045,6 @@ hammer_vop_getattr(struct vop_getattr_args *ap) vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); vap->va_filerev = 0; /* XXX */ - /* mtime uniquely identifies any adjustments made to the file XXX */ - vap->va_fsmid = ip->ino_data.mtime; vap->va_uid_uuid = ip->ino_data.uid; vap->va_gid_uuid = ip->ino_data.gid; vap->va_fsid_uuid = ip->hmp->fsid; @@ -857,6 +1075,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) { struct hammer_transaction trans; struct namecache *ncp; + hammer_mount_t hmp; hammer_inode_t dip; hammer_inode_t ip; hammer_tid_t asof; @@ -884,8 +1103,10 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) nlen = ncp->nc_nlen; flags = dip->flags & HAMMER_INODE_RO; ispfs = 0; + hmp = dip->hmp; - hammer_simple_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; for (i = 0; i < nlen; ++i) { @@ -1040,6 +1261,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } done: hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1066,6 +1288,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) struct hammer_transaction trans; struct hammer_inode *dip; struct hammer_inode *ip; + hammer_mount_t hmp; int64_t parent_obj_id; u_int32_t parent_obj_localization; hammer_tid_t asof; @@ -1073,11 +1296,13 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) dip = VTOI(ap->a_dvp); asof = dip->obj_asof; + hmp = dip->hmp; /* * Whos are parent? This could be the root of a pseudo-filesystem * whos parent is in another localization domain. */ + lwkt_gettoken(&hmp->fs_token); parent_obj_id = dip->ino_data.parent_obj_id; if (dip->obj_id == HAMMER_OBJID_ROOT) parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; @@ -1086,19 +1311,20 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) if (parent_obj_id == 0) { if (dip->obj_id == HAMMER_OBJID_ROOT && - asof != dip->hmp->asof) { + asof != hmp->asof) { parent_obj_id = dip->obj_id; - asof = dip->hmp->asof; + asof = hmp->asof; *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); ksnprintf(*ap->a_fakename, 19, "0x%016llx", (long long)dip->obj_asof); } else { *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return ENOENT; } } - hammer_simple_transaction(&trans, dip->hmp); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; ip = hammer_get_inode(&trans, dip, parent_obj_id, @@ -1111,6 +1337,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) *ap->a_vpp = NULL; } hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1125,6 +1352,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) struct hammer_inode *dip; struct hammer_inode *ip; struct nchandle *nch; + hammer_mount_t hmp; int error; if (ap->a_dvp->v_mount != ap->a_vp->v_mount) @@ -1133,6 +1361,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) nch = ap->a_nch; dip = VTOI(ap->a_dvp); ip = VTOI(ap->a_vp); + hmp = dip->hmp; if (dip->obj_localization != ip->obj_localization) return(EXDEV); @@ -1141,13 +1370,14 @@ hammer_vop_nlink(struct vop_nlink_args *ap) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1169,6 +1399,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) hammer_done_transaction(&trans); hammer_knote(ap->a_vp, NOTE_LINK); hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1186,20 +1417,23 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1213,6 +1447,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) hkprintf("hammer_mkdir error %d\n", error); hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } /* @@ -1242,6 +1477,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1259,20 +1495,23 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1287,6 +1526,7 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) if (error) { hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1315,11 +1555,14 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } /* * hammer_vop_open { vp, mode, cred, fp } + * + * MPSAFE (does not require fs_token) */ static int @@ -1355,6 +1598,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap) struct hammer_transaction trans; struct hammer_cursor cursor; struct hammer_inode *ip; + hammer_mount_t hmp; struct uio *uio; hammer_base_elm_t base; int error; @@ -1369,6 +1613,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap) ip = VTOI(ap->a_vp); uio = ap->a_uio; saveoff = uio->uio_offset; + hmp = ip->hmp; if (ap->a_ncookies) { ncookies = uio->uio_resid / 16 + 1; @@ -1382,7 +1627,8 @@ hammer_vop_readdir(struct vop_readdir_args *ap) cookie_index = 0; } - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); /* * Handle artificial entries @@ -1497,6 +1743,7 @@ done: *ap->a_cookies = cookies; } } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -1510,12 +1757,16 @@ hammer_vop_readlink(struct vop_readlink_args *ap) struct hammer_transaction trans; struct hammer_cursor cursor; struct hammer_inode *ip; + hammer_mount_t hmp; char buf[32]; u_int32_t localization; hammer_pseudofs_inmem_t pfsm; int error; ip = VTOI(ap->a_vp); + hmp = ip->hmp; + + lwkt_gettoken(&hmp->fs_token); /* * Shortcut if the symlink data was stuffed into ino_data. @@ -1534,7 +1785,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap) ip->obj_asof == HAMMER_MAX_TID && ip->obj_localization == 0 && strncmp(ptr, "@@PFS", 5) == 0) { - hammer_simple_transaction(&trans, ip->hmp); + hammer_simple_transaction(&trans, hmp); bcopy(ptr + 5, buf, 5); buf[5] = 0; localization = strtoul(buf, NULL, 10) << 16; @@ -1564,17 +1815,18 @@ hammer_vop_readlink(struct vop_readlink_args *ap) bytes = strlen(buf); } if (pfsm) - hammer_rel_pseudofs(trans.hmp, pfsm); + hammer_rel_pseudofs(hmp, pfsm); hammer_done_transaction(&trans); } error = uiomove(ptr, bytes, ap->a_uio); + lwkt_reltoken(&hmp->fs_token); return(error); } /* * Long version */ - hammer_simple_transaction(&trans, ip->hmp); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); @@ -1607,6 +1859,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap) } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -1619,21 +1872,25 @@ hammer_vop_nremove(struct vop_nremove_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1650,6 +1907,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) struct hammer_inode *fdip; struct hammer_inode *tdip; struct hammer_inode *ip; + hammer_mount_t hmp; struct hammer_cursor cursor; int64_t namekey; u_int32_t max_iterations; @@ -1667,6 +1925,8 @@ hammer_vop_nrename(struct vop_nrename_args *ap) ip = VTOI(fncp->nc_vp); KKASSERT(ip != NULL); + hmp = ip->hmp; + if (fdip->obj_localization != tdip->obj_localization) return(EXDEV); if (fdip->obj_localization != ip->obj_localization) @@ -1678,10 +1938,11 @@ hammer_vop_nrename(struct vop_nrename_args *ap) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); - hammer_start_transaction(&trans, fdip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1700,7 +1961,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) if (error == 0) { ip->ino_data.parent_obj_id = tdip->obj_id; ip->ino_data.ctime = trans.time; - hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); } } if (error) @@ -1774,18 +2035,34 @@ retry: /* * Cleanup and tell the kernel that the rename succeeded. + * + * NOTE: ip->vp, if non-NULL, cannot be directly referenced + * without formally acquiring the vp since the vp might + * have zero refs on it, or in the middle of a reclaim, + * etc. */ hammer_done_cursor(&cursor); if (error == 0) { cache_rename(ap->a_fnch, ap->a_tnch); hammer_knote(ap->a_fdvp, NOTE_WRITE); hammer_knote(ap->a_tdvp, NOTE_WRITE); - if (ip->vp) - hammer_knote(ip->vp, NOTE_RENAME); + while (ip->vp) { + struct vnode *vp; + + error = hammer_get_vnode(ip, &vp); + if (error == 0 && vp) { + vn_unlock(vp); + hammer_knote(ip->vp, NOTE_RENAME); + vrele(vp); + break; + } + kprintf("Debug: HAMMER ip/vp race2 avoided\n"); + } } failed: hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1798,21 +2075,25 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1825,21 +2106,25 @@ hammer_vop_markatime(struct vop_markatime_args *ap) { struct hammer_transaction trans; struct hammer_inode *ip; + hammer_mount_t hmp; ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if (ip->hmp->mp->mnt_flag & MNT_NOATIME) + hmp = ip->hmp; + if (hmp->mp->mnt_flag & MNT_NOATIME) return (0); - hammer_start_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; ip->ino_data.atime = trans.time; - hammer_modify_inode(ip, HAMMER_INODE_ATIME); + hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, NOTE_ATTRIB); + lwkt_reltoken(&hmp->fs_token); return (0); } @@ -1851,31 +2136,36 @@ int hammer_vop_setattr(struct vop_setattr_args *ap) { struct hammer_transaction trans; - struct vattr *vap; struct hammer_inode *ip; + struct vattr *vap; + hammer_mount_t hmp; int modflags; int error; int truncating; int blksize; int kflags; +#if 0 int64_t aligned_size; +#endif u_int32_t flags; vap = ap->a_vap; ip = ap->a_vp->v_data; modflags = 0; kflags = 0; + hmp = ip->hmp; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return(EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); if (hammer_nohistory(ip) == 0 && - (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = 0; @@ -1935,28 +2225,58 @@ hammer_vop_setattr(struct vop_setattr_args *ap) case VREG: if (vap->va_size == ip->ino_data.size) break; + + /* + * Log the operation if in fast-fsync mode or if + * there are unterminated redo write records present. + * + * The second check is needed so the recovery code + * properly truncates write redos even if nominal + * REDO operations is turned off due to excessive + * writes, because the related records might be + * destroyed and never lay down a TERM_WRITE. + */ + if ((ip->flags & HAMMER_INODE_REDO) || + (ip->flags & HAMMER_INODE_RDIRTY)) { + error = hammer_generate_redo(&trans, ip, + vap->va_size, + HAMMER_REDO_TRUNC, + NULL, 0); + } + blksize = hammer_blocksize(vap->va_size); + /* * XXX break atomicy, we can deadlock the backend * if we do not release the lock. Probably not a * big deal here. */ - blksize = hammer_blocksize(vap->va_size); if (vap->va_size < ip->ino_data.size) { - vtruncbuf(ap->a_vp, vap->va_size, blksize); + nvtruncbuf(ap->a_vp, vap->va_size, + blksize, + hammer_blockoff(vap->va_size)); truncating = 1; kflags |= NOTE_WRITE; } else { - vnode_pager_setsize(ap->a_vp, vap->va_size); + nvextendbuf(ap->a_vp, + ip->ino_data.size, + vap->va_size, + hammer_blocksize(ip->ino_data.size), + hammer_blocksize(vap->va_size), + hammer_blockoff(ip->ino_data.size), + hammer_blockoff(vap->va_size), + 0); truncating = 0; kflags |= NOTE_WRITE | NOTE_EXTEND; } ip->ino_data.size = vap->va_size; ip->ino_data.mtime = trans.time; + /* XXX safe to use SDIRTY instead of DDIRTY here? */ modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; /* - * on-media truncation is cached in the inode until - * the inode is synchronized. + * On-media truncation is cached in the inode until + * the inode is synchronized. We must immediately + * handle any frontend records. */ if (truncating) { hammer_ip_frontend_trunc(ip, vap->va_size); @@ -1988,34 +2308,20 @@ hammer_vop_setattr(struct vop_setattr_args *ap) } } +#if 0 /* - * If truncating we have to clean out a portion of - * the last block on-disk. We do this in the - * front-end buffer cache. + * When truncating, nvtruncbuf() may have cleaned out + * a portion of the last block on-disk in the buffer + * cache. We must clean out any frontend records + * for blocks beyond the new last block. */ aligned_size = (vap->va_size + (blksize - 1)) & ~(int64_t)(blksize - 1); if (truncating && vap->va_size < aligned_size) { - struct buf *bp; - int offset; - aligned_size -= blksize; - - offset = (int)vap->va_size & (blksize - 1); - error = bread(ap->a_vp, aligned_size, - blksize, &bp); hammer_ip_frontend_trunc(ip, aligned_size); - if (error == 0) { - bzero(bp->b_data + offset, - blksize - offset); - /* must de-cache direct-io offset */ - bp->b_bio2.bio_offset = NOOFFSET; - bdwrite(bp); - } else { - kprintf("ERROR %d\n", error); - brelse(bp); - } } +#endif break; case VDATABASE: if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { @@ -2062,9 +2368,10 @@ hammer_vop_setattr(struct vop_setattr_args *ap) } done: if (error == 0) - hammer_modify_inode(ip, modflags); + hammer_modify_inode(&trans, ip, modflags); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, kflags); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2078,8 +2385,9 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) struct hammer_transaction trans; struct hammer_inode *dip; struct hammer_inode *nip; - struct nchandle *nch; hammer_record_t record; + struct nchandle *nch; + hammer_mount_t hmp; int error; int bytes; @@ -2087,16 +2395,18 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -2110,6 +2420,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) if (error) { hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2141,7 +2452,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) */ if (error == 0) { nip->ino_data.size = bytes; - hammer_modify_inode(nip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); } } if (error == 0) @@ -2164,6 +2475,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) } } hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2176,20 +2488,24 @@ hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags, -1); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2202,10 +2518,15 @@ int hammer_vop_ioctl(struct vop_ioctl_args *ap) { struct hammer_inode *ip = ap->a_vp->v_data; + hammer_mount_t hmp = ip->hmp; + int error; ++hammer_stats_file_iopsr; - return(hammer_ioctl(ip, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_cred)); + lwkt_gettoken(&hmp->fs_token); + error = hammer_ioctl(ip, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_cred); + lwkt_reltoken(&hmp->fs_token); + return (error); } static @@ -2229,8 +2550,9 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) KKASSERT(mp->mnt_data != NULL); hmp = (struct hammer_mount *)mp->mnt_data; - switch(ap->a_op) { + lwkt_gettoken(&hmp->fs_token); + switch(ap->a_op) { case MOUNTCTL_SET_EXPORT: if (ap->a_ctllen != sizeof(struct export_args)) error = EINVAL; @@ -2251,7 +2573,8 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) usedbytes = *ap->a_res; if (usedbytes > 0 && usedbytes < ap->a_buflen) { - usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf, + usedbytes += vfs_flagstostr(hmp->hflags, extraopt, + ap->a_buf, ap->a_buflen - usedbytes, &error); } @@ -2263,6 +2586,7 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) error = vop_stdmountctl(ap); break; } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -2298,6 +2622,9 @@ hammer_vop_strategy(struct vop_strategy_args *ap) biodone(ap->a_bio); break; } + + /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ + return (error); } @@ -2319,6 +2646,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) struct hammer_transaction trans; struct hammer_inode *ip; struct hammer_inode *dip; + hammer_mount_t hmp; struct hammer_cursor cursor; hammer_base_elm_t base; hammer_off_t disk_offset; @@ -2336,6 +2664,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) bio = ap->a_bio; bp = bio->bio_buf; ip = ap->a_vp->v_data; + hmp = ip->hmp; /* * The zone-2 disk offset may have been set by the cluster code via @@ -2346,7 +2675,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) nbio = push_bio(bio); if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_LARGE_DATA) { - error = hammer_io_direct_read(ip->hmp, nbio, NULL); + lwkt_gettoken(&hmp->fs_token); + error = hammer_io_direct_read(hmp, nbio, NULL); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2354,7 +2685,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) * Well, that sucked. Do it the hard way. If all the stars are * aligned we may still be able to issue a direct-read. */ - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); /* @@ -2455,8 +2787,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) * buffers and frontend-owned in-memory records synchronously. */ if (ip->flags & HAMMER_INODE_TRUNCATED) { - if (hammer_cursor_ondisk(&cursor) || - cursor.iprec->flush_state == HAMMER_FST_FLUSH) { + if (hammer_cursor_ondisk(&cursor)/* || + cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { if (ip->trunc_off <= rec_offset) n = 0; else if (ip->trunc_off < rec_offset + n) @@ -2488,8 +2820,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_LARGE_DATA); nbio->bio_offset = disk_offset; - error = hammer_io_direct_read(trans.hmp, nbio, - cursor.leaf); + error = hammer_io_direct_read(hmp, nbio, cursor.leaf); + if (hammer_live_dedup) + hammer_dedup_cache_add(ip, cursor.leaf); goto done; } else if (n) { error = hammer_ip_resolve_data(&cursor); @@ -2501,6 +2834,13 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) if (error) break; + /* + * We have to be sure that the only elements added to the + * dedup cache are those which are already on-media. + */ + if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) + hammer_dedup_cache_add(ip, cursor.leaf); + /* * Iterate until we have filled the request. */ @@ -2550,6 +2890,7 @@ done: } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -2574,6 +2915,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) { struct hammer_transaction trans; struct hammer_inode *ip; + hammer_mount_t hmp; struct hammer_cursor cursor; hammer_base_elm_t base; int64_t rec_offset; @@ -2590,6 +2932,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) ++hammer_stats_file_iopsr; ip = ap->a_vp->v_data; + hmp = ip->hmp; /* * We can only BMAP regular files. We can't BMAP database files, @@ -2609,7 +2952,8 @@ hammer_vop_bmap(struct vop_bmap_args *ap) * Scan the B-Tree to acquire blockmap addresses, then translate * to raw addresses. */ - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); #if 0 kprintf("bmap_beg %016llx ip->cache %p\n", (long long)ap->a_loffset, ip->cache[1]); @@ -2711,7 +3055,11 @@ hammer_vop_bmap(struct vop_bmap_args *ap) } last_offset = rec_offset + rec_len; last_disk_offset = disk_offset + rec_len; + + if (hammer_live_dedup) + hammer_dedup_cache_add(ip, cursor.leaf); } + error = hammer_ip_next(&cursor); } @@ -2734,6 +3082,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); /* * If we couldn't find any records or the records we did find were @@ -2826,6 +3175,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) return(EROFS); } + lwkt_gettoken(&hmp->fs_token); + /* * Interlock with inode destruction (no in-kernel or directory * topology visibility). If we queue new IO while trying to @@ -2840,6 +3191,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { bp->b_resid = 0; biodone(ap->a_bio); + lwkt_reltoken(&hmp->fs_token); return(0); } @@ -2869,8 +3221,24 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, bytes, &error); + + /* + * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated + * in hammer_vop_write(). We must flag the record so the proper + * REDO_TERM_WRITE entry is generated during the flush. + */ if (record) { - hammer_io_direct_write(hmp, record, bio); + if (bp->b_flags & B_VFSFLAG1) { + record->flags |= HAMMER_RECF_REDO; + bp->b_flags &= ~B_VFSFLAG1; + } + if (record->flags & HAMMER_RECF_DEDUPED) { + bp->b_resid = 0; + hammer_ip_replace_bulk(hmp, record); + biodone(ap->a_bio); + } else { + hammer_io_direct_write(hmp, bio, record); + } if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) hammer_flush_inode(ip, 0); } else { @@ -2879,6 +3247,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) bp->b_flags |= B_ERROR; biodone(ap->a_bio); } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -2895,6 +3264,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, struct namecache *ncp; hammer_inode_t dip; hammer_inode_t ip; + hammer_mount_t hmp; struct hammer_cursor cursor; int64_t namekey; u_int32_t max_iterations; @@ -2909,6 +3279,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, */ dip = VTOI(dvp); ncp = nch->ncp; + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); @@ -2965,7 +3336,7 @@ retry: if (error == 0) { hammer_unlock(&cursor.ip->lock); ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, - dip->hmp->asof, + hmp->asof, cursor.data->entry.localization, 0, &error); hammer_lock_sh(&cursor.ip->lock); @@ -3003,6 +3374,9 @@ retry: * * If any changes whatsoever have been made to the cursor * set EDEADLK and retry. + * + * WARNING: See warnings in hammer_unlock_cursor() + * function. */ if (error == 0 && ip && ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { @@ -3032,10 +3406,30 @@ retry: if (error == 0) { cache_setunresolved(nch); cache_setvp(nch, NULL); - /* XXX locking */ - if (ip && ip->vp) { - hammer_knote(ip->vp, NOTE_DELETE); - cache_inval_vp(ip->vp, CINV_DESTROY); + + /* + * NOTE: ip->vp, if non-NULL, cannot be directly + * referenced without formally acquiring the + * vp since the vp might have zero refs on it, + * or in the middle of a reclaim, etc. + * + * NOTE: The cache_setunresolved() can rip the vp + * out from under us since the vp may not have + * any refs, in which case ip->vp will be NULL + * from the outset. + */ + while (ip && ip->vp) { + struct vnode *vp; + + error = hammer_get_vnode(ip, &vp); + if (error == 0 && vp) { + vn_unlock(vp); + hammer_knote(ip->vp, NOTE_DELETE); + cache_inval_vp(ip->vp, CINV_DESTROY); + vrele(vp); + break; + } + kprintf("Debug: HAMMER ip/vp race1 avoided\n"); } } if (ip) @@ -3054,7 +3448,6 @@ retry: ************************************************************************ * */ - static int hammer_vop_fifoclose (struct vop_close_args *ap) { @@ -3105,11 +3498,11 @@ static int filt_hammerwrite(struct knote *kn, long hint); static int filt_hammervnode(struct knote *kn, long hint); static struct filterops hammerread_filtops = - { 1, NULL, filt_hammerdetach, filt_hammerread }; + { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; static struct filterops hammerwrite_filtops = - { 1, NULL, filt_hammerdetach, filt_hammerwrite }; + { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; static struct filterops hammervnode_filtops = - { 1, NULL, filt_hammerdetach, filt_hammervnode }; + { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; static int @@ -3117,7 +3510,6 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; - lwkt_tokref vlock; switch (kn->kn_filter) { case EVFILT_READ: @@ -3130,14 +3522,12 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap) kn->kn_fop = &hammervnode_filtops; break; default: - return (1); + return (EOPNOTSUPP); } kn->kn_hook = (caddr_t)vp; - lwkt_gettoken(&vlock, &vp->v_token); - SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext); - lwkt_reltoken(&vlock); + knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); return(0); } @@ -3146,12 +3536,8 @@ static void filt_hammerdetach(struct knote *kn) { struct vnode *vp = (void *)kn->kn_hook; - lwkt_tokref vlock; - lwkt_gettoken(&vlock, &vp->v_token); - SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note, - kn, knote, kn_selnext); - lwkt_reltoken(&vlock); + knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); } static int @@ -3159,12 +3545,19 @@ filt_hammerread(struct knote *kn, long hint) { struct vnode *vp = (void *)kn->kn_hook; hammer_inode_t ip = VTOI(vp); + hammer_mount_t hmp = ip->hmp; + off_t off; if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return(1); } - kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset; + lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ + off = ip->ino_data.size - kn->kn_fp->f_offset; + kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; + lwkt_reltoken(&hmp->fs_token); + if (kn->kn_sfflags & NOTE_OLDAPI) + return(1); return (kn->kn_data != 0); }