brelvp(bp);
}
-/*
- * vfs_bio_awrite:
- *
- * Implement clustered async writes for clearing out B_DELWRI buffers.
- * This is much better then the old way of writing only one buffer at
- * a time. Note that we may not be presented with the buffers in the
- * correct order, so we search for the cluster in both directions.
- *
- * The buffer is locked on call.
- */
-int
-vfs_bio_awrite(struct buf *bp)
-{
- int i;
- int j;
- off_t loffset = bp->b_loffset;
- struct vnode *vp = bp->b_vp;
- int nbytes;
- struct buf *bpa;
- int nwritten;
- int size;
-
- /*
- * right now we support clustered writing only to regular files. If
- * we find a clusterable block we could be in the middle of a cluster
- * rather then at the beginning.
- *
- * NOTE: b_bio1 contains the logical loffset and is aliased
- * to b_loffset. b_bio2 contains the translated block number.
- */
- if ((vp->v_type == VREG) &&
- (vp->v_mount != 0) && /* Only on nodes that have the size info */
- (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
-
- size = vp->v_mount->mnt_stat.f_iosize;
-
- for (i = size; i < MAXPHYS; i += size) {
- if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) &&
- BUF_REFCNT(bpa) == 0 &&
- ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
- (B_DELWRI | B_CLUSTEROK)) &&
- (bpa->b_bufsize == size)) {
- if ((bpa->b_bio2.bio_offset == NOOFFSET) ||
- (bpa->b_bio2.bio_offset !=
- bp->b_bio2.bio_offset + i))
- break;
- } else {
- break;
- }
- }
- for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) {
- if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) &&
- BUF_REFCNT(bpa) == 0 &&
- ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
- (B_DELWRI | B_CLUSTEROK)) &&
- (bpa->b_bufsize == size)) {
- if ((bpa->b_bio2.bio_offset == NOOFFSET) ||
- (bpa->b_bio2.bio_offset !=
- bp->b_bio2.bio_offset - j))
- break;
- } else {
- break;
- }
- }
- j -= size;
- nbytes = (i + j);
-
- /*
- * this is a possible cluster write
- */
- if (nbytes != size) {
- BUF_UNLOCK(bp);
- nwritten = cluster_wbuild(vp, size,
- loffset - j, nbytes);
- return nwritten;
- }
- }
-
- /*
- * default (old) behavior, writing out only one block
- *
- * XXX returns b_bufsize instead of b_bcount for nwritten?
- */
- nwritten = bp->b_bufsize;
- bremfree(bp);
- bawrite(bp);
-
- return nwritten;
-}
-
/*
* getnewbuf:
*
*
* NOTE: buf_checkwrite is MPSAFE.
*/
+ bremfree(bp);
if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
- bremfree(bp);
brelse(bp);
} else if (bp->b_flags & B_ERROR) {
tsleep(bp, 0, "bioer", 1);
bp->b_flags &= ~B_AGE;
- vfs_bio_awrite(bp);
+ cluster_awrite(bp);
} else {
bp->b_flags |= B_AGE;
- vfs_bio_awrite(bp);
+ cluster_awrite(bp);
}
spin_lock(&bufqspin);
++r;
* still be fully cached after reinstantiation to be returned.
*/
struct buf *
-getcacheblk(struct vnode *vp, off_t loffset, int blksize)
+getcacheblk(struct vnode *vp, off_t loffset, int blksize, int blkflags)
{
struct buf *bp;
+ int fndflags = (blkflags & GETBLK_NOWAIT) ? FINDBLK_NBLOCK : 0;
if (blksize) {
- bp = getblk(vp, loffset, blksize, 0, 0);
+ bp = getblk(vp, loffset, blksize, blkflags, 0);
if (bp) {
if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) ==
B_CACHE) {
}
}
} else {
- bp = findblk(vp, loffset, 0);
+ bp = findblk(vp, loffset, fndflags);
if (bp) {
if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) ==
B_CACHE) {
struct buf *fbp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);
+static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
+ off_t start_loffset, int bytes);
static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
start_loffset -= len;
/* fall through */
case 1:
- r = cluster_wbuild(vp, blksize, start_loffset, len);
+ r = cluster_wbuild(vp, NULL, blksize, start_loffset, len);
/* fall through */
default:
/* fall through */
* flush.
*/
cursize = vp->v_lastw - vp->v_cstart + blksize;
- if (bp->b_loffset + blksize != filesize ||
+ if (bp->b_loffset + blksize < filesize ||
loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, blksize,
* existing cluster.
*/
if ((vp->v_type == VREG) &&
- bp->b_loffset + blksize != filesize &&
+ bp->b_loffset + blksize < filesize &&
(bp->b_bio2.bio_offset == NOOFFSET) &&
(VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
bp->b_bio2.bio_offset == NOOFFSET)) {
vp->v_lasta = bp->b_bio2.bio_offset;
}
+/*
+ * This is the clustered version of bawrite(). It works similarly to
+ * cluster_write() except that I/O on the buffer is guaranteed to occur.
+ */
+int
+cluster_awrite(struct buf *bp)
+{
+ int total;
+
+ /*
+ * Don't bother if it isn't clusterable.
+ */
+ if ((bp->b_flags & B_CLUSTEROK) == 0 ||
+ bp->b_vp == NULL ||
+ (bp->b_vp->v_flag & VOBJBUF) == 0) {
+ total = bp->b_bufsize;
+ bawrite(bp);
+ return (total);
+ }
+
+ total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
+ bp->b_loffset, vmaxiosize(bp->b_vp));
+ if (bp)
+ bawrite(bp);
+
+ return total;
+}
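A minimal caller sketch, assuming a locked, delayed-write buffer already taken
off its queue (this is the pattern the rest of this change adopts; nwritten is
an illustrative local, not part of the patch):

	/*
	 * bp is locked and dirty (B_DELWRI).  cluster_awrite() either
	 * folds it into a larger cluster via cluster_wbuild() or falls
	 * back to bawrite(); either way the write is initiated and the
	 * number of bytes queued for I/O is returned.
	 */
	bremfree(bp);
	nwritten = cluster_awrite(bp);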
/*
* This is an awful lot like cluster_rbuild...wish they could be combined.
* The last lbn argument is the current block on which I/O is being
* performed. Check to see that it doesn't fall in the middle of
* the current block (if last_bp == NULL).
+ *
+ * cluster_wbuild() normally does not guarantee anything. If bpp is
+ * non-NULL and cluster_wbuild() is able to incorporate the passed
+ * buffer into the I/O, it will set *bpp to NULL; otherwise it will
+ * leave *bpp alone and the caller must dispose of the buffer.
*/
-int
-cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
+static int
+cluster_wbuild(struct vnode *vp, struct buf **bpp,
+ int blksize, off_t start_loffset, int bytes)
{
struct buf *bp, *tbp;
int i, j;
int totalwritten = 0;
+ int must_initiate;
int maxiosize = vmaxiosize(vp);
while (bytes > 0) {
/*
- * If the buffer is not delayed-write (i.e. dirty), or it
- * is delayed-write but either locked or inval, it cannot
- * partake in the clustered write.
+ * If the buffer matches the passed locked & removed buffer,
+ * we use the passed buffer (which might not be B_DELWRI).
+ *
+ * Otherwise locate the buffer and determine if it is
+ * compatible.
*/
- tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
- if (tbp == NULL ||
- (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI ||
- (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
- if (tbp)
- BUF_UNLOCK(tbp);
- start_loffset += blksize;
- bytes -= blksize;
- continue;
+ if (bpp && (*bpp)->b_loffset == start_loffset) {
+ tbp = *bpp;
+ *bpp = NULL;
+ bpp = NULL;
+ } else {
+ tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
+ if (tbp == NULL ||
+ (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
+ B_DELWRI ||
+ (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
+ if (tbp)
+ BUF_UNLOCK(tbp);
+ start_loffset += blksize;
+ bytes -= blksize;
+ continue;
+ }
+ bremfree(tbp);
}
- bremfree(tbp);
KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
/*
* From this location in the file, scan forward to see
* if there are buffers with adjacent data that need to
* be written as well.
+ *
+ * IO *must* be initiated on index 0 at this point
+ * (particularly when called from cluster_awrite()).
*/
for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
- if (i != 0) { /* If not the first buffer */
+ if (i == 0) {
+ must_initiate = 1;
+ } else {
+ /*
+ * Not first buffer.
+ */
+ must_initiate = 0;
tbp = findblk(vp, start_loffset,
FINDBLK_NBLOCK);
/*
B_INVAL | B_DELWRI | B_NEEDCOMMIT))
!= (B_DELWRI | B_CLUSTEROK |
(bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
- (tbp->b_flags & B_LOCKED) ||
- (LIST_FIRST(&tbp->b_dep) &&
- buf_checkwrite(tbp))
+ (tbp->b_flags & B_LOCKED)
) {
BUF_UNLOCK(tbp);
break;
* Check that the combined cluster
* would make sense with regard to pages
* and would not be too large
+ *
+ * WARNING! buf_checkwrite() must be the last
+ * check made. If it returns 0 then
+ * we must initiate the I/O.
*/
if ((tbp->b_bcount != blksize) ||
((bp->b_bio2.bio_offset + i) !=
tbp->b_bio2.bio_offset) ||
((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
- (maxiosize / PAGE_SIZE))) {
+ (maxiosize / PAGE_SIZE)) ||
+ (LIST_FIRST(&tbp->b_dep) &&
+ buf_checkwrite(tbp))
+ ) {
BUF_UNLOCK(tbp);
break;
}
+ if (LIST_FIRST(&tbp->b_dep))
+ must_initiate = 1;
/*
* Ok, it's passed all the tests,
* so remove it from the free list
*/
bremfree(tbp);
KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
- } /* end of code for non-first buffers only */
+ }
/*
* If the IO is via the VM then we do some
if (tbp->b_flags & B_VMIO) {
vm_page_t m;
- if (i != 0) { /* if not first buffer */
- for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
+ /*
+ * Try to avoid deadlocks with the VM system.
+ * However, we cannot abort the I/O if
+ * must_initiate is non-zero.
+ */
+ if (must_initiate == 0) {
+ for (j = 0;
+ j < tbp->b_xio.xio_npages;
+ ++j) {
m = tbp->b_xio.xio_pages[j];
if (m->flags & PG_BUSY) {
bqrelse(tbp);
buf_start(tbp);
}
finishcluster:
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ (vm_page_t *)bp->b_xio.xio_pages,
+ bp->b_xio.xio_npages);
if (bp->b_bufsize > bp->b_kvasize) {
- panic(
- "cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
- bp->b_bufsize, bp->b_kvasize);
+ panic("cluster_wbuild: b_bufsize(%d) "
+ "> b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
}
totalwritten += bp->b_bufsize;
bp->b_dirtyoff = 0;
}
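Callers that pass their own buffer via bpp must honor the ownership rule noted
above; a sketch of the expected pattern (mirroring cluster_awrite(), with
total as an illustrative local):

	total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
			       bp->b_loffset, vmaxiosize(bp->b_vp));
	if (bp)			/* not incorporated into a cluster */
		bawrite(bp);	/* caller still disposes of the buffer */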
/*
- * Note that vfs_bio_awrite expects buffers to reside
- * on a queue, while bwrite() and brelse() do not.
- *
* NOTE: NO B_LOCKED CHECK. Also no buf_checkwrite()
* check. This code will write out the buffer, period.
*/
+ bremfree(bp);
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
(info->flags & V_SAVE)) {
- if (bp->b_flags & B_CLUSTEROK) {
- vfs_bio_awrite(bp);
- } else {
- bremfree(bp);
- bawrite(bp);
- }
+ cluster_awrite(bp);
} else if (info->flags & V_SAVE) {
/*
* Cannot set B_NOCACHE on a clean buffer as this will
* destroy the VM backing store which might actually
* be dirty (and unsynchronized).
*/
- bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
brelse(bp);
} else {
- bremfree(bp);
bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
brelse(bp);
}
* this to support limited MNT_LAZY flushes.
*/
vp->v_lazyw = bp->b_loffset;
- if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
- info->lazycount += vfs_bio_awrite(bp);
- } else {
- info->lazycount += bp->b_bufsize;
- bremfree(bp);
- bawrite(bp);
- }
+ bremfree(bp);
+ info->lazycount += cluster_awrite(bp);
waitrunningbufspace();
vm_wait_nominal();
if (info->lazylimit && info->lazycount >= info->lazylimit)
int bowrite (struct buf *);
void brelse (struct buf *);
void bqrelse (struct buf *);
-int vfs_bio_awrite (struct buf *);
+int cluster_awrite (struct buf *);
struct buf *getpbuf (int *);
struct buf *getpbuf_kva (int *);
int inmem (struct vnode *, off_t);
struct buf *findblk (struct vnode *, off_t, int);
struct buf *getblk (struct vnode *, off_t, int, int, int);
-struct buf *getcacheblk (struct vnode *, off_t, int);
+struct buf *getcacheblk (struct vnode *, off_t, int, int);
struct buf *geteblk (int);
struct buf *getnewbuf(int, int, int, int);
void bqhold(struct buf *bp);
void cluster_append(struct bio *, struct buf *);
int cluster_readx (struct vnode *, off_t, off_t, int,
size_t, size_t, struct buf **);
-int cluster_wbuild (struct vnode *, int, off_t, int);
void cluster_write (struct buf *, off_t, int, int);
int physread (struct dev_read_args *);
int physwrite (struct dev_write_args *);
case HAMMER_STRUCTURE_UNDO_BUFFER:
if (io->released == 0) {
io->released = 1;
+ bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
}
break;
lwkt_gettoken(&hmp->io_token);
TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
lwkt_reltoken(&hmp->io_token);
- bawrite(bp);
+ cluster_awrite(bp);
hammer_io_flush_mark(io->volume);
}
}
bp->b_flags &= ~B_IODEBUG;
- /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
+ bp->b_flags |= B_CLUSTEROK;
n = blksize - offset;
if (n > uio->uio_resid)
n = uio->uio_resid;
struct uio *uio;
int offset;
off_t base_offset;
+ int64_t cluster_eof;
struct buf *bp;
int kflags;
int error;
}
kflags |= NOTE_WRITE;
hammer_stats_file_write += n;
- /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
+ bp->b_flags |= B_CLUSTEROK;
if (ip->ino_data.size < uio->uio_offset) {
ip->ino_data.size = uio->uio_offset;
flags = HAMMER_INODE_SDIRTY;
* configure a HAMMER file as swap, or when HAMMER
* is serving NFS (for commits). Ick ick.
*/
- bp->b_flags |= B_AGE;
+ bp->b_flags |= B_AGE | B_CLUSTEROK;
if (ap->a_ioflag & IO_SYNC) {
bwrite(bp);
} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
bawrite(bp);
} else if (ap->a_ioflag & IO_ASYNC) {
bawrite(bp);
+ } else if (hammer_cluster_enable &&
+ !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
+ if (base_offset < HAMMER_XDEMARC)
+ cluster_eof = hammer_blockdemarc(base_offset,
+ ip->ino_data.size);
+ else
+ cluster_eof = ip->ino_data.size;
+ cluster_write(bp, cluster_eof, blksize, seqcount);
} else {
-#if 0
- if (offset + n == blksize) {
- if (hammer_cluster_enable == 0 ||
- (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
- bawrite(bp);
- } else {
- cluster_write(bp, ip->ino_data.size,
- blksize, seqcount);
- }
- } else {
-#endif
bdwrite(bp);
}
}
*/
offset = (size_t)uio->uio_offset & BMASK;
base_offset = (off_t)uio->uio_offset - offset;
- bp = getcacheblk(vp, base_offset, BSIZE);
- if (bp == NULL)
- {
+ bp = getcacheblk(vp, base_offset, BSIZE, 0);
+ if (bp == NULL) {
lwkt_gettoken(&vp->v_mount->mnt_token);
error = bread(vp, base_offset, BSIZE, &bp);
if (error) {