/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>
#include <sys/buf2.h>
#include <vm/vm_page2.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
			     int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
			off_t doffset, int blksize, int run,
			struct buf *fbp, int doasync);
static void cluster_callback (struct bio *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
int
cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
	     int blksize, int totread, int seqcount, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra, racluster;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
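	/*
	 * racluster is the maximum number of blocks that fit in a single
	 * device I/O.  maxra caps the total read-ahead window at two such
	 * clusters plus the size of this request, bounded above by MAXRA
	 * and by 1/8 of the system buffer count.
	 */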
	racluster = vmaxiosize(vp) / blksize;
	maxra = 2 * racluster + (totread / blksize);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			struct buf *tbp;

			bp->b_flags &= ~B_RAM;

			/*
			 * Set read-ahead-mark only if we can passively lock
			 * the buffer.  Note that with these flags the bp
			 * could very well exist even though NULL is returned.
			 */
			for (i = 1; i < maxra; i++) {
				tbp = findblk(vp, loffset + i * blksize,
					      FINDBLK_NBLOCK);
				if (tbp == NULL)
					break;
				if (((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1))) {
					tbp->b_flags |= B_RAM;
				}
				BUF_UNLOCK(tbp);
			}
			if (i >= maxra)
				return 0;
			loffset += i * blksize;
		}
		reqbp = bp = NULL;
	} else {
		off_t firstread = bp->b_loffset;
		int nblks;

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));
		if (firstread + totread > filesize)
			totread = (int)(filesize - firstread);
		nblks = totread / blksize;
		if (nblks) {
			int burstbytes;

			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (burstbytes < blksize * 2)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp, 0);
			loffset += bp->b_bufsize;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_RAM;
			loffset += blksize;
		}
	}

	/*
	 * Handle the synchronous read.  This only occurs if B_CACHE was
	 * not set.  bp (and rbp) could be either a cluster bp or a normal
	 * bp depending on what cluster_rbuild() decided to do.  If it is
	 * a cluster bp, vfs_busy_pages() has already been called.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%lld,%d,%d) ",
			    bp->b_loffset, bp->b_bcount, seqcount);
#endif
		bp->b_cmd = BUF_CMD_READ;
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		if ((bp->b_flags & B_ASYNC) || bp->b_bio1.bio_done != NULL)
			BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);
		if (bp->b_flags & B_ERROR) {
			if ((error = bp->b_error) == 0)
				error = EIO;
		} else {
			error = 0;
		}
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	rbp = NULL;
	if (!error &&
	    seqcount &&
	    loffset < origoffset + seqcount * blksize &&
	    loffset + blksize <= filesize
	) {
		int nblksread;
		int ntoread;
		int burstbytes;
		int tmp_error;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * An error from the read-ahead bmap has nothing to do
		 * with the caller's original request.
		 */
		tmp_error = VOP_BMAP(vp, loffset, &doffset,
				     &burstbytes, NULL, BUF_CMD_READ);
		if (tmp_error || doffset == NOOFFSET) {
			rbp->b_flags |= B_INVAL;
			brelse(rbp);
			rbp = NULL;
			goto no_read_ahead;
		}
		ntoread = burstbytes / blksize;
		nblksread = (totread + blksize - 1) / blksize;
		if (seqcount < nblksread)
			seqcount = nblksread;
		if (ntoread > seqcount)
			ntoread = seqcount;

		rbp->b_flags |= B_RAM/* | B_AGE*/;
		if (burstbytes) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     ntoread, rbp, 1);
		} else {
			rbp->b_bio2.bio_offset = doffset;
		}
#if defined(CLUSTERDEBUG)
		if (rcluster) {
			if (bp)
				kprintf("A+(%lld,%d,%lld,%d) ",
				    rbp->b_loffset, rbp->b_bcount,
				    rbp->b_loffset - origoffset,
				    seqcount);
			else
				kprintf("A(%lld,%d,%lld,%d) ",
				    rbp->b_loffset, rbp->b_bcount,
				    rbp->b_loffset - origoffset,
				    seqcount);
		}
#endif
		rbp->b_flags &= ~(B_ERROR|B_INVAL);
		rbp->b_flags |= B_ASYNC;
		rbp->b_cmd = BUF_CMD_READ;

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);			/* B_ASYNC */
		vn_strategy(vp, &rbp->b_bio1);
	}
no_read_ahead:

	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
	       off_t doffset, int blksize, int run, struct buf *fbp,
	       int doasync)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		return tbp;
	}

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == NULL)
		return tbp;

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_ASYNC | B_CLUSTER | B_VMIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
				     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if (i == 1 || i == (run - 1))
				tbp->b_flags |= B_RAM;

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * The first buffer is set up async only if doasync is
		 * specified; all other buffers in the cluster are always
		 * set up async.  This way the caller can decide how to
		 * deal with the requested buffer.
		 */
		if (i || doasync)
			tbp->b_flags |= B_ASYNC;
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;

			m = tbp->b_xio.xio_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_xio.xio_pages[j] = bogus_page;
		}

		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != blksize)
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n",
				tbp->b_bcount, blksize);
		if (tbp->b_bufsize != blksize)
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n",
				tbp->b_bufsize, blksize);
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page.
	 */
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		    bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
		     bp->b_xio.xio_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 */
	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
		bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR|B_INVAL);
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		biodone(&tbp->b_bio1);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		r = cluster_wbuild(vp, blksize, start_loffset, len);
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (loffset == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
	    bp->b_bio2.bio_offset == NOOFFSET ||
	    (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
		maxclen = vmaxiosize(vp);
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + blksize;
			if (bp->b_loffset + blksize != filesize ||
			    loffset != vp->v_lastw + blksize ||
			    vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp, blksize);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					vp->v_lastw = loffset;
					vp->v_lasta = bp->b_bio2.bio_offset;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize != filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset,
			      &maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_bio2.bio_offset;
			vp->v_cstart = loffset + blksize;
			vp->v_lastw = loffset;
			return;
		}
		if (maxclen > blksize)
			vp->v_clen = maxclen - blksize;
		else
			vp->v_clen = 0;
		if (!async && vp->v_clen == 0) { /* I/O not contiguous */
			vp->v_cstart = loffset + blksize;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, vp->v_cstart,
					  vp->v_clen + blksize);
		vp->v_clen = 0;
		vp->v_cstart = loffset + blksize;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = loffset;
	vp->v_lasta = bp->b_bio2.bio_offset;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
		if (tbp == NULL ||
		    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI ||
		    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
			if (tbp)
				BUF_UNLOCK(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}
		bremfree(tbp);
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != blksize) ||
		    (bytes == blksize) ||
		    ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_CLUSTER | B_BNOCLIP |
			(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
			if (i != 0) { /* If not the first buffer */
				tbp = findblk(vp, start_loffset,
					      FINDBLK_NBLOCK);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED) ||
				    (LIST_FIRST(&tbp->b_dep) &&
				     buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != blksize) ||
				  ((bp->b_bio2.bio_offset + i) !=
				    tbp->b_bio2.bio_offset) ||
				  ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
				    (maxiosize / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			} /* end of code for non-first buffers only */

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0;
					     j < tbp->b_xio.xio_npages;
					     ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					    (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			bundirty(tbp);
			tbp->b_flags &= ~B_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
			(vm_page_t *)bp->b_xio.xio_pages,
			bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			      bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_flags |= B_ASYNC;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bp->b_runningbufspace = bp->b_bufsize;
		if (bp->b_runningbufspace) {
			runningbufspace += bp->b_runningbufspace;
			++runningbufcount;
		}
		BUF_KERNPROC(bp);	/* B_ASYNC */
		vn_strategy(vp, &bp->b_bio1);

		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed in by the caller (last_bp).
 */
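/*
 * The cluster_save is allocated as a single kmalloc: the structure is
 * immediately followed by the bs_children pointer array, so a single
 * kfree(buflist, M_SEGMENT) in the caller releases both.
 */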
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;

	len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
			  M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (loffset = vp->v_cstart, i = 0; i < len; (loffset += blksize), i++) {
		(void) bread(vp, loffset, last_bp->b_bcount, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
				 &bp->b_bio2.bio_offset,
				 NULL, NULL, BUF_CMD_WRITE);
		}
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}
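
#if 0
/*
 * Illustrative sketch only (never compiled): roughly how a filesystem
 * read path might choose between cluster_read() and a plain bread(),
 * using the interfaces defined above.  The function name
 * example_read_block() and the parameters ip_size, resid and
 * want_seqcount are hypothetical and do not exist in the kernel; a
 * real caller (e.g. a UFS-like VOP_READ implementation) derives the
 * sequential-access heuristic from its own file state.
 */
static int
example_read_block(struct vnode *vp, off_t ip_size, off_t loffset,
		   int blksize, int resid, int want_seqcount,
		   struct buf **bpp)
{
	int error;

	if (want_seqcount > 1) {
		/*
		 * Access looks sequential: let cluster_read() issue a
		 * larger contiguous I/O and prime read-ahead buffers.
		 */
		error = cluster_read(vp, ip_size, loffset, blksize,
				     resid, want_seqcount, bpp);
	} else {
		/*
		 * Access looks random: just read the single block.
		 */
		error = bread(vp, loffset, blksize, bpp);
	}
	return (error);
}
#endif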