| 1 | /*- |
| 2 | * Copyright (c) 1993 |
| 3 | * The Regents of the University of California. All rights reserved. |
| 4 | * Modifications/enhancements: |
| 5 | * Copyright (c) 1995 John S. Dyson. All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * 1. Redistributions of source code must retain the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer. |
| 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
| 15 | * 3. All advertising materials mentioning features or use of this software |
| 16 | * must display the following acknowledgement: |
| 17 | * This product includes software developed by the University of |
| 18 | * California, Berkeley and its contributors. |
| 19 | * 4. Neither the name of the University nor the names of its contributors |
| 20 | * may be used to endorse or promote products derived from this software |
| 21 | * without specific prior written permission. |
| 22 | * |
| 23 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 24 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 26 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 27 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 28 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 29 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 32 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 33 | * SUCH DAMAGE. |
| 34 | * |
| 35 | * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 |
| 36 | * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $ |
| 37 | * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.12 2005/02/19 00:47:03 joerg Exp $ |
| 38 | */ |
| 39 | |
| 40 | #include "opt_debug_cluster.h" |
| 41 | |
| 42 | #include <sys/param.h> |
| 43 | #include <sys/systm.h> |
| 44 | #include <sys/kernel.h> |
| 45 | #include <sys/proc.h> |
| 46 | #include <sys/buf.h> |
| 47 | #include <sys/vnode.h> |
| 48 | #include <sys/malloc.h> |
| 49 | #include <sys/mount.h> |
| 50 | #include <sys/resourcevar.h> |
| 51 | #include <sys/vmmeter.h> |
| 52 | #include <vm/vm.h> |
| 53 | #include <vm/vm_object.h> |
| 54 | #include <vm/vm_page.h> |
| 55 | #include <sys/sysctl.h> |
| 56 | #include <sys/buf2.h> |
| 57 | #include <vm/vm_page2.h> |
| 58 | |
| 59 | #if defined(CLUSTERDEBUG) |
| 61 | static int rcluster = 0; |
| 62 | SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, |
| | "Enable debug printfs for read clustering"); |
| 63 | #endif |
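| | /* |
| | * When the kernel is built with the CLUSTERDEBUG option, the printf |
| | * tracing below can be toggled at runtime via the debug.rcluster |
| | * sysctl, e.g. (illustrative): sysctl debug.rcluster=1 |
| | */ |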
| 64 | |
| 65 | static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer"); |
| 66 | |
| 67 | static struct cluster_save * |
| 68 | cluster_collectbufs (struct vnode *vp, struct buf *last_bp); |
| 69 | static struct buf * |
| 70 | cluster_rbuild (struct vnode *vp, u_quad_t filesize, daddr_t lbn, |
| 71 | daddr_t blkno, long size, int run, struct buf *fbp); |
| 72 | |
| 73 | static int write_behind = 1; |
| 74 | SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, |
| | "Cluster write-behind: 0 disable, 1 normal, 2 backed-off"); |
| 75 | |
| 76 | extern vm_page_t bogus_page; |
| 77 | |
| 78 | extern int cluster_pbuf_freecnt; |
| 79 | |
| 80 | /* |
| 81 | * Maximum number of blocks for read-ahead. |
| 82 | */ |
| 83 | #define MAXRA 32 |
| 84 | |
| 85 | /* |
| 86 | * Replacement for bread(). If the access pattern looks sequential, |
| 87 | * read the requested block together with clustered read-ahead; |
| | * otherwise fall back to a single-block read. |
| | */ |
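| | /* |
| | * An illustrative call site (a sketch only: the surrounding read loop |
| | * and the names ip, lbn, blkoffset and seqcount are assumptions, |
| | * loosely modeled on a UFS-style VOP_READ path): |
| | * |
| | * error = cluster_read(vp, ip->i_size, lbn, size, |
| | * blkoffset + uio->uio_resid, seqcount, &bp); |
| | */ |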
| 88 | int |
| 89 | cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, |
| 90 | long size, long totread, int seqcount, struct buf **bpp) |
| 91 | { |
| 92 | struct buf *bp, *rbp, *reqbp; |
| 93 | daddr_t blkno, origblkno; |
| 94 | int error, num_ra; |
| 95 | int i; |
| 96 | int maxra, racluster; |
| 97 | long origtotread; |
| 98 | |
| 99 | error = 0; |
| 100 | |
| 101 | /* |
| 102 | * Try to limit the amount of read-ahead by a few |
| 103 | * ad-hoc parameters. This needs work!!! |
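| | * |
| | * Worked example (illustrative numbers): with mnt_iosize_max == 64K |
| | * and size == 8K, racluster == 8; a totread of 32K then gives |
| | * maxra == 2 * 8 + 4 == 20, further clamped to MAXRA and nbuf/8. |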
| 104 | */ |
| 105 | racluster = vp->v_mount->mnt_iosize_max / size; |
| 106 | maxra = 2 * racluster + (totread / size); |
| 107 | if (maxra > MAXRA) |
| 108 | maxra = MAXRA; |
| 109 | if (maxra > nbuf/8) |
| 110 | maxra = nbuf/8; |
| 111 | |
| 112 | /* |
| 113 | * get the requested block |
| 114 | */ |
| 115 | *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); |
| 116 | origblkno = lblkno; |
| 117 | origtotread = totread; |
| 118 | |
| 119 | /* |
| 120 | * if it is in the cache, then check to see if the reads have been |
| 121 | * sequential. If they have, then try some read-ahead, otherwise |
| 122 | * back-off on prospective read-aheads. |
| 123 | */ |
| 124 | if (bp->b_flags & B_CACHE) { |
| 125 | if (!seqcount) { |
| 126 | return 0; |
| 127 | } else if ((bp->b_flags & B_RAM) == 0) { |
| 128 | return 0; |
| 129 | } else { |
| 130 | int s; |
| 131 | struct buf *tbp; |
| 132 | bp->b_flags &= ~B_RAM; |
| 133 | /* |
| 134 | * We do the spl here so that there is no window |
| 135 | * between the incore() lookup and our flag |
| 136 | * manipulation of the buffer below. We opt to |
| 137 | * keep the spl out of the loop for efficiency. |
| 138 | */ |
| 139 | s = splbio(); |
| 140 | for (i = 1; i < maxra; i++) { |
| 141 | if (!(tbp = incore(vp, lblkno + i))) { |
| 142 | break; |
| 143 | } |
| 145 | |
| 146 | /* |
| 147 | * Set another read-ahead mark so we know |
| 148 | * to check again. |
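| | * |
| | * (e.g. with racluster == 8 and maxra == 20, the marks land |
| | * on i == 7, 15 and 19.) |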
| 149 | */ |
| 150 | if (((i % racluster) == (racluster - 1)) || |
| 151 | (i == (maxra - 1))) |
| 152 | tbp->b_flags |= B_RAM; |
| 153 | } |
| 154 | splx(s); |
| 155 | if (i >= maxra) { |
| 156 | return 0; |
| 157 | } |
| 158 | lblkno += i; |
| 159 | } |
| 160 | reqbp = bp = NULL; |
| 161 | } else { |
| 162 | off_t firstread = bp->b_offset; |
| 163 | |
| 164 | KASSERT(bp->b_offset != NOOFFSET, |
| 165 | ("cluster_read: no buffer offset")); |
| 166 | if (firstread + totread > filesize) |
| 167 | totread = filesize - firstread; |
| 168 | if (totread > size) { |
| 169 | int nblks = 0; |
| 170 | int ncontigafter; |
| 171 | while (totread > 0) { |
| 172 | nblks++; |
| 173 | totread -= size; |
| 174 | } |
| 175 | if (nblks == 1) |
| 176 | goto single_block_read; |
| 177 | if (nblks > racluster) |
| 178 | nblks = racluster; |
| 179 | |
| 180 | error = VOP_BMAP(vp, lblkno, NULL, |
| 181 | &blkno, &ncontigafter, NULL); |
| 182 | if (error) |
| 183 | goto single_block_read; |
| 184 | if (blkno == -1) |
| 185 | goto single_block_read; |
| 186 | if (ncontigafter == 0) |
| 187 | goto single_block_read; |
| 188 | if (ncontigafter + 1 < nblks) |
| 189 | nblks = ncontigafter + 1; |
| 190 | |
| 191 | bp = cluster_rbuild(vp, filesize, lblkno, |
| 192 | blkno, size, nblks, bp); |
| 193 | lblkno += (bp->b_bufsize / size); |
| 194 | } else { |
| 195 | single_block_read: |
| 196 | /* |
| 197 | * if it isn't in the cache, then get a chunk from |
| 198 | * disk if sequential, otherwise just get the block. |
| 199 | */ |
| 200 | bp->b_flags |= B_READ | B_RAM; |
| 201 | lblkno += 1; |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | /* |
| 206 | * if we have been doing sequential I/O, then do some read-ahead |
| 207 | */ |
| 208 | rbp = NULL; |
| 209 | if (seqcount && (lblkno < (origblkno + seqcount))) { |
| 210 | /* |
| 211 | * we now build the read-ahead buffer if it is desirable. |
| 212 | */ |
| 213 | if (((u_quad_t)(lblkno + 1) * size) <= filesize && |
| 214 | !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && |
| 215 | blkno != -1) { |
| 216 | int nblksread; |
| 217 | int ntoread = num_ra + 1; |
| 218 | nblksread = (origtotread + size - 1) / size; |
| 219 | if (seqcount < nblksread) |
| 220 | seqcount = nblksread; |
| 221 | if (seqcount < ntoread) |
| 222 | ntoread = seqcount; |
| 223 | if (num_ra) { |
| 224 | rbp = cluster_rbuild(vp, filesize, lblkno, |
| 225 | blkno, size, ntoread, NULL); |
| 226 | } else { |
| 227 | rbp = getblk(vp, lblkno, size, 0, 0); |
| 228 | rbp->b_flags |= B_READ | B_ASYNC | B_RAM; |
| 229 | rbp->b_blkno = blkno; |
| 230 | } |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | /* |
| 235 | * handle the synchronous read |
| 236 | */ |
| 237 | if (bp) { |
| 238 | #if defined(CLUSTERDEBUG) |
| 239 | if (rcluster) |
| 240 | printf("S(%ld,%ld,%d) ", |
| 241 | (long)bp->b_lblkno, bp->b_bcount, seqcount); |
| 242 | #endif |
| 243 | if ((bp->b_flags & B_CLUSTER) == 0) { |
| 244 | vfs_busy_pages(bp, 0); |
| 245 | } |
| 246 | bp->b_flags &= ~(B_ERROR|B_INVAL); |
| 247 | if (bp->b_flags & (B_ASYNC|B_CALL)) |
| 248 | BUF_KERNPROC(bp); |
| 249 | error = VOP_STRATEGY(vp, bp); |
| 250 | } |
| 251 | |
| 252 | /* |
| 253 | * and if we have read-aheads, do them too |
| 254 | */ |
| 255 | if (rbp) { |
| 256 | if (error) { |
| 257 | rbp->b_flags &= ~(B_ASYNC | B_READ); |
| 258 | brelse(rbp); |
| 259 | } else if (rbp->b_flags & B_CACHE) { |
| 260 | rbp->b_flags &= ~(B_ASYNC | B_READ); |
| 261 | bqrelse(rbp); |
| 262 | } else { |
| 263 | #if defined(CLUSTERDEBUG) |
| 264 | if (rcluster) { |
| 265 | if (bp) |
| 266 | printf("A+(%ld,%ld,%ld,%d) ", |
| 267 | (long)rbp->b_lblkno, rbp->b_bcount, |
| 268 | (long)(rbp->b_lblkno - origblkno), |
| 269 | seqcount); |
| 270 | else |
| 271 | printf("A(%ld,%ld,%ld,%d) ", |
| 272 | (long)rbp->b_lblkno, rbp->b_bcount, |
| 273 | (long)(rbp->b_lblkno - origblkno), |
| 274 | seqcount); |
| 275 | } |
| 276 | #endif |
| 277 | |
| 278 | if ((rbp->b_flags & B_CLUSTER) == 0) { |
| 279 | vfs_busy_pages(rbp, 0); |
| 280 | } |
| 281 | rbp->b_flags &= ~(B_ERROR|B_INVAL); |
| 282 | if (rbp->b_flags & (B_ASYNC|B_CALL)) |
| 283 | BUF_KERNPROC(rbp); |
| 284 | (void) VOP_STRATEGY(vp, rbp); |
| 285 | } |
| 286 | } |
| 287 | if (reqbp) |
| 288 | return (biowait(reqbp)); |
| 289 | else |
| 290 | return (error); |
| 291 | } |
| 292 | |
| 293 | /* |
| 294 | * If blocks are contiguous on disk, use this to provide clustered |
| 295 | * read ahead. We will read as many blocks as possible sequentially |
| 296 | * and then parcel them up into logical blocks in the buffer hash table. |
| 297 | */ |
| 298 | static struct buf * |
| 299 | cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, |
| 300 | daddr_t blkno, long size, int run, struct buf *fbp) |
| 301 | { |
| 302 | struct buf *bp, *tbp; |
| 303 | daddr_t bn; |
| 304 | int i, inc, j; |
| 305 | |
| 306 | KASSERT(size == vp->v_mount->mnt_stat.f_iosize, |
| 307 | ("cluster_rbuild: size %ld != f_iosize %ld\n", |
| 308 | size, vp->v_mount->mnt_stat.f_iosize)); |
| 309 | |
| 310 | /* |
| 311 | * Trim the run so the cluster does not extend past EOF; the |
| 312 | * multiply form avoids a 64-bit division. |
| | */ |
| 313 | while ((u_quad_t) size * (lbn + run) > filesize) { |
| 314 | --run; |
| 315 | } |
| 316 | |
| 317 | if (fbp) { |
| 318 | tbp = fbp; |
| 319 | tbp->b_flags |= B_READ; |
| 320 | } else { |
| 321 | tbp = getblk(vp, lbn, size, 0, 0); |
| 322 | if (tbp->b_flags & B_CACHE) |
| 323 | return tbp; |
| 324 | tbp->b_flags |= B_ASYNC | B_READ | B_RAM; |
| 325 | } |
| 326 | |
| 327 | tbp->b_blkno = blkno; |
| 328 | if ((tbp->b_flags & B_MALLOC) || |
| 329 | ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) |
| 330 | return tbp; |
| 331 | |
| 332 | bp = trypbuf(&cluster_pbuf_freecnt); |
| 333 | if (bp == NULL) |
| 334 | return tbp; |
| 335 | |
| 336 | /* |
| 337 | * We are synthesizing a buffer out of vm_page_t's, but |
| 338 | * if the block size is not page aligned then the starting |
| 339 | * address may not be either. Inherit the b_data offset |
| 340 | * from the original buffer. |
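| | * |
| | * (Illustration, assuming 4K pages: if tbp->b_data begins 0x800 |
| | * bytes into its page, tbp->b_data & PAGE_MASK yields 0x800 and |
| | * the synthesized buffer's b_data inherits that same offset.) |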
| 341 | */ |
| 342 | bp->b_data = (char *)((vm_offset_t)bp->b_data | |
| 343 | ((vm_offset_t)tbp->b_data & PAGE_MASK)); |
| 344 | bp->b_flags = B_ASYNC | B_READ | B_CALL | B_CLUSTER | B_VMIO; |
| 345 | bp->b_iodone = cluster_callback; |
| 346 | bp->b_blkno = blkno; |
| 347 | bp->b_lblkno = lbn; |
| 348 | bp->b_offset = tbp->b_offset; |
| 349 | KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); |
| 350 | pbgetvp(vp, bp); |
| 351 | |
| 352 | TAILQ_INIT(&bp->b_cluster.cluster_head); |
| 353 | |
| 354 | bp->b_bcount = 0; |
| 355 | bp->b_bufsize = 0; |
| 356 | bp->b_xio.xio_npages = 0; |
| 357 | |
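| | /* stride between successive logical blocks, in DEV_BSIZE disk blocks */ |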
| 358 | inc = btodb(size); |
| 359 | for (bn = blkno, i = 0; i < run; ++i, bn += inc) { |
| 360 | if (i != 0) { |
| 361 | if ((bp->b_xio.xio_npages * PAGE_SIZE) + |
| 362 | round_page(size) > vp->v_mount->mnt_iosize_max) { |
| 363 | break; |
| 364 | } |
| 365 | |
| 366 | /* |
| 367 | * Shortcut some checks and try to avoid buffers that |
| 368 | * would block in the lock. The same checks have to |
| 369 | * be made again after we officially get the buffer. |
| 370 | */ |
| 371 | if ((tbp = incore(vp, lbn + i)) != NULL) { |
| 372 | if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) |
| 373 | break; |
| 374 | BUF_UNLOCK(tbp); |
| 375 | |
| 376 | for (j = 0; j < tbp->b_xio.xio_npages; j++) { |
| 377 | if (tbp->b_xio.xio_pages[j]->valid) |
| 378 | break; |
| 379 | } |
| 380 | |
| 381 | if (j != tbp->b_xio.xio_npages) |
| 382 | break; |
| 383 | |
| 384 | if (tbp->b_bcount != size) |
| 385 | break; |
| 386 | } |
| 387 | |
| 388 | tbp = getblk(vp, lbn + i, size, 0, 0); |
| 389 | |
| 390 | /* |
| 391 | * Stop scanning if the buffer is fully valid |
| 392 | * (marked B_CACHE), or locked (may be doing a |
| 393 | * background write), or if the buffer is not |
| 394 | * VMIO backed. The clustering code can only deal |
| 395 | * with VMIO-backed buffers. |
| 396 | */ |
| 397 | if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || |
| 398 | (tbp->b_flags & B_VMIO) == 0) { |
| 399 | bqrelse(tbp); |
| 400 | break; |
| 401 | } |
| 402 | |
| 403 | /* |
| 404 | * The buffer must be completely invalid in order to |
| 405 | * take part in the cluster. If it is partially valid |
| 406 | * then we stop. |
| 407 | */ |
| 408 | for (j = 0; j < tbp->b_xio.xio_npages; j++) { |
| 409 | if (tbp->b_xio.xio_pages[j]->valid) |
| 410 | break; |
| 411 | } |
| 412 | if (j != tbp->b_xio.xio_npages) { |
| 413 | bqrelse(tbp); |
| 414 | break; |
| 415 | } |
| 416 | |
| 417 | /* |
| 418 | * Set a read-ahead mark as appropriate |
| 419 | */ |
| 420 | if ((fbp && (i == 1)) || (i == (run - 1))) |
| 421 | tbp->b_flags |= B_RAM; |
| 422 | |
| 423 | /* |
| 424 | * Set the buffer up for an async read (XXX should |
| 425 | * we do this only if we do not wind up brelse()ing?). |
| 426 | * Set the block number if it isn't set, otherwise |
| 427 | * if it is make sure it matches the block number we |
| 428 | * expect. |
| 429 | */ |
| 430 | tbp->b_flags |= B_READ | B_ASYNC; |
| 431 | if (tbp->b_blkno == tbp->b_lblkno) { |
| 432 | tbp->b_blkno = bn; |
| 433 | } else if (tbp->b_blkno != bn) { |
| 434 | brelse(tbp); |
| 435 | break; |
| 436 | } |
| 437 | } |
| 438 | /* |
| 439 | * XXX fbp from caller may not be B_ASYNC, but we are going |
| 440 | * to biodone() it in cluster_callback() anyway |
| 441 | */ |
| 442 | BUF_KERNPROC(tbp); |
| 443 | TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, |
| 444 | tbp, b_cluster.cluster_entry); |
| 445 | for (j = 0; j < tbp->b_xio.xio_npages; j += 1) { |
| 446 | vm_page_t m; |
| 447 | m = tbp->b_xio.xio_pages[j]; |
| 448 | vm_page_io_start(m); |
| 449 | vm_object_pip_add(m->object, 1); |
| 450 | if ((bp->b_xio.xio_npages == 0) || |
| 451 | (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) { |
| 452 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
| 453 | bp->b_xio.xio_npages++; |
| 454 | } |
| 455 | if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) |
| 456 | tbp->b_xio.xio_pages[j] = bogus_page; |
| 457 | } |
| 458 | /* |
| 459 | * Don't inherit tbp->b_bufsize as it may be larger due to |
| 460 | * a non-page-aligned size. Instead just aggregate using |
| 461 | * 'size'. |
| 462 | */ |
| 466 | if (tbp->b_bcount != size) |
| 467 | printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); |
| 468 | if (tbp->b_bufsize != size) |
| 469 | printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); |
| 470 | bp->b_bcount += size; |
| 471 | bp->b_bufsize += size; |
| 472 | } |
| 473 | |
| 474 | /* |
| 475 | * Fully valid pages in the cluster are already good and do not need |
| 476 | * to be re-read from disk. Replace the page with bogus_page |
| 477 | */ |
| 478 | for (j = 0; j < bp->b_xio.xio_npages; j++) { |
| 479 | if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) == |
| 480 | VM_PAGE_BITS_ALL) { |
| 481 | bp->b_xio.xio_pages[j] = bogus_page; |
| 482 | } |
| 483 | } |
| 484 | if (bp->b_bufsize > bp->b_kvasize) |
| 485 | panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)", |
| 486 | bp->b_bufsize, bp->b_kvasize); |
| 487 | bp->b_kvasize = bp->b_bufsize; |
| 488 | |
| 489 | pmap_qenter(trunc_page((vm_offset_t) bp->b_data), |
| 490 | (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages); |
| 491 | return (bp); |
| 492 | } |
| 493 | |
| 494 | /* |
| 495 | * Cleanup after a clustered read or write. |
| 496 | * Tear the synthesized cluster buffer back down into its component |
| 497 | * buffers, propagating any I/O error to each of them, and complete |
| 498 | * their I/O. |
| 499 | */ |
| 500 | void |
| 501 | cluster_callback(struct buf *bp) |
| 502 | { |
| 503 | struct buf *nbp, *tbp; |
| 504 | int error = 0; |
| 505 | |
| 506 | /* |
| 507 | * Must propagate errors to all the components. |
| 508 | */ |
| 509 | if (bp->b_flags & B_ERROR) |
| 510 | error = bp->b_error; |
| 511 | |
| 512 | pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages); |
| 513 | /* |
| 514 | * Move memory from the large cluster buffer into the component |
| 515 | * buffers and mark IO as done on these. |
| 516 | */ |
| 517 | for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); |
| 518 | tbp; tbp = nbp) { |
| 519 | nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); |
| 520 | if (error) { |
| 521 | tbp->b_flags |= B_ERROR; |
| 522 | tbp->b_error = error; |
| 523 | } else { |
| 524 | tbp->b_dirtyoff = tbp->b_dirtyend = 0; |
| 525 | tbp->b_flags &= ~(B_ERROR|B_INVAL); |
| 526 | /* |
| 527 | * XXX the bdwrite()/bqrelse() issued during |
| 528 | * cluster building clears B_RELBUF (see bqrelse() |
| 529 | * comment). If direct I/O was specified, we have |
| 530 | * to restore it here to allow the buffer and VM |
| 531 | * to be freed. |
| 532 | */ |
| 533 | if (tbp->b_flags & B_DIRECT) |
| 534 | tbp->b_flags |= B_RELBUF; |
| 535 | } |
| 536 | biodone(tbp); |
| 537 | } |
| 538 | relpbuf(bp, &cluster_pbuf_freecnt); |
| 539 | } |
| 540 | |
| 541 | /* |
| 542 | * cluster_wbuild_wb: |
| 543 | * |
| 544 | * Implement the write-behind policy around cluster_wbuild(): |
| 545 | * |
| 546 | * write_behind = 0 write behind disabled |
| 547 | * write_behind = 1 write behind normal (default) |
| 548 | * write_behind = 2 write behind backed-off |
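| | * |
| | * Example (illustrative numbers): with write_behind == 2, a request |
| | * for (start_lbn == 64, len == 16) is shifted back to start at lbn 48, |
| | * i.e. the cluster behind the most recent one is pushed out; when |
| | * start_lbn < len there is nothing behind to write, so it is a no-op. |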
| 549 | */ |
| 550 | |
| 551 | static __inline int |
| 552 | cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) |
| 553 | { |
| 554 | int r = 0; |
| 555 | |
| 556 | switch (write_behind) { |
| 557 | case 2: |
| 558 | if (start_lbn < len) |
| 559 | break; |
| 560 | start_lbn -= len; |
| 561 | /* fall through */ |
| 562 | case 1: |
| 563 | r = cluster_wbuild(vp, size, start_lbn, len); |
| 564 | break; |
| 565 | default: |
| 566 | break; |
| 567 | } |
| 569 | return(r); |
| 570 | } |
| 571 | |
| 572 | /* |
| 573 | * Do clustered write for FFS. |
| 574 | * |
| 575 | * Four cases: |
| 576 | * 1. Write is not sequential (write asynchronously) |
| 577 | * Write is sequential: |
| 578 | * 2. beginning of cluster - begin cluster |
| 579 | * 3. middle of a cluster - add to cluster |
| 580 | * 4. end of a cluster - asynchronously write cluster |
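| | * |
| | * Illustrative walk-through (assumed sizes): with 8K logical blocks |
| | * and a 64K mnt_iosize_max, v_clen is set to 7 when a cluster begins; |
| | * lbns v_cstart .. v_cstart + 6 are merely delayed with bdwrite(), |
| | * and when lbn reaches v_cstart + v_clen the whole 8-block cluster |
| | * is pushed out via cluster_wbuild_wb(). |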
| 581 | */ |
| 582 | void |
| 583 | cluster_write(struct buf *bp, u_quad_t filesize, int seqcount) |
| 584 | { |
| 585 | struct vnode *vp; |
| 586 | daddr_t lbn; |
| 587 | int maxclen, cursize; |
| 588 | int lblocksize; |
| 589 | int async; |
| 590 | |
| 591 | vp = bp->b_vp; |
| 592 | if (vp->v_type == VREG) { |
| 593 | async = vp->v_mount->mnt_flag & MNT_ASYNC; |
| 594 | lblocksize = vp->v_mount->mnt_stat.f_iosize; |
| 595 | } else { |
| 596 | async = 0; |
| 597 | lblocksize = bp->b_bufsize; |
| 598 | } |
| 599 | lbn = bp->b_lblkno; |
| 600 | KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); |
| 601 | |
| 602 | /* Initialize vnode to beginning of file. */ |
| 603 | if (lbn == 0) |
| 604 | vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; |
| 605 | |
| 606 | if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || |
| 607 | (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { |
| 608 | maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; |
| 609 | if (vp->v_clen != 0) { |
| 610 | /* |
| 611 | * Next block is not sequential. |
| 612 | * |
| 613 | * If we are not writing at end of file, the process |
| 614 | * seeked to another point in the file since its last |
| 615 | * write, or we have reached our maximum cluster size, |
| 616 | * then push the previous cluster. Otherwise try |
| 617 | * reallocating to make it sequential. |
| 618 | * |
| 619 | * Algorithm change: only push the previous cluster if |
| 620 | * it was sequential from the point of view of the |
| 621 | * seqcount heuristic, otherwise leave the buffer |
| 622 | * intact so we can potentially optimize the I/O |
| 623 | * later on in the buf_daemon or update daemon |
| 624 | * flush. |
| 625 | */ |
| 626 | cursize = vp->v_lastw - vp->v_cstart + 1; |
| 627 | if (((u_quad_t) bp->b_offset + lblocksize) != filesize || |
| 628 | lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { |
| 629 | if (!async && seqcount > 0) { |
| 630 | cluster_wbuild_wb(vp, lblocksize, |
| 631 | vp->v_cstart, cursize); |
| 632 | } |
| 633 | } else { |
| 634 | struct buf **bpp, **endbp; |
| 635 | struct cluster_save *buflist; |
| 636 | |
| 637 | buflist = cluster_collectbufs(vp, bp); |
| 638 | endbp = &buflist->bs_children |
| 639 | [buflist->bs_nchildren - 1]; |
| 640 | if (VOP_REALLOCBLKS(vp, buflist)) { |
| 641 | /* |
| 642 | * Failed, push the previous cluster |
| 643 | * if *really* writing sequentially |
| 644 | * in the logical file (seqcount > 1), |
| 645 | * otherwise delay it in the hopes that |
| 646 | * the low level disk driver can |
| 647 | * optimize the write ordering. |
| 648 | */ |
| 649 | for (bpp = buflist->bs_children; |
| 650 | bpp < endbp; bpp++) |
| 651 | brelse(*bpp); |
| 652 | free(buflist, M_SEGMENT); |
| 653 | if (seqcount > 1) { |
| 654 | cluster_wbuild_wb(vp, |
| 655 | lblocksize, vp->v_cstart, |
| 656 | cursize); |
| 657 | } |
| 658 | } else { |
| 659 | /* |
| 660 | * Succeeded, keep building cluster. |
| 661 | */ |
| 662 | for (bpp = buflist->bs_children; |
| 663 | bpp <= endbp; bpp++) |
| 664 | bdwrite(*bpp); |
| 665 | free(buflist, M_SEGMENT); |
| 666 | vp->v_lastw = lbn; |
| 667 | vp->v_lasta = bp->b_blkno; |
| 668 | return; |
| 669 | } |
| 670 | } |
| 671 | } |
| 672 | /* |
| 673 | * Consider beginning a cluster. If at end of file, make |
| 674 | * cluster as large as possible, otherwise find size of |
| 675 | * existing cluster. |
| 676 | */ |
| 677 | if ((vp->v_type == VREG) && |
| 678 | ((u_quad_t) bp->b_offset + lblocksize) != filesize && |
| 679 | (bp->b_blkno == bp->b_lblkno) && |
| 680 | (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || |
| 681 | bp->b_blkno == -1)) { |
| 682 | bawrite(bp); |
| 683 | vp->v_clen = 0; |
| 684 | vp->v_lasta = bp->b_blkno; |
| 685 | vp->v_cstart = lbn + 1; |
| 686 | vp->v_lastw = lbn; |
| 687 | return; |
| 688 | } |
| 689 | vp->v_clen = maxclen; |
| 690 | if (!async && maxclen == 0) { /* I/O not contiguous */ |
| 691 | vp->v_cstart = lbn + 1; |
| 692 | bawrite(bp); |
| 693 | } else { /* Wait for rest of cluster */ |
| 694 | vp->v_cstart = lbn; |
| 695 | bdwrite(bp); |
| 696 | } |
| 697 | } else if (lbn == vp->v_cstart + vp->v_clen) { |
| 698 | /* |
| 699 | * At end of cluster, write it out if seqcount tells us we |
| 700 | * are operating sequentially, otherwise let the buf or |
| 701 | * update daemon handle it. |
| 702 | */ |
| 703 | bdwrite(bp); |
| 704 | if (seqcount > 1) |
| 705 | cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); |
| 706 | vp->v_clen = 0; |
| 707 | vp->v_cstart = lbn + 1; |
| 708 | } else if (vm_page_count_severe()) { |
| 709 | /* |
| 710 | * We are low on memory, get it going NOW |
| 711 | */ |
| 712 | bawrite(bp); |
| 713 | } else { |
| 714 | /* |
| 715 | * In the middle of a cluster, so just delay the I/O for now. |
| 716 | */ |
| 717 | bdwrite(bp); |
| 718 | } |
| 719 | vp->v_lastw = lbn; |
| 720 | vp->v_lasta = bp->b_blkno; |
| 721 | } |
| 722 | |
| 723 | |
| 724 | /* |
| 725 | * This is an awful lot like cluster_rbuild...wish they could be combined. |
| 726 | * Starting at start_lbn, gather up to len delayed-write buffers |
| 727 | * that are contiguous on disk and push them out as clustered |
| 728 | * writes. Returns the number of bytes written. |
| 729 | */ |
| 730 | int |
| 731 | cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len) |
| 732 | { |
| 733 | struct buf *bp, *tbp; |
| 734 | int i, j, s; |
| 735 | int totalwritten = 0; |
| 736 | int dbsize = btodb(size); |
| 737 | |
| 738 | while (len > 0) { |
| 739 | s = splbio(); |
| 740 | /* |
| 741 | * If the buffer is not delayed-write (i.e. dirty), or it |
| 742 | * is delayed-write but either locked or inval, it cannot |
| 743 | * partake in the clustered write. |
| 744 | */ |
| 745 | if (((tbp = gbincore(vp, start_lbn)) == NULL) || |
| 746 | ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || |
| 747 | BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 748 | ++start_lbn; |
| 749 | --len; |
| 750 | splx(s); |
| 751 | continue; |
| 752 | } |
| 753 | bremfree(tbp); |
| 754 | tbp->b_flags &= ~B_DONE; |
| 755 | splx(s); |
| 756 | |
| 757 | /* |
| 758 | * Extra memory in the buffer, punt on this buffer. |
| 759 | * XXX we could handle this in most cases, but we would |
| 760 | * have to push the extra memory down to after our max |
| 761 | * possible cluster size and then potentially pull it back |
| 762 | * up if the cluster was terminated prematurely--too much |
| 763 | * hassle. |
| 764 | */ |
| 765 | if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || |
| 766 | (tbp->b_bcount != tbp->b_bufsize) || |
| 767 | (tbp->b_bcount != size) || |
| 768 | (len == 1) || |
| 769 | ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { |
| 770 | totalwritten += tbp->b_bufsize; |
| 771 | bawrite(tbp); |
| 772 | ++start_lbn; |
| 773 | --len; |
| 774 | continue; |
| 775 | } |
| 776 | |
| 777 | /* |
| 778 | * We got a pbuf to make the cluster in, so initialize it. |
| 779 | */ |
| 781 | TAILQ_INIT(&bp->b_cluster.cluster_head); |
| 782 | bp->b_bcount = 0; |
| 783 | bp->b_bufsize = 0; |
| 784 | bp->b_xio.xio_npages = 0; |
| 785 | bp->b_blkno = tbp->b_blkno; |
| 786 | bp->b_lblkno = tbp->b_lblkno; |
| 787 | bp->b_offset = tbp->b_offset; |
| 788 | |
| 789 | /* |
| 790 | * We are synthesizing a buffer out of vm_page_t's, but |
| 791 | * if the block size is not page aligned then the starting |
| 792 | * address may not be either. Inherit the b_data offset |
| 793 | * from the original buffer. |
| 794 | */ |
| 795 | bp->b_data = (char *)((vm_offset_t)bp->b_data | |
| 796 | ((vm_offset_t)tbp->b_data & PAGE_MASK)); |
| 797 | bp->b_flags |= B_CALL | B_CLUSTER | |
| 798 | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN)); |
| 799 | bp->b_iodone = cluster_callback; |
| 800 | pbgetvp(vp, bp); |
| 801 | /* |
| 802 | * From this location in the file, scan forward to see |
| 803 | * if there are buffers with adjacent data that need to |
| 804 | * be written as well. |
| 805 | */ |
| 806 | for (i = 0; i < len; ++i, ++start_lbn) { |
| 807 | if (i != 0) { /* If not the first buffer */ |
| 808 | s = splbio(); |
| 809 | /* |
| 810 | * If the adjacent data is not even in core it |
| 811 | * can't need to be written. |
| 812 | */ |
| 813 | if ((tbp = gbincore(vp, start_lbn)) == NULL) { |
| 814 | splx(s); |
| 815 | break; |
| 816 | } |
| 817 | |
| 818 | /* |
| 819 | * If it IS in core, but has different |
| 820 | * characteristics, or is locked (which |
| 821 | * means it could be undergoing a background |
| 822 | * I/O or be in a weird state), then don't |
| 823 | * cluster with it. |
| 824 | */ |
| 825 | if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | |
| 826 | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) |
| 827 | != (B_DELWRI | B_CLUSTEROK | |
| 828 | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || |
| 829 | (tbp->b_flags & B_LOCKED) || |
| 830 | BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 831 | splx(s); |
| 832 | break; |
| 833 | } |
| 834 | |
| 835 | /* |
| 836 | * Check that the combined cluster |
| 837 | * would make sense with regard to pages |
| 838 | * and would not be too large |
| 839 | */ |
| 840 | if ((tbp->b_bcount != size) || |
| 841 | ((bp->b_blkno + (dbsize * i)) != |
| 842 | tbp->b_blkno) || |
| 843 | ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) > |
| 844 | (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { |
| 845 | BUF_UNLOCK(tbp); |
| 846 | splx(s); |
| 847 | break; |
| 848 | } |
| 849 | /* |
| 850 | * Ok, it's passed all the tests, |
| 851 | * so remove it from the free list |
| 852 | * and mark it busy. We will use it. |
| 853 | */ |
| 854 | bremfree(tbp); |
| 855 | tbp->b_flags &= ~B_DONE; |
| 856 | splx(s); |
| 857 | } /* end of code for non-first buffers only */ |
| 858 | /* check for latent dependencies to be handled */ |
| 859 | if ((LIST_FIRST(&tbp->b_dep)) != NULL && |
| 860 | bioops.io_start) |
| 861 | (*bioops.io_start)(tbp); |
| 862 | /* |
| 863 | * If the IO is via the VM then we do some |
| 864 | * special VM hackery (yuck). Since the buffer's |
| 865 | * block size may not be page-aligned it is possible |
| 866 | * for a page to be shared between two buffers. We |
| 867 | * have to get rid of the duplication when building |
| 868 | * the cluster. |
| 869 | */ |
| 870 | if (tbp->b_flags & B_VMIO) { |
| 871 | vm_page_t m; |
| 872 | |
| 873 | if (i != 0) { /* if not first buffer */ |
| 874 | for (j = 0; j < tbp->b_xio.xio_npages; j += 1) { |
| 875 | m = tbp->b_xio.xio_pages[j]; |
| 876 | if (m->flags & PG_BUSY) { |
| 877 | bqrelse(tbp); |
| 878 | goto finishcluster; |
| 879 | } |
| 880 | } |
| 881 | } |
| 882 | |
| 883 | for (j = 0; j < tbp->b_xio.xio_npages; j += 1) { |
| 884 | m = tbp->b_xio.xio_pages[j]; |
| 885 | vm_page_io_start(m); |
| 886 | vm_object_pip_add(m->object, 1); |
| 887 | if ((bp->b_xio.xio_npages == 0) || |
| 888 | (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) { |
| 889 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
| 890 | bp->b_xio.xio_npages++; |
| 891 | } |
| 892 | } |
| 893 | } |
| 894 | bp->b_bcount += size; |
| 895 | bp->b_bufsize += size; |
| 896 | |
| 897 | s = splbio(); |
| 898 | bundirty(tbp); |
| 899 | tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR); |
| 900 | tbp->b_flags |= B_ASYNC; |
| 901 | reassignbuf(tbp, tbp->b_vp); /* put on clean list */ |
| 902 | ++tbp->b_vp->v_numoutput; |
| 903 | splx(s); |
| 904 | BUF_KERNPROC(tbp); |
| 905 | TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, |
| 906 | tbp, b_cluster.cluster_entry); |
| 907 | } |
| 908 | finishcluster: |
| 909 | pmap_qenter(trunc_page((vm_offset_t) bp->b_data), |
| 910 | (vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages); |
| 911 | if (bp->b_bufsize > bp->b_kvasize) |
| 912 | panic("cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)", |
| 913 | bp->b_bufsize, bp->b_kvasize); |
| 915 | bp->b_kvasize = bp->b_bufsize; |
| 916 | totalwritten += bp->b_bufsize; |
| 917 | bp->b_dirtyoff = 0; |
| 918 | bp->b_dirtyend = bp->b_bufsize; |
| 919 | bawrite(bp); |
| 920 | |
| 921 | len -= i; |
| 922 | } |
| 923 | return totalwritten; |
| 924 | } |
| 925 | |
| 926 | /* |
| 927 | * Collect together all the buffers in a cluster, plus add one |
| 928 | * additional buffer (last_bp, the block the caller is now writing). |
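| | * |
| | * Illustration: with v_cstart == 8 and v_lastw == 11, len is 4; |
| | * bs_children[] then holds the buffers for lbns 8..11 plus last_bp, |
| | * giving bs_nchildren == 5. |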
| 929 | */ |
| 930 | static struct cluster_save * |
| 931 | cluster_collectbufs(struct vnode *vp, struct buf *last_bp) |
| 932 | { |
| 933 | struct cluster_save *buflist; |
| 934 | struct buf *bp; |
| 935 | daddr_t lbn; |
| 936 | int i, len; |
| 937 | |
| 938 | len = vp->v_lastw - vp->v_cstart + 1; |
| 939 | buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), |
| 940 | M_SEGMENT, M_WAITOK); |
| 941 | buflist->bs_nchildren = 0; |
| 942 | buflist->bs_children = (struct buf **) (buflist + 1); |
| 943 | for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { |
| 944 | (void) bread(vp, lbn, last_bp->b_bcount, &bp); |
| 945 | buflist->bs_children[i] = bp; |
| 946 | if (bp->b_blkno == bp->b_lblkno) |
| 947 | VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, |
| 948 | NULL, NULL); |
| 949 | } |
| 950 | buflist->bs_children[i] = bp = last_bp; |
| 951 | if (bp->b_blkno == bp->b_lblkno) |
| 952 | VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, |
| 953 | NULL, NULL); |
| 954 | buflist->bs_nchildren = i + 1; |
| 955 | return (buflist); |
| 956 | } |