gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (c) 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* Modifications/enhancements:
	5	* Copyright (c) 1995 John S. Dyson. All rights reserved.
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	* 1. Redistributions of source code must retain the above copyright
	11	* notice, this list of conditions and the following disclaimer.
	12	* 2. Redistributions in binary form must reproduce the above copyright
	13	* notice, this list of conditions and the following disclaimer in the
	14	* documentation and/or other materials provided with the distribution.
	15	* 3. All advertising materials mentioning features or use of this software
	16	* must display the following acknowledgement:
	17	* This product includes software developed by the University of
	18	* California, Berkeley and its contributors.
	19	* 4. Neither the name of the University nor the names of its contributors
	20	* may be used to endorse or promote products derived from this software
	21	* without specific prior written permission.
	22	*
	23	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	24	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	25	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	26	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	27	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	28	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	29	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	30	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	31	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	32	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	33	* SUCH DAMAGE.
	34	*
	35	* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
	36	* $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
	37	* $DragonFly: src/sys/kern/vfs_cluster.c,v 1.21 2006/04/28 16:34:01 dillon Exp $
	38	*/
	39
	40	#include "opt_debug_cluster.h"
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/buf.h>
	47	#include <sys/vnode.h>
	48	#include <sys/malloc.h>
	49	#include <sys/mount.h>
	50	#include <sys/resourcevar.h>
	51	#include <sys/vmmeter.h>
	52	#include <vm/vm.h>
	53	#include <vm/vm_object.h>
	54	#include <vm/vm_page.h>
	55	#include <sys/sysctl.h>
	56	#include <sys/buf2.h>
	57	#include <vm/vm_page2.h>
	58
	59	#if defined(CLUSTERDEBUG)
	60	#include <sys/sysctl.h>
	61	static int rcluster= 0;
	62	SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
	63	#endif
	64
	65	static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
	66
	67	static struct cluster_save *
	68	cluster_collectbufs (struct vnode vp, struct buf last_bp,
	69	int lblocksize);
	70	static struct buf *
	71	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
	72	off_t doffset, int size, int run, struct buf *fbp);
	73	static void cluster_callback (struct bio *);
	74
	75
	76	static int write_behind = 1;
	77	SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
	78
	79	extern vm_page_t bogus_page;
	80
	81	extern int cluster_pbuf_freecnt;
	82
	83	/*
	84	* Maximum number of blocks for read-ahead.
	85	*/
	86	#define MAXRA 32
	87
	88	/*
	89	* This replaces bread.
	90	*/
	91	int
	92	cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
	93	int size, int totread, int seqcount, struct buf **bpp)
	94	{
	95	struct buf bp, rbp, *reqbp;
	96	off_t origoffset;
	97	off_t doffset;
	98	int error;
	99	int i;
	100	int maxra, racluster;
	101
	102	error = 0;
	103
	104	/*
	105	* Try to limit the amount of read-ahead by a few
	106	* ad-hoc parameters. This needs work!!!
	107	*/
	108	racluster = vp->v_mount->mnt_iosize_max / size;
	109	maxra = 2 * racluster + (totread / size);
	110	if (maxra > MAXRA)
	111	maxra = MAXRA;
	112	if (maxra > nbuf/8)
	113	maxra = nbuf/8;
	114
	115	/*
	116	* get the requested block
	117	*/
	118	*bpp = reqbp = bp = getblk(vp, loffset, size, 0, 0);
	119	origoffset = loffset;
	120
	121	/*
	122	* if it is in the cache, then check to see if the reads have been
	123	* sequential. If they have, then try some read-ahead, otherwise
	124	* back-off on prospective read-aheads.
	125	*/
	126	if (bp->b_flags & B_CACHE) {
	127	if (!seqcount) {
	128	return 0;
	129	} else if ((bp->b_flags & B_RAM) == 0) {
	130	return 0;
	131	} else {
	132	struct buf *tbp;
	133	bp->b_flags &= ~B_RAM;
	134	/*
	135	* We do the crit here so that there is no window
	136	* between the findblk and the b_usecount increment
	137	* below. We opt to keep the crit out of the loop
	138	* for efficiency.
	139	*/
	140	crit_enter();
	141	for (i = 1; i < maxra; i++) {
	142	if (!(tbp = findblk(vp, loffset + i * size))) {
	143	break;
	144	}
	145
	146	/*
	147	* Set another read-ahead mark so we know
	148	* to check again.
	149	*/
	150	if (((i % racluster) == (racluster - 1)) \|\|
	151	(i == (maxra - 1)))
	152	tbp->b_flags \|= B_RAM;
	153	}
	154	crit_exit();
	155	if (i >= maxra) {
	156	return 0;
	157	}
	158	loffset += i * size;
	159	}
	160	reqbp = bp = NULL;
	161	} else {
	162	off_t firstread = bp->b_loffset;
	163	int nblks;
	164
	165	KASSERT(firstread != NOOFFSET,
	166	("cluster_read: no buffer offset"));
	167	if (firstread + totread > filesize)
	168	totread = (int)(filesize - firstread);
	169	nblks = totread / size;
	170	if (nblks) {
	171	int burstbytes;
	172
	173	if (nblks > racluster)
	174	nblks = racluster;
	175
	176	error = VOP_BMAP(vp, loffset, NULL,
	177	&doffset, &burstbytes, NULL);
	178	if (error)
	179	goto single_block_read;
	180	if (doffset == NOOFFSET)
	181	goto single_block_read;
	182	if (burstbytes < size * 2)
	183	goto single_block_read;
	184	if (nblks > burstbytes / size)
	185	nblks = burstbytes / size;
	186
	187	bp = cluster_rbuild(vp, filesize, loffset,
	188	doffset, size, nblks, bp);
	189	loffset += bp->b_bufsize;
	190	} else {
	191	single_block_read:
	192	/*
	193	* if it isn't in the cache, then get a chunk from
	194	* disk if sequential, otherwise just get the block.
	195	*/
	196	bp->b_flags \|= B_READ \| B_RAM;
	197	loffset += size;
	198	}
	199	}
	200
	201	/*
	202	* If we have been doing sequential I/O, then do some read-ahead.
	203	*/
	204	rbp = NULL;
	205	if (seqcount &&
	206	loffset < origoffset + seqcount * size &&
	207	loffset + size <= filesize
	208	) {
	209	rbp = getblk(vp, loffset, size, 0, 0);
	210	if ((rbp->b_flags & B_CACHE) == 0) {
	211	int nblksread;
	212	int ntoread;
	213	int burstbytes;
	214
	215	error = VOP_BMAP(vp, loffset, NULL,
	216	&doffset, &burstbytes, NULL);
	217	if (error \|\| doffset == NOOFFSET) {
	218	rbp->b_flags &= ~(B_ASYNC \| B_READ);
	219	brelse(rbp);
	220	rbp = NULL;
	221	goto no_read_ahead;
	222	}
	223	ntoread = burstbytes / size;
	224	nblksread = (totread + size - 1) / size;
	225	if (seqcount < nblksread)
	226	seqcount = nblksread;
	227	if (seqcount < ntoread)
	228	ntoread = seqcount;
	229
	230	rbp->b_flags \|= B_READ \| B_ASYNC \| B_RAM;
	231	if (burstbytes) {
	232	rbp = cluster_rbuild(vp, filesize, loffset,
	233	doffset, size,
	234	ntoread, rbp);
	235	} else {
	236	rbp->b_bio2.bio_offset = doffset;
	237	}
	238	}
	239	}
	240	no_read_ahead:
	241
	242	/*
	243	* Handle the synchronous read. This only occurs if B_CACHE was
	244	* not set.
	245	*/
	246	if (bp) {
	247	#if defined(CLUSTERDEBUG)
	248	if (rcluster)
	249	printf("S(%lld,%d,%d) ",
	250	bp->b_loffset, bp->b_bcount, seqcount);
	251	#endif
	252	if ((bp->b_flags & B_CLUSTER) == 0) {
	253	vfs_busy_pages(vp, bp, 0);
	254	}
	255	bp->b_flags &= ~(B_ERROR\|B_INVAL);
	256	if ((bp->b_flags & B_ASYNC) \|\| bp->b_bio1.bio_done != NULL)
	257	BUF_KERNPROC(bp);
	258	vn_strategy(vp, &bp->b_bio1);
	259	error = bp->b_error;
	260	}
	261
	262	/*
	263	* And if we have read-aheads, do them too
	264	*/
	265	if (rbp) {
	266	if (error) {
	267	rbp->b_flags &= ~(B_ASYNC \| B_READ);
	268	brelse(rbp);
	269	} else if (rbp->b_flags & B_CACHE) {
	270	rbp->b_flags &= ~(B_ASYNC \| B_READ);
	271	bqrelse(rbp);
	272	} else {
	273	#if defined(CLUSTERDEBUG)
	274	if (rcluster) {
	275	if (bp)
	276	printf("A+(%lld,%d,%lld,%d) ",
	277	rbp->b_loffset, rbp->b_bcount,
	278	rbp->b_loffset - origoffset,
	279	seqcount);
	280	else
	281	printf("A(%lld,%d,%lld,%d) ",
	282	rbp->b_loffset, rbp->b_bcount,
	283	rbp->b_loffset - origoffset,
	284	seqcount);
	285	}
	286	#endif
	287
	288	if ((rbp->b_flags & B_CLUSTER) == 0) {
	289	vfs_busy_pages(vp, rbp, 0);
	290	}
	291	rbp->b_flags &= ~(B_ERROR\|B_INVAL);
	292	if ((rbp->b_flags & B_ASYNC) \|\| rbp->b_bio1.bio_done != NULL)
	293	BUF_KERNPROC(rbp);
	294	vn_strategy(vp, &rbp->b_bio1);
	295	}
	296	}
	297	if (reqbp)
	298	return (biowait(reqbp));
	299	else
	300	return (error);
	301	}
	302
	303	/*
	304	* If blocks are contiguous on disk, use this to provide clustered
	305	* read ahead. We will read as many blocks as possible sequentially
	306	* and then parcel them up into logical blocks in the buffer hash table.
	307	*/
	308	static struct buf *
	309	cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
	310	off_t doffset, int size, int run, struct buf *fbp)
	311	{
	312	struct buf bp, tbp;
	313	off_t boffset;
	314	int i, j;
	315
	316	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	317	("cluster_rbuild: size %d != filesize %ld\n",
	318	size, vp->v_mount->mnt_stat.f_iosize));
	319
	320	/*
	321	* avoid a division
	322	*/
	323	while (loffset + run * size > filesize) {
	324	--run;
	325	}
	326
	327	tbp = fbp;
	328	tbp->b_flags \|= B_READ;
	329	tbp->b_bio2.bio_offset = doffset;
	330	if( (tbp->b_flags & B_MALLOC) \|\|
	331	((tbp->b_flags & B_VMIO) == 0) \|\| (run <= 1) )
	332	return tbp;
	333
	334	bp = trypbuf(&cluster_pbuf_freecnt);
	335	if (bp == NULL)
	336	return tbp;
	337
	338	/*
	339	* We are synthesizing a buffer out of vm_page_t's, but
	340	* if the block size is not page aligned then the starting
	341	* address may not be either. Inherit the b_data offset
	342	* from the original buffer.
	343	*/
	344	bp->b_data = (char *)((vm_offset_t)bp->b_data \|
	345	((vm_offset_t)tbp->b_data & PAGE_MASK));
	346	bp->b_flags \|= B_ASYNC \| B_READ \| B_CLUSTER \| B_VMIO;
	347	bp->b_bio1.bio_done = cluster_callback;
	348	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	349	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	350	bp->b_loffset = loffset;
	351	bp->b_bio2.bio_offset = NOOFFSET;
	352	KASSERT(bp->b_loffset != NOOFFSET,
	353	("cluster_rbuild: no buffer offset"));
	354
	355	bp->b_bcount = 0;
	356	bp->b_bufsize = 0;
	357	bp->b_xio.xio_npages = 0;
	358
	359	for (boffset = doffset, i = 0; i < run; ++i, boffset += size) {
	360	if (i != 0) {
	361	if ((bp->b_xio.xio_npages * PAGE_SIZE) +
	362	round_page(size) > vp->v_mount->mnt_iosize_max) {
	363	break;
	364	}
	365
	366	/*
	367	* Shortcut some checks and try to avoid buffers that
	368	* would block in the lock. The same checks have to
	369	* be made again after we officially get the buffer.
	370	*/
	371	if ((tbp = findblk(vp, loffset + i * size)) != NULL) {
	372	if (BUF_LOCK(tbp, LK_EXCLUSIVE \| LK_NOWAIT))
	373	break;
	374	BUF_UNLOCK(tbp);
	375
	376	for (j = 0; j < tbp->b_xio.xio_npages; j++) {
	377	if (tbp->b_xio.xio_pages[j]->valid)
	378	break;
	379	}
	380
	381	if (j != tbp->b_xio.xio_npages)
	382	break;
	383
	384	if (tbp->b_bcount != size)
	385	break;
	386	}
	387
	388	tbp = getblk(vp, loffset + i * size, size, 0, 0);
	389
	390	/*
	391	* Stop scanning if the buffer is fuly valid
	392	* (marked B_CACHE), or locked (may be doing a
	393	* background write), or if the buffer is not
	394	* VMIO backed. The clustering code can only deal
	395	* with VMIO-backed buffers.
	396	*/
	397	if ((tbp->b_flags & (B_CACHE\|B_LOCKED)) \|\|
	398	(tbp->b_flags & B_VMIO) == 0) {
	399	bqrelse(tbp);
	400	break;
	401	}
	402
	403	/*
	404	* The buffer must be completely invalid in order to
	405	* take part in the cluster. If it is partially valid
	406	* then we stop.
	407	*/
	408	for (j = 0;j < tbp->b_xio.xio_npages; j++) {
	409	if (tbp->b_xio.xio_pages[j]->valid)
	410	break;
	411	}
	412	if (j != tbp->b_xio.xio_npages) {
	413	bqrelse(tbp);
	414	break;
	415	}
	416
	417	/*
	418	* Set a read-ahead mark as appropriate
	419	*/
	420	if (i == 1 \|\| i == (run - 1))
	421	tbp->b_flags \|= B_RAM;
	422
	423	/*
	424	* Set the buffer up for an async read (XXX should
	425	* we do this only if we do not wind up brelse()ing?).
	426	* Set the block number if it isn't set, otherwise
	427	* if it is make sure it matches the block number we
	428	* expect.
	429	*/
	430	tbp->b_flags \|= B_READ \| B_ASYNC;
	431	if (tbp->b_bio2.bio_offset == NOOFFSET) {
	432	tbp->b_bio2.bio_offset = boffset;
	433	} else if (tbp->b_bio2.bio_offset != boffset) {
	434	brelse(tbp);
	435	break;
	436	}
	437	}
	438	/*
	439	* XXX fbp from caller may not be B_ASYNC, but we are going
	440	* to biodone() it in cluster_callback() anyway
	441	*/
	442	BUF_KERNPROC(tbp);
	443	cluster_append(&bp->b_bio1, tbp);
	444	for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
	445	vm_page_t m;
	446	m = tbp->b_xio.xio_pages[j];
	447	vm_page_io_start(m);
	448	vm_object_pip_add(m->object, 1);
	449	if ((bp->b_xio.xio_npages == 0) \|\|
	450	(bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
	451	bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
	452	bp->b_xio.xio_npages++;
	453	}
	454	if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
	455	tbp->b_xio.xio_pages[j] = bogus_page;
	456	}
	457	/*
	458	* XXX shouldn't this be += size for both, like in
	459	* cluster_wbuild()?
	460	*
	461	* Don't inherit tbp->b_bufsize as it may be larger due to
	462	* a non-page-aligned size. Instead just aggregate using
	463	* 'size'.
	464	*/
	465	if (tbp->b_bcount != size)
	466	printf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, size);
	467	if (tbp->b_bufsize != size)
	468	printf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, size);
	469	bp->b_bcount += size;
	470	bp->b_bufsize += size;
	471	}
	472
	473	/*
	474	* Fully valid pages in the cluster are already good and do not need
	475	* to be re-read from disk. Replace the page with bogus_page
	476	*/
	477	for (j = 0; j < bp->b_xio.xio_npages; j++) {
	478	if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
	479	VM_PAGE_BITS_ALL) {
	480	bp->b_xio.xio_pages[j] = bogus_page;
	481	}
	482	}
	483	if (bp->b_bufsize > bp->b_kvasize)
	484	panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
	485	bp->b_bufsize, bp->b_kvasize);
	486	bp->b_kvasize = bp->b_bufsize;
	487
	488	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	489	(vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	490	return (bp);
	491	}
	492
	493	/*
	494	* Cleanup after a clustered read or write.
	495	* This is complicated by the fact that any of the buffers might have
	496	* extra memory (if there were no empty buffer headers at allocbuf time)
	497	* that we will need to shift around.
	498	*
	499	* The returned bio is &bp->b_bio1
	500	*/
	501	void
	502	cluster_callback(struct bio *bio)
	503	{
	504	struct buf *bp = bio->bio_buf;
	505	struct buf *tbp;
	506	int error = 0;
	507
	508	/*
	509	* Must propogate errors to all the components.
	510	*/
	511	if (bp->b_flags & B_ERROR)
	512	error = bp->b_error;
	513
	514	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
	515	/*
	516	* Move memory from the large cluster buffer into the component
	517	* buffers and mark IO as done on these. Since the memory map
	518	* is the same, no actual copying is required.
	519	*/
	520	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
	521	bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
	522	if (error) {
	523	tbp->b_flags \|= B_ERROR;
	524	tbp->b_error = error;
	525	} else {
	526	tbp->b_dirtyoff = tbp->b_dirtyend = 0;
	527	tbp->b_flags &= ~(B_ERROR\|B_INVAL);
	528	/*
	529	* XXX the bdwrite()/bqrelse() issued during
	530	* cluster building clears B_RELBUF (see bqrelse()
	531	* comment). If direct I/O was specified, we have
	532	* to restore it here to allow the buffer and VM
	533	* to be freed.
	534	*/
	535	if (tbp->b_flags & B_DIRECT)
	536	tbp->b_flags \|= B_RELBUF;
	537	}
	538	biodone(&tbp->b_bio1);
	539	}
	540	relpbuf(bp, &cluster_pbuf_freecnt);
	541	}
	542
	543	/*
	544	* cluster_wbuild_wb:
	545	*
	546	* Implement modified write build for cluster.
	547	*
	548	* write_behind = 0 write behind disabled
	549	* write_behind = 1 write behind normal (default)
	550	* write_behind = 2 write behind backed-off
	551	*/
	552
	553	static __inline int
	554	cluster_wbuild_wb(struct vnode *vp, int size, off_t start_loffset, int len)
	555	{
	556	int r = 0;
	557
	558	switch(write_behind) {
	559	case 2:
	560	if (start_loffset < len)
	561	break;
	562	start_loffset -= len;
	563	/* fall through */
	564	case 1:
	565	r = cluster_wbuild(vp, size, start_loffset, len);
	566	/* fall through */
	567	default:
	568	/* fall through */
	569	break;
	570	}
	571	return(r);
	572	}
	573
	574	/*
	575	* Do clustered write for FFS.
	576	*
	577	* Three cases:
	578	* 1. Write is not sequential (write asynchronously)
	579	* Write is sequential:
	580	* 2. beginning of cluster - begin cluster
	581	* 3. middle of a cluster - add to cluster
	582	* 4. end of a cluster - asynchronously write cluster
	583	*/
	584	void
	585	cluster_write(struct buf *bp, off_t filesize, int seqcount)
	586	{
	587	struct vnode *vp;
	588	off_t loffset;
	589	int maxclen, cursize;
	590	int lblocksize;
	591	int async;
	592
	593	vp = bp->b_vp;
	594	if (vp->v_type == VREG) {
	595	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	596	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	597	} else {
	598	async = 0;
	599	lblocksize = bp->b_bufsize;
	600	}
	601	loffset = bp->b_loffset;
	602	KASSERT(bp->b_loffset != NOOFFSET,
	603	("cluster_write: no buffer offset"));
	604
	605	/* Initialize vnode to beginning of file. */
	606	if (loffset == 0)
	607	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	608
	609	if (vp->v_clen == 0 \|\| loffset != vp->v_lastw + lblocksize \|\|
	610	bp->b_bio2.bio_offset == NOOFFSET \|\|
	611	(bp->b_bio2.bio_offset != vp->v_lasta + lblocksize)) {
	612	maxclen = vp->v_mount->mnt_iosize_max;
	613	if (vp->v_clen != 0) {
	614	/*
	615	* Next block is not sequential.
	616	*
	617	* If we are not writing at end of file, the process
	618	* seeked to another point in the file since its last
	619	* write, or we have reached our maximum cluster size,
	620	* then push the previous cluster. Otherwise try
	621	* reallocating to make it sequential.
	622	*
	623	* Change to algorithm: only push previous cluster if
	624	* it was sequential from the point of view of the
	625	* seqcount heuristic, otherwise leave the buffer
	626	* intact so we can potentially optimize the I/O
	627	* later on in the buf_daemon or update daemon
	628	* flush.
	629	*/
	630	cursize = vp->v_lastw - vp->v_cstart + lblocksize;
	631	if (bp->b_loffset + lblocksize != filesize \|\|
	632	loffset != vp->v_lastw + lblocksize \|\| vp->v_clen <= cursize) {
	633	if (!async && seqcount > 0) {
	634	cluster_wbuild_wb(vp, lblocksize,
	635	vp->v_cstart, cursize);
	636	}
	637	} else {
	638	struct buf bpp, endbp;
	639	struct cluster_save *buflist;
	640
	641	buflist = cluster_collectbufs(vp, bp,
	642	lblocksize);
	643	endbp = &buflist->bs_children
	644	[buflist->bs_nchildren - 1];
	645	if (VOP_REALLOCBLKS(vp, buflist)) {
	646	/*
	647	* Failed, push the previous cluster
	648	* if really writing sequentially
	649	* in the logical file (seqcount > 1),
	650	* otherwise delay it in the hopes that
	651	* the low level disk driver can
	652	* optimize the write ordering.
	653	*/
	654	for (bpp = buflist->bs_children;
	655	bpp < endbp; bpp++)
	656	brelse(*bpp);
	657	free(buflist, M_SEGMENT);
	658	if (seqcount > 1) {
	659	cluster_wbuild_wb(vp,
	660	lblocksize, vp->v_cstart,
	661	cursize);
	662	}
	663	} else {
	664	/*
	665	* Succeeded, keep building cluster.
	666	*/
	667	for (bpp = buflist->bs_children;
	668	bpp <= endbp; bpp++)
	669	bdwrite(*bpp);
	670	free(buflist, M_SEGMENT);
	671	vp->v_lastw = loffset;
	672	vp->v_lasta = bp->b_bio2.bio_offset;
	673	return;
	674	}
	675	}
	676	}
	677	/*
	678	* Consider beginning a cluster. If at end of file, make
	679	* cluster as large as possible, otherwise find size of
	680	* existing cluster.
	681	*/
	682	if ((vp->v_type == VREG) &&
	683	bp->b_loffset + lblocksize != filesize &&
	684	(bp->b_bio2.bio_offset == NOOFFSET) &&
	685	(VOP_BMAP(vp, loffset, NULL, &bp->b_bio2.bio_offset, &maxclen, NULL) \|\|
	686	bp->b_bio2.bio_offset == NOOFFSET)) {
	687	bawrite(bp);
	688	vp->v_clen = 0;
	689	vp->v_lasta = bp->b_bio2.bio_offset;
	690	vp->v_cstart = loffset + lblocksize;
	691	vp->v_lastw = loffset;
	692	return;
	693	}
	694	if (maxclen > lblocksize)
	695	vp->v_clen = maxclen - lblocksize;
	696	else
	697	vp->v_clen = 0;
	698	if (!async && vp->v_clen == 0) { /* I/O not contiguous */
	699	vp->v_cstart = loffset + lblocksize;
	700	bawrite(bp);
	701	} else { /* Wait for rest of cluster */
	702	vp->v_cstart = loffset;
	703	bdwrite(bp);
	704	}
	705	} else if (loffset == vp->v_cstart + vp->v_clen) {
	706	/*
	707	* At end of cluster, write it out if seqcount tells us we
	708	* are operating sequentially, otherwise let the buf or
	709	* update daemon handle it.
	710	*/
	711	bdwrite(bp);
	712	if (seqcount > 1)
	713	cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
	714	vp->v_clen + lblocksize);
	715	vp->v_clen = 0;
	716	vp->v_cstart = loffset + lblocksize;
	717	} else if (vm_page_count_severe()) {
	718	/*
	719	* We are low on memory, get it going NOW
	720	*/
	721	bawrite(bp);
	722	} else {
	723	/*
	724	* In the middle of a cluster, so just delay the I/O for now.
	725	*/
	726	bdwrite(bp);
	727	}
	728	vp->v_lastw = loffset;
	729	vp->v_lasta = bp->b_bio2.bio_offset;
	730	}
	731
	732
	733	/*
	734	* This is an awful lot like cluster_rbuild...wish they could be combined.
	735	* The last lbn argument is the current block on which I/O is being
	736	* performed. Check to see that it doesn't fall in the middle of
	737	* the current block (if last_bp == NULL).
	738	*/
	739	int
	740	cluster_wbuild(struct vnode *vp, int size, off_t start_loffset, int bytes)
	741	{
	742	struct buf bp, tbp;
	743	int i, j;
	744	int totalwritten = 0;
	745
	746	while (bytes > 0) {
	747	crit_enter();
	748	/*
	749	* If the buffer is not delayed-write (i.e. dirty), or it
	750	* is delayed-write but either locked or inval, it cannot
	751	* partake in the clustered write.
	752	*/
	753	if (((tbp = findblk(vp, start_loffset)) == NULL) \|\|
	754	((tbp->b_flags & (B_LOCKED \| B_INVAL \| B_DELWRI)) != B_DELWRI) \|\|
	755	BUF_LOCK(tbp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	756	start_loffset += size;
	757	bytes -= size;
	758	crit_exit();
	759	continue;
	760	}
	761	bremfree(tbp);
	762	tbp->b_flags &= ~B_DONE;
	763	crit_exit();
	764
	765	/*
	766	* Extra memory in the buffer, punt on this buffer.
	767	* XXX we could handle this in most cases, but we would
	768	* have to push the extra memory down to after our max
	769	* possible cluster size and then potentially pull it back
	770	* up if the cluster was terminated prematurely--too much
	771	* hassle.
	772	*/
	773	if (((tbp->b_flags & (B_CLUSTEROK\|B_MALLOC)) != B_CLUSTEROK) \|\|
	774	(tbp->b_bcount != tbp->b_bufsize) \|\|
	775	(tbp->b_bcount != size) \|\|
	776	(bytes == size) \|\|
	777	((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
	778	totalwritten += tbp->b_bufsize;
	779	bawrite(tbp);
	780	start_loffset += size;
	781	bytes -= size;
	782	continue;
	783	}
	784
	785	/*
	786	* We got a pbuf to make the cluster in.
	787	* so initialise it.
	788	*/
	789	bp->b_bcount = 0;
	790	bp->b_bufsize = 0;
	791	bp->b_xio.xio_npages = 0;
	792	bp->b_loffset = tbp->b_loffset;
	793	bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
	794
	795	/*
	796	* We are synthesizing a buffer out of vm_page_t's, but
	797	* if the block size is not page aligned then the starting
	798	* address may not be either. Inherit the b_data offset
	799	* from the original buffer.
	800	*/
	801	bp->b_data = (char *)((vm_offset_t)bp->b_data \|
	802	((vm_offset_t)tbp->b_data & PAGE_MASK));
	803	bp->b_flags &= ~(B_READ \| B_DONE \| B_ERROR);
	804	bp->b_flags \|= B_CLUSTER \| B_ASYNC \|
	805	(tbp->b_flags & (B_VMIO \| B_NEEDCOMMIT \| B_NOWDRAIN));
	806	bp->b_bio1.bio_done = cluster_callback;
	807	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	808	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	809	/*
	810	* From this location in the file, scan forward to see
	811	* if there are buffers with adjacent data that need to
	812	* be written as well.
	813	*/
	814	for (i = 0; i < bytes; (i += size), (start_loffset += size)) {
	815	if (i != 0) { /* If not the first buffer */
	816	crit_enter();
	817	/*
	818	* If the adjacent data is not even in core it
	819	* can't need to be written.
	820	*/
	821	if ((tbp = findblk(vp, start_loffset)) == NULL) {
	822	crit_exit();
	823	break;
	824	}
	825
	826	/*
	827	* If it IS in core, but has different
	828	* characteristics, or is locked (which
	829	* means it could be undergoing a background
	830	* I/O or be in a weird state), then don't
	831	* cluster with it.
	832	*/
	833	if ((tbp->b_flags & (B_VMIO \| B_CLUSTEROK \|
	834	B_INVAL \| B_DELWRI \| B_NEEDCOMMIT))
	835	!= (B_DELWRI \| B_CLUSTEROK \|
	836	(bp->b_flags & (B_VMIO \| B_NEEDCOMMIT))) \|\|
	837	(tbp->b_flags & B_LOCKED) \|\|
	838	BUF_LOCK(tbp, LK_EXCLUSIVE \| LK_NOWAIT)) {
	839	crit_exit();
	840	break;
	841	}
	842
	843	/*
	844	* Check that the combined cluster
	845	* would make sense with regard to pages
	846	* and would not be too large
	847	*/
	848	if ((tbp->b_bcount != size) \|\|
	849	((bp->b_bio2.bio_offset + i) !=
	850	tbp->b_bio2.bio_offset) \|\|
	851	((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
	852	(vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
	853	BUF_UNLOCK(tbp);
	854	crit_exit();
	855	break;
	856	}
	857	/*
	858	* Ok, it's passed all the tests,
	859	* so remove it from the free list
	860	* and mark it busy. We will use it.
	861	*/
	862	bremfree(tbp);
	863	tbp->b_flags &= ~B_DONE;
	864	crit_exit();
	865	} /* end of code for non-first buffers only */
	866
	867	/*
	868	* If the IO is via the VM then we do some
	869	* special VM hackery (yuck). Since the buffer's
	870	* block size may not be page-aligned it is possible
	871	* for a page to be shared between two buffers. We
	872	* have to get rid of the duplication when building
	873	* the cluster.
	874	*/
	875	if (tbp->b_flags & B_VMIO) {
	876	vm_page_t m;
	877
	878	if (i != 0) { /* if not first buffer */
	879	for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
	880	m = tbp->b_xio.xio_pages[j];
	881	if (m->flags & PG_BUSY) {
	882	bqrelse(tbp);
	883	goto finishcluster;
	884	}
	885	}
	886	}
	887
	888	for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
	889	m = tbp->b_xio.xio_pages[j];
	890	vm_page_io_start(m);
	891	vm_object_pip_add(m->object, 1);
	892	if ((bp->b_xio.xio_npages == 0) \|\|
	893	(bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
	894	bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
	895	bp->b_xio.xio_npages++;
	896	}
	897	}
	898	}
	899	bp->b_bcount += size;
	900	bp->b_bufsize += size;
	901
	902	crit_enter();
	903	bundirty(tbp);
	904	tbp->b_flags &= ~(B_READ \| B_DONE \| B_ERROR);
	905	tbp->b_flags \|= B_ASYNC;
	906	crit_exit();
	907	BUF_KERNPROC(tbp);
	908	cluster_append(&bp->b_bio1, tbp);
	909
	910	/*
	911	* check for latent dependencies to be handled
	912	*/
	913	if (LIST_FIRST(&tbp->b_dep) != NULL && bioops.io_start)
	914	(*bioops.io_start)(tbp);
	915
	916	}
	917	finishcluster:
	918	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	919	(vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	920	if (bp->b_bufsize > bp->b_kvasize)
	921	panic(
	922	"cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
	923	bp->b_bufsize, bp->b_kvasize);
	924	bp->b_kvasize = bp->b_bufsize;
	925	totalwritten += bp->b_bufsize;
	926	bp->b_dirtyoff = 0;
	927	bp->b_dirtyend = bp->b_bufsize;
	928
	929	vfs_busy_pages(vp, bp, 1);
	930	bp->b_runningbufspace = bp->b_bufsize;
	931	runningbufspace += bp->b_runningbufspace;
	932	BUF_KERNPROC(bp); /* B_ASYNC */
	933	vn_strategy(vp, &bp->b_bio1);
	934
	935	bytes -= i;
	936	}
	937	return totalwritten;
	938	}
	939
	940	/*
	941	* Collect together all the buffers in a cluster.
	942	* Plus add one additional buffer.
	943	*/
	944	static struct cluster_save *
	945	cluster_collectbufs(struct vnode vp, struct buf last_bp, int lblocksize)
	946	{
	947	struct cluster_save *buflist;
	948	struct buf *bp;
	949	off_t loffset;
	950	int i, len;
	951
	952	len = (int)(vp->v_lastw - vp->v_cstart + lblocksize) / lblocksize;
	953	buflist = malloc(sizeof(struct buf ) (len + 1) + sizeof(*buflist),
	954	M_SEGMENT, M_WAITOK);
	955	buflist->bs_nchildren = 0;
	956	buflist->bs_children = (struct buf **) (buflist + 1);
	957	for (loffset = vp->v_cstart, i = 0; i < len; (loffset += lblocksize), i++) {
	958	(void) bread(vp, loffset, last_bp->b_bcount, &bp);
	959	buflist->bs_children[i] = bp;
	960	if (bp->b_bio2.bio_offset == NOOFFSET) {
	961	VOP_BMAP(bp->b_vp, bp->b_loffset, NULL,
	962	&bp->b_bio2.bio_offset, NULL, NULL);
	963	}
	964	}
	965	buflist->bs_children[i] = bp = last_bp;
	966	if (bp->b_bio2.bio_offset == NOOFFSET) {
	967	VOP_BMAP(bp->b_vp, bp->b_loffset, NULL,
	968	&bp->b_bio2.bio_offset, NULL, NULL);
	969	}
	970	buflist->bs_nchildren = i + 1;
	971	return (buflist);
	972	}
	973
	974	void
	975	cluster_append(struct bio bio, struct buf tbp)
	976	{
	977	tbp->b_cluster_next = NULL;
	978	if (bio->bio_caller_info1.cluster_head == NULL) {
	979	bio->bio_caller_info1.cluster_head = tbp;
	980	bio->bio_caller_info2.cluster_tail = tbp;
	981	} else {
	982	bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
	983	bio->bio_caller_info2.cluster_tail = tbp;
	984	}
	985	}
	986