kernel - B_IODEBUG -> B_IOISSUED
[dragonfly.git] / sys / kern / vfs_cluster.c
1/*-
2 * Copyright (c) 1993
3 * The Regents of the University of California. All rights reserved.
4 * Modifications/enhancements:
5 * Copyright (c) 1995 John S. Dyson. All rights reserved.
6 * Copyright (c) 2012-2013 Matthew Dillon. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include "opt_debug_cluster.h"
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/proc.h>
39#include <sys/buf.h>
40#include <sys/vnode.h>
41#include <sys/malloc.h>
42#include <sys/mount.h>
43#include <sys/resourcevar.h>
44#include <sys/vmmeter.h>
45#include <vm/vm.h>
46#include <vm/vm_object.h>
47#include <vm/vm_page.h>
48#include <sys/sysctl.h>
49
50#include <sys/buf2.h>
51#include <vm/vm_page2.h>
52
53#include <machine/limits.h>
54
55/*
56 * Cluster tracking cache - replaces the original vnode v_* fields which had
57 * limited utility and were not MP safe.
58 *
59 * The cluster tracking cache is a simple 4-way set-associative non-chained
60 * cache. It is capable of tracking up to four zones separated by 1MB or
61 * more per vnode.
62 *
63 * NOTE: We want this structure to be cache-line friendly so the iterator
64 * is embedded rather than in a separate array.
65 *
66 * NOTE: A cluster cache entry can become stale when a vnode is recycled.
67 * For now we treat the values as heuristic but also self-consistent;
68 * i.e. the values cannot be completely random and cannot be SMP unsafe
69 * or the cluster code might end up clustering non-contiguous buffers
70 * at the wrong offsets.
71 */
72struct cluster_cache {
73 struct vnode *vp;
74 u_int locked;
75 off_t v_lastw; /* last write (write cluster) */
76 off_t v_cstart; /* start block of cluster */
77 off_t v_lasta; /* last allocation */
78 u_int v_clen; /* length of current cluster */
79 u_int iterator;
80} __cachealign;
81
82typedef struct cluster_cache cluster_cache_t;
83
84#define CLUSTER_CACHE_SIZE 512
85#define CLUSTER_CACHE_MASK (CLUSTER_CACHE_SIZE - 1)
86
87#define CLUSTER_ZONE ((off_t)(1024 * 1024))
88
89cluster_cache_t cluster_array[CLUSTER_CACHE_SIZE];
90
91#if defined(CLUSTERDEBUG)
92#include <sys/sysctl.h>
93static int rcluster = 0;
94SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
95#endif
96
97static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
98
99static struct cluster_save *
100 cluster_collectbufs (cluster_cache_t *cc, struct vnode *vp,
101 struct buf *last_bp, int blksize);
102static struct buf *
103 cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
104 off_t doffset, int blksize, int run,
105 struct buf *fbp, int *srp);
106static void cluster_callback (struct bio *);
107static void cluster_setram (struct buf *);
108static void cluster_clrram (struct buf *);
109static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
110 off_t start_loffset, int bytes);
111
112static int write_behind = 1;
113SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
114 "Cluster write-behind setting");
115static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
116SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
117 &write_behind_minfilesize, 0, "Cluster write-behind minimum file size");
118static int max_readahead = 2 * 1024 * 1024;
119SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
120 "Limit in bytes for desired cluster read-ahead");
121
122extern vm_page_t bogus_page;
123
124extern int cluster_pbuf_freecnt;
125
126/*
127 * nblks is our cluster_rbuild request size. The approximate number of
128 * physical read-ahead requests is maxra / nblks. The physical request
129 * size is limited by the device (maxrbuild). We also do not want to make
130 * the request size too big or it will mess up the B_RAM streaming.
131 */
132static __inline
133int
134calc_rbuild_reqsize(int maxra, int maxrbuild)
135{
136 int nblks;
137
138 if ((nblks = maxra / 4) > maxrbuild)
139 nblks = maxrbuild;
140 if (nblks < 1)
141 nblks = maxra;
142 return nblks;
143}
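
/*
 * Example (illustrative values): with maxra = 64 blocks of desired
 * read-ahead and a device limit of maxrbuild = 16, the request size is
 * min(64 / 4, 16) = 16 blocks, so roughly four physical read-ahead
 * requests cover the window.  If maxra is so small that maxra / 4
 * computes to 0, the whole window is issued as a single request.
 */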
144
145/*
146 * Acquire/release cluster cache (can return dummy entry)
147 */
148static
149cluster_cache_t *
150cluster_getcache(cluster_cache_t *dummy, struct vnode *vp, off_t loffset)
151{
152 cluster_cache_t *cc;
153 size_t hv;
154 int i;
155 int xact;
156
157 hv = (size_t)(intptr_t)vp ^ (size_t)(intptr_t)vp / sizeof(*vp);
158 hv &= CLUSTER_CACHE_MASK & ~3;
159 cc = &cluster_array[hv];
160
161 xact = -1;
162 for (i = 0; i < 4; ++i) {
163 if (cc[i].vp != vp)
164 continue;
165 if (((cc[i].v_cstart ^ loffset) & ~(CLUSTER_ZONE - 1)) == 0) {
166 xact = i;
167 break;
168 }
169 }
170 if (xact >= 0 && atomic_swap_int(&cc[xact].locked, 1) == 0) {
171 if (cc[xact].vp == vp &&
172 ((cc[i].v_cstart ^ loffset) & ~(CLUSTER_ZONE - 1)) == 0) {
173 return(&cc[xact]);
174 }
175 atomic_swap_int(&cc[xact].locked, 0);
176 }
177
178 /*
179 * New entry. If we can't acquire the cache line then use the
180 * passed-in dummy element and reset all fields.
181 *
182 * When we are able to acquire the cache line we only clear the
183 * fields if the vp does not match. This allows us to multi-zone
184 * a vp and for excessive zones / partial clusters to be retired.
185 */
186 i = cc->iterator++ & 3;
187 cc += i;
188 if (atomic_swap_int(&cc->locked, 1) != 0) {
189 cc = dummy;
190 cc->locked = 1;
191 cc->vp = NULL;
192 }
193 if (cc->vp != vp) {
194 cc->vp = vp;
195 cc->v_lasta = 0;
196 cc->v_clen = 0;
197 cc->v_cstart = 0;
198 cc->v_lastw = 0;
199 }
200 return(cc);
201}
202
203static
204void
205cluster_putcache(cluster_cache_t *cc)
206{
207 atomic_swap_int(&cc->locked, 0);
208}
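
/*
 * Usage sketch (illustrative; see cluster_write() for the real caller).
 * The stack-local dummy element lets acquisition always succeed without
 * blocking, even when the hashed 4-way set is contended:
 *
 *	cluster_cache_t dummy;
 *	cluster_cache_t *cc;
 *
 *	cc = cluster_getcache(&dummy, vp, loffset);
 *	... read/update cc->v_cstart, cc->v_lastw, cc->v_clen ...
 *	cluster_putcache(cc);
 */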
209
210/*
211 * This replaces bread(), providing a synchronous read of the requested
212 * buffer plus asynchronous read-ahead within the specified bounds.
213 *
214 * The caller may pre-populate *bpp if it already has the requested buffer
215 * in-hand, else must set *bpp to NULL. Note that the cluster_read() inline
216 * sets *bpp to NULL and then calls cluster_readx() for compatibility.
217 *
218 * filesize - read-ahead @ blksize will not cross this boundary
219 * loffset - loffset for returned *bpp
220 * blksize - blocksize for returned *bpp and read-ahead bps
221 * minreq - minimum (not a hard minimum) in bytes, typically reflects
222 * a higher level uio resid.
223 * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
224 * bpp - return buffer (*bpp) for (loffset,blksize)
225 */
226int
227cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
228 int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
229{
230 struct buf *bp, *rbp, *reqbp;
231 off_t origoffset;
232 off_t doffset;
233 int error;
234 int i;
235 int maxra;
236 int maxrbuild;
237 int sr;
238
239 sr = 0;
240
241 /*
242 * Calculate the desired read-ahead in blksize'd blocks (maxra).
243 * To do this we calculate maxreq.
244 *
245 * maxreq typically starts out as a sequential heuristic. If the
246 * high level uio/resid is bigger (minreq), we pop maxreq up to
247 * minreq. This represents the case where random I/O is being
248 * performed and the userland is issuing big read()'s.
249 *
250 * Then we limit maxreq to max_readahead to ensure it is a reasonable
251 * value.
252 *
253 * Finally we must ensure that (loffset + maxreq) does not cross the
254 * boundary (filesize) for the current blocksize. If we allowed it
255 * to cross we could end up with buffers past the boundary with the
256 * wrong block size (HAMMER large-data areas use mixed block sizes).
257 * minreq is also absolutely limited to filesize.
258 */
259 if (maxreq < minreq)
260 maxreq = minreq;
261 /* minreq not used beyond this point */
262
263 if (maxreq > max_readahead) {
264 maxreq = max_readahead;
265 if (maxreq > 16 * 1024 * 1024)
266 maxreq = 16 * 1024 * 1024;
267 }
268 if (maxreq < blksize)
269 maxreq = blksize;
270 if (loffset + maxreq > filesize) {
271 if (loffset > filesize)
272 maxreq = 0;
273 else
274 maxreq = filesize - loffset;
275 }
276
277 maxra = (int)(maxreq / blksize);
278
279 /*
280 * Get the requested block.
281 */
282 if (*bpp)
283 reqbp = bp = *bpp;
284 else
285 *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
286 origoffset = loffset;
287
288 /*
289 * Calculate the maximum cluster size for a single I/O, used
290 * by cluster_rbuild().
291 */
292 maxrbuild = vmaxiosize(vp) / blksize;
293
294 /*
295 * if it is in the cache, then check to see if the reads have been
296 * sequential. If they have, then try some read-ahead, otherwise
297 * back-off on prospective read-aheads.
298 */
299 if (bp->b_flags & B_CACHE) {
300 /*
301 * Not sequential, do not do any read-ahead
302 */
303 if (maxra <= 1)
304 return 0;
305
306 /*
307 * No read-ahead mark, do not do any read-ahead
308 * yet.
309 */
310 if ((bp->b_flags & B_RAM) == 0)
311 return 0;
312
313 /*
314 * We hit a read-ahead-mark, figure out how much read-ahead
315 * to do (maxra) and where to start (loffset).
316 *
317 * Typically the way this works is that B_RAM is set in the
318 * middle of the cluster and triggers an overlapping
319 * read-ahead of 1/2 a cluster more blocks. This ensures
320 * that the cluster read-ahead scales with the read-ahead
321 * count and is thus better-able to absorb the caller's
322 * latency.
323 *
324 * Estimate where the next unread block will be by assuming
325 * that the B_RAM's are placed at the half-way point.
326 */
327 bp->b_flags &= ~B_RAM;
328
329 i = maxra / 2;
330 rbp = findblk(vp, loffset + i * blksize, FINDBLK_TEST);
331 if (rbp == NULL || (rbp->b_flags & B_CACHE) == 0) {
332 while (i) {
333 --i;
334 rbp = findblk(vp, loffset + i * blksize,
335 FINDBLK_TEST);
336 if (rbp) {
337 ++i;
338 break;
339 }
340 }
341 } else {
342 while (i < maxra) {
343 rbp = findblk(vp, loffset + i * blksize,
344 FINDBLK_TEST);
345 if (rbp == NULL)
346 break;
347 ++i;
348 }
349 }
350
351 /*
352 * We got everything or everything is in the cache, no
353 * point continuing.
354 */
355 if (i >= maxra)
356 return 0;
357
358 /*
359 * Calculate where to start the read-ahead and how much
360 * to do. Generally speaking we want to read-ahead by
361 * (maxra) when we've found a read-ahead mark. We do
362 * not want to reduce maxra here as it will cause
363 * successive read-ahead I/O's to be smaller and smaller.
364 *
365 * However, we have to make sure we don't break the
366 * filesize limitation for the clustered operation.
367 */
368 loffset += i * blksize;
369 reqbp = bp = NULL;
370
371 if (loffset >= filesize)
372 return 0;
373 if (loffset + maxra * blksize > filesize) {
374 maxreq = filesize - loffset;
375 maxra = (int)(maxreq / blksize);
376 }
377
378 /*
379 * Set RAM on first read-ahead block since we still have
381 * approximately maxra/2 blocks ahead of us that are already
381 * cached or in-progress.
382 */
383 sr = 1;
384 } else {
385 /*
386 * Start block is not valid, we will want to do a
387 * full read-ahead.
388 */
389 __debugvar off_t firstread = bp->b_loffset;
390 int nblks;
391
392 /*
393 * Set-up synchronous read for bp.
394 */
395 bp->b_cmd = BUF_CMD_READ;
396 bp->b_bio1.bio_done = biodone_sync;
397 bp->b_bio1.bio_flags |= BIO_SYNC;
398
399 KASSERT(firstread != NOOFFSET,
400 ("cluster_read: no buffer offset"));
401
402 nblks = calc_rbuild_reqsize(maxra, maxrbuild);
403
404 /*
405 * Set RAM half-way through the full-cluster.
406 */
407 sr = (maxra + 1) / 2;
408
409 if (nblks > 1) {
410 int burstbytes;
411
412 error = VOP_BMAP(vp, loffset, &doffset,
413 &burstbytes, NULL, BUF_CMD_READ);
414 if (error)
415 goto single_block_read;
416 if (nblks > burstbytes / blksize)
417 nblks = burstbytes / blksize;
418 if (doffset == NOOFFSET)
419 goto single_block_read;
420 if (nblks <= 1)
421 goto single_block_read;
422
423 bp = cluster_rbuild(vp, filesize, loffset,
424 doffset, blksize, nblks, bp, &sr);
425 loffset += bp->b_bufsize;
426 maxra -= bp->b_bufsize / blksize;
427 } else {
428single_block_read:
429 /*
430 * If it isn't in the cache, then get a chunk from
431 * disk if sequential, otherwise just get the block.
432 */
433 loffset += blksize;
434 --maxra;
435 }
436 }
437
438 /*
439 * If B_CACHE was not set issue bp. bp will either be an
440 * asynchronous cluster buf or a synchronous single-buf.
441 * If it is a single buf it will be the same as reqbp.
442 *
443 * NOTE: Once an async cluster buf is issued bp becomes invalid.
444 */
445 if (bp) {
446#if defined(CLUSTERDEBUG)
447 if (rcluster)
448 kprintf("S(%012jx,%d,%d)\n",
449 (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
450#endif
451 if ((bp->b_flags & B_CLUSTER) == 0)
452 vfs_busy_pages(vp, bp);
453 bp->b_flags &= ~(B_ERROR|B_INVAL);
454 vn_strategy(vp, &bp->b_bio1);
455 /* bp invalid now */
456 bp = NULL;
457 }
458
459#if defined(CLUSTERDEBUG)
460 if (rcluster)
461 kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
462 loffset, blksize, maxra, sr);
463#endif
464
465 /*
466 * If we have been doing sequential I/O, then do some read-ahead.
467 * The code above us should have positioned us at the next likely
468 * offset.
469 *
470 * Only mess with buffers which we can immediately lock. HAMMER
471 * will do device-readahead irrespective of what the blocks
472 * represent.
473 *
474 * Set B_RAM on the first buffer (the next likely offset needing
475 * read-ahead), under the assumption that there are still
476 * approximately maxra/2 good blocks ahead of us.
477 */
478 while (maxra > 0) {
479 int burstbytes;
480 int nblks;
481
482 rbp = getblk(vp, loffset, blksize,
483 GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
484#if defined(CLUSTERDEBUG)
485 if (rcluster) {
486 kprintf("read-ahead %016jx rbp=%p ",
487 loffset, rbp);
488 }
489#endif
490 if (rbp == NULL)
491 goto no_read_ahead;
492 if ((rbp->b_flags & B_CACHE)) {
493 bqrelse(rbp);
494 goto no_read_ahead;
495 }
496
497 /*
498 * If BMAP is not supported or has an issue, we still do
499 * (maxra) read-ahead, but we do not try to use rbuild.
500 */
501 error = VOP_BMAP(vp, loffset, &doffset,
502 &burstbytes, NULL, BUF_CMD_READ);
503 if (error || doffset == NOOFFSET) {
504 nblks = 1;
505 doffset = NOOFFSET;
506 } else {
507 nblks = calc_rbuild_reqsize(maxra, maxrbuild);
508 if (nblks > burstbytes / blksize)
509 nblks = burstbytes / blksize;
510 }
511 rbp->b_cmd = BUF_CMD_READ;
512
513 if (nblks > 1) {
514 rbp = cluster_rbuild(vp, filesize, loffset,
515 doffset, blksize,
516 nblks, rbp, &sr);
517 } else {
518 rbp->b_bio2.bio_offset = doffset;
519 if (--sr == 0)
520 cluster_setram(rbp);
521 }
522
523 rbp->b_flags &= ~(B_ERROR|B_INVAL);
524
525 if ((rbp->b_flags & B_CLUSTER) == 0)
526 vfs_busy_pages(vp, rbp);
527 BUF_KERNPROC(rbp);
528 loffset += rbp->b_bufsize;
529 maxra -= rbp->b_bufsize / blksize;
530 vn_strategy(vp, &rbp->b_bio1);
531 /* rbp invalid now */
532 }
533
534 /*
535 * Wait for our original buffer to complete its I/O. reqbp will
536 * be NULL if the original buffer was B_CACHE. We are returning
537 * (*bpp) which is the same as reqbp when reqbp != NULL.
538 */
539no_read_ahead:
540 if (reqbp) {
541 KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
542 error = biowait(&reqbp->b_bio1, "clurd");
543 } else {
544 error = 0;
545 }
546 return (error);
547}
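
/*
 * Caller sketch (illustrative; exact arguments depend on the filesystem).
 * The cluster_read() inline wrapper simply zeroes *bpp and calls
 * cluster_readx(), so a typical VOP_READ path looks roughly like:
 *
 *	struct buf *bp = NULL;
 *
 *	error = cluster_read(vp, filesize, loffset, blksize,
 *			     uio->uio_resid, seqcount * MAXBSIZE, &bp);
 *	if (error == 0)
 *		... copy data out of bp, then bqrelse(bp) ...
 */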
548
549/*
550 * This replaces breadcb(), providing an asynchronous read of the requested
551 * buffer with a callback, plus an asynchronous read-ahead within the
552 * specified bounds.
553 *
554 * The callback must check whether BIO_DONE is set in the bio and issue
555 * the bpdone(bp, 0) if it isn't. The callback is responsible for clearing
556 * BIO_DONE and disposing of the I/O (bqrelse()ing it).
557 *
558 * filesize - read-ahead @ blksize will not cross this boundary
559 * loffset - loffset for returned *bpp
560 * blksize - blocksize for returned *bpp and read-ahead bps
561 * minreq - minimum (not a hard minimum) in bytes, typically reflects
562 * a higher level uio resid.
563 * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
564 * bpp - return buffer (*bpp) for (loffset,blksize)
565 */
566void
567cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset,
568 int blksize, size_t minreq, size_t maxreq,
569 void (*func)(struct bio *), void *arg)
570{
571 struct buf *bp, *rbp, *reqbp;
572 off_t origoffset;
573 off_t doffset;
574 int i;
575 int maxra;
576 int maxrbuild;
577 int sr;
578
579 sr = 0;
580
581 /*
582 * Calculate the desired read-ahead in blksize'd blocks (maxra).
583 * To do this we calculate maxreq.
584 *
585 * maxreq typically starts out as a sequential heuristic. If the
586 * high level uio/resid is bigger (minreq), we pop maxreq up to
587 * minreq. This represents the case where random I/O is being
588 * performed and the userland is issuing big read()'s.
589 *
590 * Then we limit maxreq to max_readahead to ensure it is a reasonable
591 * value.
592 *
593 * Finally we must ensure that (loffset + maxreq) does not cross the
594 * boundary (filesize) for the current blocksize. If we allowed it
595 * to cross we could end up with buffers past the boundary with the
596 * wrong block size (HAMMER large-data areas use mixed block sizes).
597 * minreq is also absolutely limited to filesize.
598 */
599 if (maxreq < minreq)
600 maxreq = minreq;
601 /* minreq not used beyond this point */
602
603 if (maxreq > max_readahead) {
604 maxreq = max_readahead;
605 if (maxreq > 16 * 1024 * 1024)
606 maxreq = 16 * 1024 * 1024;
607 }
608 if (maxreq < blksize)
609 maxreq = blksize;
610 if (loffset + maxreq > filesize) {
611 if (loffset > filesize)
612 maxreq = 0;
613 else
614 maxreq = filesize - loffset;
615 }
616
617 maxra = (int)(maxreq / blksize);
618
619 /*
620 * Get the requested block.
621 */
622 reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
623 origoffset = loffset;
624
625 /*
626 * Calculate the maximum cluster size for a single I/O, used
627 * by cluster_rbuild().
628 */
629 maxrbuild = vmaxiosize(vp) / blksize;
630
631 /*
632 * if it is in the cache, then check to see if the reads have been
633 * sequential. If they have, then try some read-ahead, otherwise
634 * back-off on prospective read-aheads.
635 */
636 if (bp->b_flags & B_CACHE) {
637 /*
638 * Setup for func() call whether we do read-ahead or not.
639 */
640 bp->b_bio1.bio_caller_info1.ptr = arg;
641 bp->b_bio1.bio_flags |= BIO_DONE;
642
643 /*
644 * Not sequential, do not do any read-ahead
645 */
646 if (maxra <= 1)
647 goto no_read_ahead;
648
649 /*
650 * No read-ahead mark, do not do any read-ahead
651 * yet.
652 */
653 if ((bp->b_flags & B_RAM) == 0)
654 goto no_read_ahead;
655 bp->b_flags &= ~B_RAM;
656
657 /*
658 * We hit a read-ahead-mark, figure out how much read-ahead
659 * to do (maxra) and where to start (loffset).
660 *
661 * Shortcut the scan. Typically the way this works is that
662 * we've built up all the blocks in between except for the
663 * last in previous iterations, so if the second-to-last
664 * block is present we just skip ahead to it.
665 *
666 * This algorithm has O(1) cpu in the steady state no
667 * matter how large maxra is.
668 */
669 if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
670 i = maxra - 1;
671 else
672 i = 1;
673 while (i < maxra) {
674 if (findblk(vp, loffset + i * blksize,
675 FINDBLK_TEST) == NULL) {
676 break;
677 }
678 ++i;
679 }
680
681 /*
682 * We got everything or everything is in the cache, no
683 * point continuing.
684 */
685 if (i >= maxra)
686 goto no_read_ahead;
687
688 /*
689 * Calculate where to start the read-ahead and how much
690 * to do. Generally speaking we want to read-ahead by
691 * (maxra) when we've found a read-ahead mark. We do
692 * not want to reduce maxra here as it will cause
693 * successive read-ahead I/O's to be smaller and smaller.
694 *
695 * However, we have to make sure we don't break the
696 * filesize limitation for the clustered operation.
697 */
698 loffset += i * blksize;
699 bp = NULL;
700 /* leave reqbp intact to force function callback */
701
702 if (loffset >= filesize)
703 goto no_read_ahead;
704 if (loffset + maxra * blksize > filesize) {
705 maxreq = filesize - loffset;
706 maxra = (int)(maxreq / blksize);
707 }
708 sr = 1;
709 } else {
710 /*
711 * bp is not valid and no prior cluster is in progress, so get a
712 * full cluster read-ahead going.
713 */
714 __debugvar off_t firstread = bp->b_loffset;
715 int nblks;
716 int error;
717
718 /*
719 * Set-up synchronous read for bp.
720 */
721 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
722 bp->b_cmd = BUF_CMD_READ;
723 bp->b_bio1.bio_done = func;
724 bp->b_bio1.bio_caller_info1.ptr = arg;
725 BUF_KERNPROC(bp);
726 reqbp = NULL; /* don't func() reqbp, it's running async */
727
728 KASSERT(firstread != NOOFFSET,
729 ("cluster_read: no buffer offset"));
730
731 /*
732 * nblks is our cluster_rbuild request size, limited
733 * primarily by the device.
734 */
735 nblks = calc_rbuild_reqsize(maxra, maxrbuild);
736
737 /*
738 * Set RAM half-way through the full-cluster.
739 */
740 sr = (maxra + 1) / 2;
741
742 if (nblks > 1) {
743 int burstbytes;
744
745 error = VOP_BMAP(vp, loffset, &doffset,
746 &burstbytes, NULL, BUF_CMD_READ);
747 if (error)
748 goto single_block_read;
749 if (nblks > burstbytes / blksize)
750 nblks = burstbytes / blksize;
751 if (doffset == NOOFFSET)
752 goto single_block_read;
753 if (nblks <= 1)
754 goto single_block_read;
755
756 bp = cluster_rbuild(vp, filesize, loffset,
757 doffset, blksize, nblks, bp, &sr);
758 loffset += bp->b_bufsize;
759 maxra -= bp->b_bufsize / blksize;
760 } else {
761single_block_read:
762 /*
763 * If it isn't in the cache, then get a chunk from
764 * disk if sequential, otherwise just get the block.
765 */
766 loffset += blksize;
767 --maxra;
768 }
769 }
770
771 /*
772 * If bp != NULL then B_CACHE was *NOT* set and bp must be issued.
773 * bp will either be an asynchronous cluster buf or an asynchronous
774 * single-buf.
775 *
776 * NOTE: Once an async cluster buf is issued bp becomes invalid.
777 */
778 if (bp) {
779#if defined(CLUSTERDEBUG)
780 if (rcluster)
781 kprintf("S(%012jx,%d,%d)\n",
782 (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
783#endif
784 if ((bp->b_flags & B_CLUSTER) == 0)
785 vfs_busy_pages(vp, bp);
786 bp->b_flags &= ~(B_ERROR|B_INVAL);
787 vn_strategy(vp, &bp->b_bio1);
788 /* bp invalid now */
789 bp = NULL;
790 }
791
792#if defined(CLUSTERDEBUG)
793 if (rcluster)
794 kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
795 loffset, blksize, maxra, sr);
796#endif
797
798 /*
799 * If we have been doing sequential I/O, then do some read-ahead.
800 * The code above us should have positioned us at the next likely
801 * offset.
802 *
803 * Only mess with buffers which we can immediately lock. HAMMER
804 * will do device-readahead irrespective of what the blocks
805 * represent.
806 */
807 while (maxra > 0) {
808 int burstbytes;
809 int error;
810 int nblks;
811
812 rbp = getblk(vp, loffset, blksize,
813 GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
814 if (rbp == NULL)
815 goto no_read_ahead;
816 if ((rbp->b_flags & B_CACHE)) {
817 bqrelse(rbp);
818 goto no_read_ahead;
819 }
820
821 /*
822 * If BMAP is not supported or has an issue, we still do
823 * (maxra) read-ahead, but we do not try to use rbuild.
824 */
825 error = VOP_BMAP(vp, loffset, &doffset,
826 &burstbytes, NULL, BUF_CMD_READ);
827 if (error || doffset == NOOFFSET) {
828 nblks = 1;
829 doffset = NOOFFSET;
830 } else {
831 nblks = calc_rbuild_reqsize(maxra, maxrbuild);
832 if (nblks > burstbytes / blksize)
833 nblks = burstbytes / blksize;
834 }
835 rbp->b_cmd = BUF_CMD_READ;
836
837 if (nblks > 1) {
838 rbp = cluster_rbuild(vp, filesize, loffset,
839 doffset, blksize,
840 nblks, rbp, &sr);
841 } else {
842 rbp->b_bio2.bio_offset = doffset;
843 if (--sr == 0)
844 cluster_setram(rbp);
845 }
846
847 rbp->b_flags &= ~(B_ERROR|B_INVAL);
848
849 if ((rbp->b_flags & B_CLUSTER) == 0)
850 vfs_busy_pages(vp, rbp);
851 BUF_KERNPROC(rbp);
852 loffset += rbp->b_bufsize;
853 maxra -= rbp->b_bufsize / blksize;
854 vn_strategy(vp, &rbp->b_bio1);
855 /* rbp invalid now */
856 }
857
858 /*
859 * If reqbp is non-NULL it had B_CACHE set and we issue the
860 * function callback synchronously.
861 *
862 * Note that we may start additional asynchronous I/O before doing
863 * the func() callback for the B_CACHE case.
864 */
865no_read_ahead:
866 if (reqbp)
867 func(&reqbp->b_bio1);
868}
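
/*
 * Callback sketch (illustrative).  Per the contract above, func() must
 * bpdone() the buffer if BIO_DONE is not yet set (the B_CACHE path above
 * pre-sets it), then clear BIO_DONE and dispose of the buffer:
 *
 *	static void
 *	my_read_done(struct bio *bio)
 *	{
 *		struct buf *bp = bio->bio_buf;
 *
 *		if ((bio->bio_flags & BIO_DONE) == 0)
 *			bpdone(bp, 0);
 *		bio->bio_flags &= ~BIO_DONE;
 *		... consume bp->b_data, check B_ERROR ...
 *		bqrelse(bp);
 *	}
 */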
869
870/*
871 * If blocks are contiguous on disk, use this to provide clustered
872 * read ahead. We will read as many blocks as possible sequentially
873 * and then parcel them up into logical blocks in the buffer hash table.
874 *
875 * This function either returns a cluster buf or it returns fbp. fbp is
876 * already expected to be set up as a synchronous or asynchronous request.
877 *
878 * If a cluster buf is returned it will always be async.
879 *
880 * (*srp) counts down original blocks to determine where B_RAM should be set.
881 * Set B_RAM when *srp drops to 0. If (*srp) starts at 0, B_RAM will not be
882 * set on any buffer. Make sure B_RAM is cleared on any other buffers to
883 * prevent degenerate read-aheads from being generated.
884 */
885static struct buf *
886cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
887 int blksize, int run, struct buf *fbp, int *srp)
888{
889 struct buf *bp, *tbp;
890 off_t boffset;
891 int i, j;
892 int maxiosize = vmaxiosize(vp);
893
894 /*
895 * avoid a division
896 */
897 while (loffset + run * blksize > filesize) {
898 --run;
899 }
900
901 tbp = fbp;
902 tbp->b_bio2.bio_offset = doffset;
903 if ((tbp->b_flags & B_MALLOC) ||
904 ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
905 if (--*srp == 0)
906 cluster_setram(tbp);
907 else
908 cluster_clrram(tbp);
909 return tbp;
910 }
911
912 bp = trypbuf_kva(&cluster_pbuf_freecnt);
913 if (bp == NULL) {
914 return tbp;
915 }
916
917 /*
918 * We are synthesizing a buffer out of vm_page_t's, but
919 * if the block size is not page aligned then the starting
920 * address may not be either. Inherit the b_data offset
921 * from the original buffer.
922 */
923 bp->b_data = (char *)((vm_offset_t)bp->b_data |
924 ((vm_offset_t)tbp->b_data & PAGE_MASK));
925 bp->b_flags |= B_CLUSTER | B_VMIO;
926 bp->b_cmd = BUF_CMD_READ;
927 bp->b_bio1.bio_done = cluster_callback; /* default to async */
928 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
929 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
930 bp->b_loffset = loffset;
931 bp->b_bio2.bio_offset = doffset;
932 KASSERT(bp->b_loffset != NOOFFSET,
933 ("cluster_rbuild: no buffer offset"));
934
935 bp->b_bcount = 0;
936 bp->b_bufsize = 0;
937 bp->b_xio.xio_npages = 0;
938
939 for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
940 if (i) {
941 if ((bp->b_xio.xio_npages * PAGE_SIZE) +
942 round_page(blksize) > maxiosize) {
943 break;
944 }
945
946 /*
947 * Shortcut some checks and try to avoid buffers that
948 * would block in the lock. The same checks have to
949 * be made again after we officially get the buffer.
950 */
951 tbp = getblk(vp, loffset + i * blksize, blksize,
952 GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
953 if (tbp == NULL)
954 break;
955 for (j = 0; j < tbp->b_xio.xio_npages; j++) {
956 if (tbp->b_xio.xio_pages[j]->valid)
957 break;
958 }
959 if (j != tbp->b_xio.xio_npages) {
960 bqrelse(tbp);
961 break;
962 }
963
964 /*
965 * Stop scanning if the buffer is fully valid
966 * (marked B_CACHE), or locked (may be doing a
967 * background write), or if the buffer is not
968 * VMIO backed. The clustering code can only deal
969 * with VMIO-backed buffers.
970 */
971 if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
972 (tbp->b_flags & B_VMIO) == 0 ||
973 (LIST_FIRST(&tbp->b_dep) != NULL &&
974 buf_checkread(tbp))
975 ) {
976 bqrelse(tbp);
977 break;
978 }
979
980 /*
981 * The buffer must be completely invalid in order to
982 * take part in the cluster. If it is partially valid
983 * then we stop.
984 */
985 for (j = 0; j < tbp->b_xio.xio_npages; j++) {
986 if (tbp->b_xio.xio_pages[j]->valid)
987 break;
988 }
989 if (j != tbp->b_xio.xio_npages) {
990 bqrelse(tbp);
991 break;
992 }
993
994 /*
995 * Depress the priority of buffers not explicitly
996 * requested.
997 */
998 /* tbp->b_flags |= B_AGE; */
999
1000 /*
1001 * Set the block number if it isn't set, otherwise
1002 * make sure it matches the block number we
1003 * expect.
1004 */
1005 if (tbp->b_bio2.bio_offset == NOOFFSET) {
1006 tbp->b_bio2.bio_offset = boffset;
1007 } else if (tbp->b_bio2.bio_offset != boffset) {
1008 brelse(tbp);
1009 break;
1010 }
1011 }
1012
1013 /*
1014 * Set B_RAM if (*srp) is 1. B_RAM is only set on one buffer
1015 * in the cluster, including potentially the first buffer
1016 * once we start streaming the read-aheads.
1017 */
1018 if (--*srp == 0)
1019 cluster_setram(tbp);
1020 else
1021 cluster_clrram(tbp);
1022
1023 /*
1024 * The passed-in tbp (i == 0) will already be set up for
1025 * async or sync operation. All other tbp's acquired in
1026 * our loop are set up for async operation.
1027 */
1028 tbp->b_cmd = BUF_CMD_READ;
1029 BUF_KERNPROC(tbp);
1030 cluster_append(&bp->b_bio1, tbp);
1031 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
1032 vm_page_t m;
1033
1034 m = tbp->b_xio.xio_pages[j];
1035 vm_page_busy_wait(m, FALSE, "clurpg");
1036 vm_page_io_start(m);
1037 vm_page_wakeup(m);
1038 vm_object_pip_add(m->object, 1);
1039 if ((bp->b_xio.xio_npages == 0) ||
1040 (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
1041 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
1042 bp->b_xio.xio_npages++;
1043 }
1044 if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
1045 tbp->b_xio.xio_pages[j] = bogus_page;
1046 }
1047 /*
1048 * XXX shouldn't this be += size for both, like in
1049 * cluster_wbuild()?
1050 *
1051 * Don't inherit tbp->b_bufsize as it may be larger due to
1052 * a non-page-aligned size. Instead just aggregate using
1053 * 'blksize'.
1054 */
1055 if (tbp->b_bcount != blksize)
1056 kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
1057 if (tbp->b_bufsize != blksize)
1058 kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
1059 bp->b_bcount += blksize;
1060 bp->b_bufsize += blksize;
1061 }
1062
1063 /*
1064 * Fully valid pages in the cluster are already good and do not need
1065 * to be re-read from disk. Replace such pages with bogus_page.
1066 */
1067 for (j = 0; j < bp->b_xio.xio_npages; j++) {
1068 if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
1069 VM_PAGE_BITS_ALL) {
1070 bp->b_xio.xio_pages[j] = bogus_page;
1071 }
1072 }
1073 if (bp->b_bufsize > bp->b_kvasize) {
1074 panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
1075 bp->b_bufsize, bp->b_kvasize);
1076 }
1077 pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
1078 (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
1079 BUF_KERNPROC(bp);
1080 return (bp);
1081}
1082
1083/*
1084 * Cleanup after a clustered read or write.
1085 * This is complicated by the fact that any of the buffers might have
1086 * extra memory (if there were no empty buffer headers at allocbuf time)
1087 * that we will need to shift around.
1088 *
1089 * The returned bio is &bp->b_bio1
1090 */
1091static void
1092cluster_callback(struct bio *bio)
1093{
1094 struct buf *bp = bio->bio_buf;
1095 struct buf *tbp;
1096 int error = 0;
1097
1098 /*
1099 * Must propagate errors to all the components. A short read (EOF)
1100 * is a critical error.
1101 */
1102 if (bp->b_flags & B_ERROR) {
1103 error = bp->b_error;
1104 } else if (bp->b_bcount != bp->b_bufsize) {
1105 panic("cluster_callback: unexpected EOF on cluster %p!", bio);
1106 }
1107
1108 pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
1109 bp->b_xio.xio_npages);
1110 /*
1111 * Move memory from the large cluster buffer into the component
1112 * buffers and mark IO as done on these. Since the memory map
1113 * is the same, no actual copying is required.
1114 */
1115 while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
1116 bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
1117 if (error) {
1118 tbp->b_flags |= B_ERROR | B_IOISSUED;
1119 tbp->b_error = error;
1120 } else {
1121 tbp->b_dirtyoff = tbp->b_dirtyend = 0;
1122 tbp->b_flags &= ~(B_ERROR|B_INVAL);
1123 tbp->b_flags |= B_IOISSUED;
1124 /*
1125 * XXX the bdwrite()/bqrelse() issued during
1126 * cluster building clears B_RELBUF (see bqrelse()
1127 * comment). If direct I/O was specified, we have
1128 * to restore it here to allow the buffer and VM
1129 * to be freed.
1130 */
1131 if (tbp->b_flags & B_DIRECT)
1132 tbp->b_flags |= B_RELBUF;
1133
1134 /*
1135 * XXX I think biodone() below will do this, but do
1136 * it here anyway for consistency.
1137 */
1138 if (tbp->b_cmd == BUF_CMD_WRITE)
1139 bundirty(tbp);
1140 }
1141 biodone(&tbp->b_bio1);
1142 }
1143 relpbuf(bp, &cluster_pbuf_freecnt);
1144}
1145
1146/*
1147 * Implement modified write build for cluster.
1148 *
1149 * write_behind = 0 write behind disabled
1150 * write_behind = 1 write behind normal (default)
1151 * write_behind = 2 write behind backed-off
1152 *
1153 * In addition, write_behind is only activated for files that have
1154 * grown past a certain size (default 10MB). Otherwise temporary files
1155 * wind up generating a lot of unnecessary disk I/O.
1156 */
1157static __inline int
1158cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
1159{
1160 int r = 0;
1161
1162 switch(write_behind) {
1163 case 2:
1164 if (start_loffset < len)
1165 break;
1166 start_loffset -= len;
1167 /* fall through */
1168 case 1:
1169 if (vp->v_filesize >= write_behind_minfilesize) {
1170 r = cluster_wbuild(vp, NULL, blksize,
1171 start_loffset, len);
1172 }
1173 /* fall through */
1174 default:
1176 break;
1177 }
1178 return(r);
1179}
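
/*
 * Example (illustrative): with write_behind = 2 and a 64KB cluster just
 * completed at start_loffset, the flush is issued for the previous 64KB
 * window (start_loffset - len), keeping one cluster of slack behind the
 * current write point.  write_behind = 1 flushes the window passed in.
 */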
1180
1181/*
1182 * Do clustered write for FFS.
1183 *
1184 * Four cases:
1185 * 1. Write is not sequential (write asynchronously)
1186 * Write is sequential:
1187 * 2. beginning of cluster - begin cluster
1188 * 3. middle of a cluster - add to cluster
1189 * 4. end of a cluster - asynchronously write cluster
1190 *
1191 * WARNING! vnode fields are not locked and must ONLY be used heuristically.
1192 */
1193void
1194cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
1195{
1196 struct vnode *vp;
1197 off_t loffset;
1198 int maxclen, cursize;
1199 int async;
1200 cluster_cache_t dummy;
1201 cluster_cache_t *cc;
1202
1203 vp = bp->b_vp;
1204 if (vp->v_type == VREG)
1205 async = vp->v_mount->mnt_flag & MNT_ASYNC;
1206 else
1207 async = 0;
1208 loffset = bp->b_loffset;
1209 KASSERT(bp->b_loffset != NOOFFSET,
1210 ("cluster_write: no buffer offset"));
1211
1212 cc = cluster_getcache(&dummy, vp, loffset);
1213
1214 /*
1215 * Initialize vnode to beginning of file.
1216 */
1217 if (loffset == 0)
1218 cc->v_lasta = cc->v_clen = cc->v_cstart = cc->v_lastw = 0;
1219
1220 if (cc->v_clen == 0 || loffset != cc->v_lastw + blksize ||
1221 bp->b_bio2.bio_offset == NOOFFSET ||
1222 (bp->b_bio2.bio_offset != cc->v_lasta + blksize)) {
1223 maxclen = vmaxiosize(vp);
1224 if (cc->v_clen != 0) {
1225 /*
1226 * Next block is not sequential.
1227 *
1228 * If we are not writing at end of file, the process
1229 * seeked to another point in the file since its last
1230 * write, or we have reached our maximum cluster size,
1231 * then push the previous cluster. Otherwise try
1232 * reallocating to make it sequential.
1233 *
1234 * Change to algorithm: only push previous cluster if
1235 * it was sequential from the point of view of the
1236 * seqcount heuristic, otherwise leave the buffer
1237 * intact so we can potentially optimize the I/O
1238 * later on in the buf_daemon or update daemon
1239 * flush.
1240 */
1241 cursize = cc->v_lastw - cc->v_cstart + blksize;
1242 if (bp->b_loffset + blksize < filesize ||
1243 loffset != cc->v_lastw + blksize ||
1244 cc->v_clen <= cursize) {
1245 if (!async && seqcount > 0) {
1246 cluster_wbuild_wb(vp, blksize,
1247 cc->v_cstart, cursize);
1248 }
1249 } else {
1250 struct buf **bpp, **endbp;
1251 struct cluster_save *buflist;
1252
1253 buflist = cluster_collectbufs(cc, vp,
1254 bp, blksize);
1255 endbp = &buflist->bs_children
1256 [buflist->bs_nchildren - 1];
1257 if (VOP_REALLOCBLKS(vp, buflist)) {
1258 /*
1259 * Failed, push the previous cluster
1260 * if *really* writing sequentially
1261 * in the logical file (seqcount > 1),
1262 * otherwise delay it in the hopes that
1263 * the low level disk driver can
1264 * optimize the write ordering.
1265 *
1266 * NOTE: We do not brelse the last
1267 * element which is bp, and we
1268 * do not return here.
1269 */
1270 for (bpp = buflist->bs_children;
1271 bpp < endbp; bpp++)
1272 brelse(*bpp);
1273 kfree(buflist, M_SEGMENT);
1274 if (seqcount > 1) {
1275 cluster_wbuild_wb(vp,
1276 blksize, cc->v_cstart,
1277 cursize);
1278 }
1279 } else {
1280 /*
1281 * Succeeded, keep building cluster.
1282 */
1283 for (bpp = buflist->bs_children;
1284 bpp <= endbp; bpp++)
1285 bdwrite(*bpp);
1286 kfree(buflist, M_SEGMENT);
1287 cc->v_lastw = loffset;
1288 cc->v_lasta = bp->b_bio2.bio_offset;
1289 cluster_putcache(cc);
1290 return;
1291 }
1292 }
1293 }
1294 /*
1295 * Consider beginning a cluster. If at end of file, make
1296 * cluster as large as possible, otherwise find size of
1297 * existing cluster.
1298 */
1299 if ((vp->v_type == VREG) &&
1300 bp->b_loffset + blksize < filesize &&
1301 (bp->b_bio2.bio_offset == NOOFFSET) &&
1302 (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
1303 bp->b_bio2.bio_offset == NOOFFSET)) {
1304 bdwrite(bp);
1305 cc->v_clen = 0;
1306 cc->v_lasta = bp->b_bio2.bio_offset;
1307 cc->v_cstart = loffset + blksize;
1308 cc->v_lastw = loffset;
1309 cluster_putcache(cc);
1310 return;
1311 }
1312 if (maxclen > blksize)
1313 cc->v_clen = maxclen - blksize;
1314 else
1315 cc->v_clen = 0;
1316 if (!async && cc->v_clen == 0) { /* I/O not contiguous */
1317 cc->v_cstart = loffset + blksize;
1318 bdwrite(bp);
1319 } else { /* Wait for rest of cluster */
1320 cc->v_cstart = loffset;
1321 bdwrite(bp);
1322 }
1323 } else if (loffset == cc->v_cstart + cc->v_clen) {
1324 /*
1325 * At end of cluster, write it out if seqcount tells us we
1326 * are operating sequentially, otherwise let the buf or
1327 * update daemon handle it.
1328 */
1329 bdwrite(bp);
1330 if (seqcount > 1)
1331 cluster_wbuild_wb(vp, blksize, cc->v_cstart,
1332 cc->v_clen + blksize);
1333 cc->v_clen = 0;
1334 cc->v_cstart = loffset + blksize;
1335 } else if (vm_page_count_severe() &&
1336 bp->b_loffset + blksize < filesize) {
1337 /*
1338 * We are low on memory, get it going NOW. However, do not
1339 * try to push out a partial block at the end of the file
1340 * as this could lead to extremely non-optimal write activity.
1341 */
1342 bawrite(bp);
1343 } else {
1344 /*
1345 * In the middle of a cluster, so just delay the I/O for now.
1346 */
1347 bdwrite(bp);
1348 }
1349 cc->v_lastw = loffset;
1350 cc->v_lasta = bp->b_bio2.bio_offset;
1351 cluster_putcache(cc);
1352}
1353
1354/*
1355 * This is the clustered version of bawrite(). It works similarly to
1356 * cluster_write() except I/O on the buffer is guaranteed to occur.
1357 */
1358int
1359cluster_awrite(struct buf *bp)
1360{
1361 int total;
1362
1363 /*
1364 * Don't bother if it isn't clusterable.
1365 */
1366 if ((bp->b_flags & B_CLUSTEROK) == 0 ||
1367 bp->b_vp == NULL ||
1368 (bp->b_vp->v_flag & VOBJBUF) == 0) {
1369 total = bp->b_bufsize;
1370 bawrite(bp);
1371 return (total);
1372 }
1373
1374 total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
1375 bp->b_loffset, vmaxiosize(bp->b_vp));
1376 if (bp)
1377 bawrite(bp);
1378
1379 return total;
1380}
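
/*
 * Usage note: unlike cluster_write(), the buffer is always written (or
 * folded into a cluster that is written), so the caller gives up bp
 * unconditionally.  Caller sketch (illustrative) accounting for the I/O
 * it has started:
 *
 *	bytes_initiated += cluster_awrite(bp);
 *	... bp must not be referenced after this point ...
 */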
1381
1382/*
1383 * This is an awful lot like cluster_rbuild...wish they could be combined.
1384 * The last lbn argument is the current block on which I/O is being
1385 * performed. Check to see that it doesn't fall in the middle of
1386 * the current block (if last_bp == NULL).
1387 *
1388 * cluster_wbuild() normally does not guarantee anything. If bpp is
1389 * non-NULL and cluster_wbuild() is able to incorporate it into the
1390 * I/O it will set *bpp to NULL, otherwise it will leave it alone and
1391 * the caller must dispose of *bpp.
1392 */
1393static int
1394cluster_wbuild(struct vnode *vp, struct buf **bpp,
1395 int blksize, off_t start_loffset, int bytes)
1396{
1397 struct buf *bp, *tbp;
1398 int i, j;
1399 int totalwritten = 0;
1400 int must_initiate;
1401 int maxiosize = vmaxiosize(vp);
1402
1403 while (bytes > 0) {
1404 /*
1405 * If the buffer matches the passed locked & removed buffer
1406 * we use the passed buffer (which might not be B_DELWRI).
1407 *
1408 * Otherwise locate the buffer and determine if it is
1409 * compatible.
1410 */
1411 if (bpp && (*bpp)->b_loffset == start_loffset) {
1412 tbp = *bpp;
1413 *bpp = NULL;
1414 bpp = NULL;
1415 } else {
1416 tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
1417 if (tbp == NULL ||
1418 (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
1419 B_DELWRI ||
1420 (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
1421 if (tbp)
1422 BUF_UNLOCK(tbp);
1423 start_loffset += blksize;
1424 bytes -= blksize;
1425 continue;
1426 }
1427 bremfree(tbp);
1428 }
1429 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
1430
1431 /*
1432 * Extra memory in the buffer, punt on this buffer.
1433 * XXX we could handle this in most cases, but we would
1434 * have to push the extra memory down to after our max
1435 * possible cluster size and then potentially pull it back
1436 * up if the cluster was terminated prematurely--too much
1437 * hassle.
1438 */
1439 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
1440 (tbp->b_bcount != tbp->b_bufsize) ||
1441 (tbp->b_bcount != blksize) ||
1442 (bytes == blksize) ||
1443 ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
1444 totalwritten += tbp->b_bufsize;
1445 bawrite(tbp);
1446 start_loffset += blksize;
1447 bytes -= blksize;
1448 continue;
1449 }
1450
1451 /*
1452 * Set up the pbuf. Track our append point with b_bcount
1453 * and b_bufsize. b_bufsize is not used by the device but
1454 * our caller uses it to loop clusters and we use it to
1455 * detect a premature EOF on the block device.
1456 */
1457 bp->b_bcount = 0;
1458 bp->b_bufsize = 0;
1459 bp->b_xio.xio_npages = 0;
1460 bp->b_loffset = tbp->b_loffset;
1461 bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
1462
1463 /*
1464 * We are synthesizing a buffer out of vm_page_t's, but
1465 * if the block size is not page aligned then the starting
1466 * address may not be either. Inherit the b_data offset
1467 * from the original buffer.
1468 */
1469 bp->b_data = (char *)((vm_offset_t)bp->b_data |
1470 ((vm_offset_t)tbp->b_data & PAGE_MASK));
1471 bp->b_flags &= ~B_ERROR;
1472 bp->b_flags |= B_CLUSTER | B_BNOCLIP |
1473 (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
1474 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
1475 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
1476
1477 /*
1478 * From this location in the file, scan forward to see
1479 * if there are buffers with adjacent data that need to
1480 * be written as well.
1481 *
1482 * IO *must* be initiated on index 0 at this point
1483 * (particularly when called from cluster_awrite()).
1484 */
1485 for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
1486 if (i == 0) {
1487 must_initiate = 1;
1488 } else {
1489 /*
1490 * Not first buffer.
1491 */
1492 must_initiate = 0;
1493 tbp = findblk(vp, start_loffset,
1494 FINDBLK_NBLOCK);
1495 /*
1496 * Buffer not found or could not be locked
1497 * non-blocking.
1498 */
1499 if (tbp == NULL)
1500 break;
1501
1502 /*
1503 * If it IS in core, but has different
1504 * characteristics, then don't cluster
1505 * with it.
1506 */
1507 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
1508 B_INVAL | B_DELWRI | B_NEEDCOMMIT))
1509 != (B_DELWRI | B_CLUSTEROK |
1510 (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
1511 (tbp->b_flags & B_LOCKED)
1512 ) {
1513 BUF_UNLOCK(tbp);
1514 break;
1515 }
1516
1517 /*
1518 * Check that the combined cluster
1519 * would make sense with regard to pages
1520 * and would not be too large
1521 *
1522 * WARNING! buf_checkwrite() must be the last
1523 * check made. If it returns 0 then
1524 * we must initiate the I/O.
1525 */
1526 if ((tbp->b_bcount != blksize) ||
1527 ((bp->b_bio2.bio_offset + i) !=
1528 tbp->b_bio2.bio_offset) ||
1529 ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
1530 (maxiosize / PAGE_SIZE)) ||
1531 (LIST_FIRST(&tbp->b_dep) &&
1532 buf_checkwrite(tbp))
1533 ) {
1534 BUF_UNLOCK(tbp);
1535 break;
1536 }
1537 if (LIST_FIRST(&tbp->b_dep))
1538 must_initiate = 1;
1539 /*
1540 * Ok, it's passed all the tests,
1541 * so remove it from the free list
1542 * and mark it busy. We will use it.
1543 */
1544 bremfree(tbp);
1545 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
1546 }
1547
1548 /*
1549 * If the IO is via the VM then we do some
1550 * special VM hackery (yuck). Since the buffer's
1551 * block size may not be page-aligned it is possible
1552 * for a page to be shared between two buffers. We
1553 * have to get rid of the duplication when building
1554 * the cluster.
1555 */
1556 if (tbp->b_flags & B_VMIO) {
1557 vm_page_t m;
1558
1559 /*
1560 * Try to avoid deadlocks with the VM system.
1561 * However, we cannot abort the I/O if
1562 * must_initiate is non-zero.
1563 */
1564 if (must_initiate == 0) {
1565 for (j = 0;
1566 j < tbp->b_xio.xio_npages;
1567 ++j) {
1568 m = tbp->b_xio.xio_pages[j];
1569 if (m->flags & PG_BUSY) {
1570 bqrelse(tbp);
1571 goto finishcluster;
1572 }
1573 }
1574 }
1575
1576 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
1577 m = tbp->b_xio.xio_pages[j];
1578 vm_page_busy_wait(m, FALSE, "clurpg");
1579 vm_page_io_start(m);
1580 vm_page_wakeup(m);
1581 vm_object_pip_add(m->object, 1);
1582 if ((bp->b_xio.xio_npages == 0) ||
1583 (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
1584 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
1585 bp->b_xio.xio_npages++;
1586 }
1587 }
1588 }
1589 bp->b_bcount += blksize;
1590 bp->b_bufsize += blksize;
1591
1592 /*
1593 * NOTE: see bwrite/bawrite code for why we no longer
1594 * undirty tbp here.
1595 *
1596 * bundirty(tbp); REMOVED
1597 */
1598 tbp->b_flags &= ~B_ERROR;
1599 tbp->b_cmd = BUF_CMD_WRITE;
1600 BUF_KERNPROC(tbp);
1601 cluster_append(&bp->b_bio1, tbp);
1602
1603 /*
1604 * check for latent dependencies to be handled
1605 */
1606 if (LIST_FIRST(&tbp->b_dep) != NULL)
1607 buf_start(tbp);
1608 }
1609 finishcluster:
1610 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
1611 (vm_page_t *)bp->b_xio.xio_pages,
1612 bp->b_xio.xio_npages);
1613 if (bp->b_bufsize > bp->b_kvasize) {
1614 panic("cluster_wbuild: b_bufsize(%d) "
1615 "> b_kvasize(%d)\n",
1616 bp->b_bufsize, bp->b_kvasize);
1617 }
1618 totalwritten += bp->b_bufsize;
1619 bp->b_dirtyoff = 0;
1620 bp->b_dirtyend = bp->b_bufsize;
1621 bp->b_bio1.bio_done = cluster_callback;
1622 bp->b_cmd = BUF_CMD_WRITE;
1623
1624 vfs_busy_pages(vp, bp);
1625 bsetrunningbufspace(bp, bp->b_bufsize);
1626 BUF_KERNPROC(bp);
1627 vn_strategy(vp, &bp->b_bio1);
1628
1629 bytes -= i;
1630 }
1631 return totalwritten;
1632}
1633
1634/*
1635 * Collect together all the buffers in a cluster, plus add one
1636 * additional buffer passed-in.
1637 *
1638 * Only pre-existing buffers whose block size matches blksize are collected.
1639 * (this is primarily because HAMMER1 uses varying block sizes and we don't
1640 * want to override its choices).
1641 *
1642 * This code will not try to collect buffers that it cannot lock, otherwise
1643 * it might deadlock against SMP-friendly filesystems.
1644 */
1645static struct cluster_save *
1646cluster_collectbufs(cluster_cache_t *cc, struct vnode *vp,
1647 struct buf *last_bp, int blksize)
1648{
1649 struct cluster_save *buflist;
1650 struct buf *bp;
1651 off_t loffset;
1652 int i, len;
1653 int j;
1654 int k;
1655
1656 len = (int)(cc->v_lastw - cc->v_cstart + blksize) / blksize;
1657 KKASSERT(len > 0);
1658 buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1659 M_SEGMENT, M_WAITOK);
1660 buflist->bs_nchildren = 0;
1661 buflist->bs_children = (struct buf **) (buflist + 1);
1662 for (loffset = cc->v_cstart, i = 0, j = 0;
1663 i < len;
1664 (loffset += blksize), i++) {
1665 bp = getcacheblk(vp, loffset,
1666 last_bp->b_bcount, GETBLK_SZMATCH |
1667 GETBLK_NOWAIT);
1668 buflist->bs_children[i] = bp;
1669 if (bp == NULL) {
1670 j = i + 1;
1671 } else if (bp->b_bio2.bio_offset == NOOFFSET) {
1672 VOP_BMAP(bp->b_vp, bp->b_loffset,
1673 &bp->b_bio2.bio_offset,
1674 NULL, NULL, BUF_CMD_WRITE);
1675 }
1676 }
1677
1678 /*
1679 * Get rid of gaps
1680 */
1681 for (k = 0; k < j; ++k) {
1682 if (buflist->bs_children[k]) {
1683 bqrelse(buflist->bs_children[k]);
1684 buflist->bs_children[k] = NULL;
1685 }
1686 }
1687 if (j != 0) {
1688 if (j != i) {
1689 bcopy(buflist->bs_children + j,
1690 buflist->bs_children + 0,
1691 sizeof(buflist->bs_children[0]) * (i - j));
1692 }
1693 i -= j;
1694 }
1695 buflist->bs_children[i] = bp = last_bp;
1696 if (bp->b_bio2.bio_offset == NOOFFSET) {
1697 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1698 NULL, NULL, BUF_CMD_WRITE);
1699 }
1700 buflist->bs_nchildren = i + 1;
1701 return (buflist);
1702}
1703
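/*
 * Append tbp to the singly-linked list of component buffers hanging off
 * the cluster pbuf's bio (cluster_head / cluster_tail).  The list is
 * walked by cluster_callback() to complete each component buffer when
 * the cluster I/O finishes.
 */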
1704void
1705cluster_append(struct bio *bio, struct buf *tbp)
1706{
1707 tbp->b_cluster_next = NULL;
1708 if (bio->bio_caller_info1.cluster_head == NULL) {
1709 bio->bio_caller_info1.cluster_head = tbp;
1710 bio->bio_caller_info2.cluster_tail = tbp;
1711 } else {
1712 bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
1713 bio->bio_caller_info2.cluster_tail = tbp;
1714 }
1715}
1716
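/*
 * Set or clear the read-ahead mark on a buffer.  B_RAM is what the
 * read-ahead heuristics in cluster_readx()/cluster_readcb() key on;
 * PG_RAM on the first underlying page mirrors the hint for the VM
 * page-fault path.
 */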
1717static
1718void
1719cluster_setram(struct buf *bp)
1720{
1721 bp->b_flags |= B_RAM;
1722 if (bp->b_xio.xio_npages)
1723 vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
1724}
1725
1726static
1727void
1728cluster_clrram(struct buf *bp)
1729{
1730 bp->b_flags &= ~B_RAM;
1731 if (bp->b_xio.xio_npages)
1732 vm_page_flag_clear(bp->b_xio.xio_pages[0], PG_RAM);
1733}