Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /*- |
2 | * Copyright (c) 1993 | |
3 | * The Regents of the University of California. All rights reserved. | |
4 | * Modifications/enhancements: | |
5 | * Copyright (c) 1995 John S. Dyson. All rights reserved. | |
38a4b308 | 6 | * Copyright (c) 2012-2013 Matthew Dillon. All rights reserved. |
984263bc MD |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without | |
9 | * modification, are permitted provided that the following conditions | |
10 | * are met: | |
11 | * 1. Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in the | |
15 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 16 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
17 | * may be used to endorse or promote products derived from this software |
18 | * without specific prior written permission. | |
19 | * | |
20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
30 | * SUCH DAMAGE. | |
984263bc MD |
31 | */ |
32 | ||
33 | #include "opt_debug_cluster.h" | |
34 | ||
35 | #include <sys/param.h> | |
36 | #include <sys/systm.h> | |
37 | #include <sys/kernel.h> | |
38 | #include <sys/proc.h> | |
39 | #include <sys/buf.h> | |
40 | #include <sys/vnode.h> | |
41 | #include <sys/malloc.h> | |
42 | #include <sys/mount.h> | |
43 | #include <sys/resourcevar.h> | |
44 | #include <sys/vmmeter.h> | |
45 | #include <vm/vm.h> | |
46 | #include <vm/vm_object.h> | |
47 | #include <vm/vm_page.h> | |
48 | #include <sys/sysctl.h> | |
54341a3b | 49 | |
3020e3be | 50 | #include <sys/buf2.h> |
12e4aaff | 51 | #include <vm/vm_page2.h> |
984263bc | 52 | |
e54488bb MD |
53 | #include <machine/limits.h> |
54 | ||
38a4b308 MD |
55 | /* |
56 | * Cluster tracking cache - replaces the original vnode v_* fields which had | |
57 | * limited utility and were not MP safe. | |
58 | * | |
59 | * The cluster tracking cache is a simple 4-way set-associative non-chained | |
60 | * cache. It is capable of tracking up to four zones separated by 1MB or | |
61 | * more per vnode. | |
62 | * | |
63 | * NOTE: We want this structure to be cache-line friendly so the iterator | |
64 | * is embedded rather than in a separate array. | |
65 | * | |
66 | * NOTE: A cluster cache entry can become stale when a vnode is recycled. | |
67 | * For now we treat the values as heuristic but also self-consistent,
68 | * i.e. the values cannot be completely random and cannot be SMP unsafe
69 | * or the cluster code might end up clustering non-contiguous buffers
70 | * at the wrong offsets. | |
71 | */ | |
72 | struct cluster_cache { | |
73 | struct vnode *vp; | |
74 | u_int locked; | |
cf297f2c MD |
75 | off_t v_lastw; /* last write (end) (write cluster) */ |
76 | off_t v_cstart; /* start block (beg) of cluster */ | |
77 | off_t v_lasta; /* last allocation (end) */ | |
38a4b308 MD |
78 | u_int v_clen; /* length of current cluster */ |
79 | u_int iterator; | |
80 | } __cachealign; | |
81 | ||
82 | typedef struct cluster_cache cluster_cache_t; | |
83 | ||
84 | #define CLUSTER_CACHE_SIZE 512 | |
85 | #define CLUSTER_CACHE_MASK (CLUSTER_CACHE_SIZE - 1) | |
86 | ||
87 | #define CLUSTER_ZONE ((off_t)(1024 * 1024)) | |
88 | ||
89 | cluster_cache_t cluster_array[CLUSTER_CACHE_SIZE]; | |
90 | ||
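As an aside, the zone-membership test used by the lookup code below reads: two offsets share a tracked zone exactly when their bits above the 1MB boundary agree. A minimal sketch (hypothetical helper, mirroring the rounddown2() expression in cluster_getcache()):

```c
/*
 * Editorial sketch, not part of the original file.  E.g. offsets
 * 0x10000 and 0xf0000 share a zone (their XOR is below CLUSTER_ZONE),
 * while 0x10000 and 0x110000 do not.
 */
static __inline int
same_cluster_zone(off_t a, off_t b)
{
	return (rounddown2(a ^ b, CLUSTER_ZONE) == 0);
}
```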
984263bc | 91 | #if defined(CLUSTERDEBUG) |
984263bc MD |
92 | static int rcluster = 0;
93 | SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); | |
94 | #endif | |
95 | ||
d1cd9d97 | 96 | static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer"); |
984263bc MD |
97 | |
98 | static struct cluster_save * | |
38a4b308 MD |
99 | cluster_collectbufs (cluster_cache_t *cc, struct vnode *vp, |
100 | struct buf *last_bp, int blksize); | |
984263bc | 101 | static struct buf * |
54078292 | 102 | cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset, |
e92ca23a | 103 | off_t doffset, int blksize, int run, |
cb1fa82f | 104 | struct buf *fbp, int *srp); |
81b5c339 | 105 | static void cluster_callback (struct bio *); |
cf1bb2a8 | 106 | static void cluster_setram (struct buf *); |
cb1fa82f | 107 | static void cluster_clrram (struct buf *); |
9de13b88 MD |
108 | static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize, |
109 | off_t start_loffset, int bytes); | |
984263bc MD |
110 | |
111 | static int write_behind = 1; | |
093e85dc SG |
112 | SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, |
113 | "Cluster write-behind setting"); | |
504ea70e MD |
114 | static quad_t write_behind_minfilesize = 10 * 1024 * 1024; |
115 | SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW, | |
116 | &write_behind_minfilesize, 0, "Cluster write-behind setting"); | |
364c022c | 117 | static int max_readahead = 2 * 1024 * 1024; |
093e85dc SG |
118 | SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0, |
119 | "Limit in bytes for desired cluster read-ahead"); | |
984263bc MD |
120 | |
121 | extern vm_page_t bogus_page; | |
122 | ||
cb1fa82f MD |
123 | /* |
124 | * nblks is our cluster_rbuild request size. The approximate number of | |
125 | * physical read-ahead requests is maxra / nblks. The physical request | |
126 | * size is limited by the device (maxrbuild). We also do not want to make | |
127 | * the request size too big or it will mess up the B_RAM streaming. | |
128 | */ | |
129 | static __inline | |
130 | int | |
131 | calc_rbuild_reqsize(int maxra, int maxrbuild) | |
132 | { | |
133 | int nblks; | |
134 | ||
135 | if ((nblks = maxra / 4) > maxrbuild) | |
136 | nblks = maxrbuild; | |
137 | if (nblks < 1) | |
138 | nblks = maxra; | |
139 | return nblks; | |
140 | } | |
141 | ||
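As a worked example of the sizing above (editorial aside): with maxra = 64 blocks of desired read-ahead and a device limit of maxrbuild = 32 blocks, calc_rbuild_reqsize() returns min(64 / 4, 32) = 16, so the stream is serviced by roughly maxra / nblks = 4 overlapping physical requests.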
38a4b308 MD |
142 | /* |
143 | * Acquire/release cluster cache (can return dummy entry) | |
144 | */ | |
145 | static | |
146 | cluster_cache_t * | |
147 | cluster_getcache(cluster_cache_t *dummy, struct vnode *vp, off_t loffset) | |
148 | { | |
149 | cluster_cache_t *cc; | |
150 | size_t hv; | |
151 | int i; | |
152 | int xact; | |
153 | ||
154 | hv = (size_t)(intptr_t)vp ^ (size_t)(intptr_t)vp / sizeof(*vp); | |
155 | hv &= CLUSTER_CACHE_MASK & ~3; | |
156 | cc = &cluster_array[hv]; | |
157 | ||
158 | xact = -1; | |
159 | for (i = 0; i < 4; ++i) { | |
160 | if (cc[i].vp != vp) | |
161 | continue; | |
3f7b7260 | 162 | if (rounddown2(cc[i].v_cstart ^ loffset, CLUSTER_ZONE) == 0) { |
38a4b308 MD |
163 | xact = i; |
164 | break; | |
165 | } | |
166 | } | |
167 | if (xact >= 0 && atomic_swap_int(&cc[xact].locked, 1) == 0) { | |
168 | if (cc[xact].vp == vp && | |
3f7b7260 | 169 | rounddown2(cc[xact].v_cstart ^ loffset, CLUSTER_ZONE) == 0) {
38a4b308 MD |
170 | return(&cc[xact]); |
171 | } | |
172 | atomic_swap_int(&cc[xact].locked, 0); | |
173 | } | |
174 | ||
175 | /* | |
176 | * New entry. If we can't acquire the cache line then use the | |
177 | * passed-in dummy element and reset all fields. | |
178 | * | |
179 | * When we are able to acquire the cache line we only clear the | |
180 | * fields if the vp does not match. This allows us to multi-zone | |
181 | * a vp and for excessive zones / partial clusters to be retired. | |
182 | */ | |
183 | i = cc->iterator++ & 3; | |
184 | cc += i; | |
185 | if (atomic_swap_int(&cc->locked, 1) != 0) { | |
186 | cc = dummy; | |
187 | cc->locked = 1; | |
188 | cc->vp = NULL; | |
189 | } | |
190 | if (cc->vp != vp) { | |
191 | cc->vp = vp; | |
192 | cc->v_lasta = 0; | |
193 | cc->v_clen = 0; | |
194 | cc->v_cstart = 0; | |
195 | cc->v_lastw = 0; | |
196 | } | |
197 | return(cc); | |
198 | } | |
199 | ||
200 | static | |
201 | void | |
202 | cluster_putcache(cluster_cache_t *cc) | |
203 | { | |
204 | atomic_swap_int(&cc->locked, 0); | |
205 | } | |
206 | ||
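The pair is used bracket-style. A minimal caller sketch (hypothetical, modeled on the update sequence cluster_write() performs further down), showing the on-stack dummy that absorbs contention:

```c
/*
 * Editorial sketch, not part of the original file.  cluster_getcache()
 * may return the caller-supplied dummy when the cache line is busy, so
 * the update below is always safe but only sometimes remembered.
 */
static void
example_track_write(struct vnode *vp, off_t loffset, int blksize)
{
	cluster_cache_t dummy;
	cluster_cache_t *cc;

	cc = cluster_getcache(&dummy, vp, loffset);
	cc->v_lastw = loffset + blksize;	/* heuristic only */
	cluster_putcache(cc);
}
```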
984263bc | 207 | /* |
dbb11a6e MD |
208 | * This replaces bread(), providing a synchronous read of the requested |
209 | * buffer plus asynchronous read-ahead within the specified bounds. | |
210 | * | |
211 | * The caller may pre-populate *bpp if it already has the requested buffer | |
212 | * in-hand, else must set *bpp to NULL. Note that the cluster_read() inline | |
213 | * sets *bpp to NULL and then calls cluster_readx() for compatibility. | |
364c022c MD |
214 | * |
215 | * filesize - read-ahead @ blksize will not cross this boundary | |
216 | * loffset - loffset for returned *bpp | |
217 | * blksize - blocksize for returned *bpp and read-ahead bps | |
218 | * minreq - minimum (not a hard minimum) in bytes, typically reflects | |
219 | * a higher level uio resid. | |
220 | * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
221 | * bpp - return buffer (*bpp) for (loffset,blksize) | |
984263bc MD |
222 | */ |
223 | int | |
9c93755a MD |
224 | cluster_readx(struct vnode *vp, off_t filesize, off_t loffset, int blksize, |
225 | int bflags, size_t minreq, size_t maxreq, | |
226 | struct buf **bpp) | |
984263bc MD |
227 | { |
228 | struct buf *bp, *rbp, *reqbp; | |
54078292 MD |
229 | off_t origoffset; |
230 | off_t doffset; | |
231 | int error; | |
984263bc | 232 | int i; |
364c022c MD |
233 | int maxra; |
234 | int maxrbuild; | |
cb1fa82f | 235 | int sr; |
d32579c3 | 236 | int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0; |
984263bc | 237 | |
cb1fa82f | 238 | sr = 0; |
984263bc MD |
239 | |
240 | /* | |
364c022c MD |
241 | * Calculate the desired read-ahead in blksize'd blocks (maxra). |
242 | * To do this we calculate maxreq. | |
6b84c93e | 243 | * |
364c022c MD |
244 | * maxreq typically starts out as a sequential heuristic. If the |
245 | * high level uio/resid is bigger (minreq), we pop maxreq up to | |
246 | * minreq. This represents the case where random I/O is being | |
247 | * performed by userland issuing big read()'s.
6b84c93e | 248 | * |
364c022c MD |
249 | * Then we limit maxreq to max_readahead to ensure it is a reasonable |
250 | * value. | |
251 | * | |
b28ad496 | 252 | * Finally we must ensure that (loffset + maxreq) does not cross the |
364c022c MD |
253 | * boundary (filesize) for the current blocksize. If we allowed it |
254 | * to cross we could end up with buffers past the boundary with the | |
255 | * wrong block size (HAMMER large-data areas use mixed block sizes). | |
b28ad496 | 256 | * minreq is also absolutely limited to filesize. |
984263bc | 257 | */ |
364c022c MD |
258 | if (maxreq < minreq) |
259 | maxreq = minreq; | |
b28ad496 MD |
260 | /* minreq not used beyond this point */ |
261 | ||
364c022c MD |
262 | if (maxreq > max_readahead) { |
263 | maxreq = max_readahead; | |
264 | if (maxreq > 16 * 1024 * 1024) | |
265 | maxreq = 16 * 1024 * 1024; | |
266 | } | |
267 | if (maxreq < blksize) | |
268 | maxreq = blksize; | |
269 | if (loffset + maxreq > filesize) { | |
270 | if (loffset > filesize) | |
271 | maxreq = 0; | |
272 | else | |
273 | maxreq = filesize - loffset; | |
274 | } | |
275 | ||
276 | maxra = (int)(maxreq / blksize); | |
984263bc MD |
277 | |
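	/*
	 * (Editorial aside, illustrative numbers: with blksize = 16KB, a
	 * heuristic maxreq of 512KB, a caller resid (minreq) of 1MB and
	 * the default 2MB max_readahead, far from EOF, maxreq is popped
	 * up to 1MB and maxra becomes 64 blocks.)
	 */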
278 | /* | |
ae8e83e6 | 279 | * Get the requested block. |
984263bc | 280 | */ |
54341a3b MD |
281 | if (*bpp) |
282 | reqbp = bp = *bpp; | |
283 | else | |
d32579c3 | 284 | *bpp = reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0); |
54078292 | 285 | origoffset = loffset; |
984263bc | 286 | |
364c022c MD |
287 | /* |
288 | * Calculate the maximum cluster size for a single I/O, used | |
289 | * by cluster_rbuild(). | |
290 | */ | |
291 | maxrbuild = vmaxiosize(vp) / blksize; | |
292 | ||
984263bc | 293 | /* |
d9a07a60 | 294 | * If it is in the cache, then check to see if the reads have been |
984263bc MD |
295 | * sequential. If they have, then try some read-ahead, otherwise |
296 | * back off on prospective read-aheads.
297 | */ | |
298 | if (bp->b_flags & B_CACHE) { | |
6b84c93e MD |
299 | /* |
300 | * Not sequential, do not do any read-ahead | |
301 | */ | |
364c022c | 302 | if (maxra <= 1) |
984263bc | 303 | return 0; |
6b84c93e MD |
304 | |
305 | /* | |
306 | * No read-ahead mark, do not do any read-ahead | |
307 | * yet. | |
308 | */ | |
309 | if ((bp->b_flags & B_RAM) == 0) | |
984263bc | 310 | return 0; |
b1c20cfa | 311 | |
6b84c93e MD |
312 | /* |
313 | * We hit a read-ahead-mark, figure out how much read-ahead | |
314 | * to do (maxra) and where to start (loffset). | |
315 | * | |
cb1fa82f MD |
316 | * Typically the way this works is that B_RAM is set in the |
317 | * middle of the cluster and triggers an overlapping | |
318 | * read-ahead of 1/2 a cluster more blocks. This ensures | |
319 | * that the cluster read-ahead scales with the read-ahead | |
320 | * count and is thus better able to absorb the caller's
321 | * latency. | |
6b84c93e | 322 | * |
cb1fa82f MD |
323 | * Estimate where the next unread block will be by assuming |
324 | * that the B_RAM's are placed at the half-way point. | |
6b84c93e MD |
325 | */ |
326 | bp->b_flags &= ~B_RAM; | |
327 | ||
cb1fa82f MD |
328 | i = maxra / 2; |
329 | rbp = findblk(vp, loffset + i * blksize, FINDBLK_TEST); | |
330 | if (rbp == NULL || (rbp->b_flags & B_CACHE) == 0) { | |
331 | while (i) { | |
332 | --i; | |
333 | rbp = findblk(vp, loffset + i * blksize, | |
334 | FINDBLK_TEST); | |
335 | if (rbp) { | |
336 | ++i; | |
337 | break; | |
338 | } | |
339 | } | |
340 | } else { | |
341 | while (i < maxra) { | |
342 | rbp = findblk(vp, loffset + i * blksize, | |
343 | FINDBLK_TEST); | |
344 | if (rbp == NULL) | |
345 | break; | |
346 | ++i; | |
984263bc | 347 | } |
984263bc | 348 | } |
364c022c MD |
349 | |
350 | /* | |
351 | * We got everything or everything is in the cache, no | |
352 | * point continuing. | |
353 | */ | |
6b84c93e MD |
354 | if (i >= maxra) |
355 | return 0; | |
616dd1e9 MD |
356 | |
357 | /* | |
358 | * Calculate where to start the read-ahead and how much | |
359 | * to do. Generally speaking we want to read-ahead by | |
360 | * (maxra) when we've found a read-ahead mark. We do | |
361 | * not want to reduce maxra here as it will cause | |
362 | * successive read-ahead I/O's to be smaller and smaller. | |
cf83ee2c MD |
363 | * |
364 | * However, we have to make sure we don't break the | |
365 | * filesize limitation for the clustered operation. | |
616dd1e9 | 366 | */ |
6b84c93e | 367 | loffset += i * blksize; |
984263bc | 368 | reqbp = bp = NULL; |
cf83ee2c MD |
369 | |
370 | if (loffset >= filesize) | |
371 | return 0; | |
372 | if (loffset + maxra * blksize > filesize) { | |
373 | maxreq = filesize - loffset; | |
374 | maxra = (int)(maxreq / blksize); | |
375 | } | |
cb1fa82f MD |
376 | |
377 | /* | |
378 | * Set RAM on first read-ahead block since we still have | |
379 | * approximately maxra/2 blocks ahead of us that are already
380 | * cached or in-progress. | |
381 | */ | |
382 | sr = 1; | |
984263bc | 383 | } else { |
cb1fa82f MD |
384 | /* |
385 | * Start block is not valid, we will want to do a | |
386 | * full read-ahead. | |
387 | */ | |
4d8329e1 | 388 | __debugvar off_t firstread = bp->b_loffset; |
54078292 | 389 | int nblks; |
984263bc | 390 | |
ae8e83e6 MD |
391 | /* |
392 | * Set-up synchronous read for bp. | |
393 | */ | |
394 | bp->b_cmd = BUF_CMD_READ; | |
395 | bp->b_bio1.bio_done = biodone_sync; | |
396 | bp->b_bio1.bio_flags |= BIO_SYNC; | |
397 | ||
81b5c339 MD |
398 | KASSERT(firstread != NOOFFSET, |
399 | ("cluster_read: no buffer offset")); | |
54078292 | 400 | |
cb1fa82f MD |
401 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); |
402 | ||
364c022c | 403 | /* |
cb1fa82f | 404 | * Set RAM half-way through the full-cluster. |
364c022c | 405 | */ |
cb1fa82f | 406 | sr = (maxra + 1) / 2; |
364c022c MD |
407 | |
408 | if (nblks > 1) { | |
409 | int burstbytes; | |
984263bc | 410 | |
e92ca23a MD |
411 | error = VOP_BMAP(vp, loffset, &doffset, |
412 | &burstbytes, NULL, BUF_CMD_READ); | |
984263bc MD |
413 | if (error) |
414 | goto single_block_read; | |
364c022c MD |
415 | if (nblks > burstbytes / blksize) |
416 | nblks = burstbytes / blksize; | |
54078292 | 417 | if (doffset == NOOFFSET) |
984263bc | 418 | goto single_block_read; |
364c022c | 419 | if (nblks <= 1) |
984263bc | 420 | goto single_block_read; |
984263bc | 421 | |
54078292 | 422 | bp = cluster_rbuild(vp, filesize, loffset, |
cb1fa82f | 423 | doffset, blksize, nblks, bp, &sr); |
54078292 | 424 | loffset += bp->b_bufsize; |
364c022c | 425 | maxra -= bp->b_bufsize / blksize; |
984263bc MD |
426 | } else { |
427 | single_block_read: | |
428 | /* | |
364c022c | 429 | * If it isn't in the cache, then get a chunk from |
984263bc MD |
430 | * disk if sequential, otherwise just get the block. |
431 | */ | |
e92ca23a | 432 | loffset += blksize; |
364c022c | 433 | --maxra; |
984263bc MD |
434 | } |
435 | } | |
436 | ||
984263bc | 437 | /* |
ae8e83e6 MD |
438 | * If B_CACHE was not set issue bp. bp will either be an |
439 | * asynchronous cluster buf or a synchronous single-buf. | |
440 | * If it is a single buf it will be the same as reqbp. | |
441 | * | |
442 | * NOTE: Once an async cluster buf is issued bp becomes invalid. | |
984263bc MD |
443 | */ |
444 | if (bp) { | |
445 | #if defined(CLUSTERDEBUG) | |
446 | if (rcluster) | |
364c022c MD |
447 | kprintf("S(%012jx,%d,%d)\n", |
448 | (intmax_t)bp->b_loffset, bp->b_bcount, maxra); | |
984263bc | 449 | #endif |
10f3fee5 MD |
450 | if ((bp->b_flags & B_CLUSTER) == 0) |
451 | vfs_busy_pages(vp, bp); | |
9c93755a MD |
452 | bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
453 | bp->b_flags |= bflags; | |
81b5c339 | 454 | vn_strategy(vp, &bp->b_bio1); |
ae8e83e6 | 455 | /* bp invalid now */ |
dbb11a6e | 456 | bp = NULL; |
984263bc MD |
457 | } |
458 | ||
cb1fa82f MD |
459 | #if defined(CLUSTERDEBUG) |
460 | if (rcluster) | |
461 | kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n", | |
462 | loffset, blksize, maxra, sr); | |
463 | #endif | |
464 | ||
984263bc | 465 | /* |
bfda7080 | 466 | * If we have been doing sequential I/O, then do some read-ahead. |
6b84c93e MD |
467 | * The code above us should have positioned us at the next likely |
468 | * offset. | |
0728eafc MD |
469 | * |
470 | * Only mess with buffers which we can immediately lock. HAMMER | |
471 | * will do device-readahead irrespective of what the blocks | |
472 | * represent. | |
cb1fa82f MD |
473 | * |
474 | * Set B_RAM on the first buffer (the next likely offset needing | |
475 | * read-ahead), under the assumption that there are still | |
476 | * approximately maxra/2 good blocks ahead of us.
984263bc | 477 | */ |
cb1fa82f | 478 | while (maxra > 0) { |
bfda7080 | 479 | int burstbytes; |
364c022c | 480 | int nblks; |
bfda7080 | 481 | |
b77cfc40 | 482 | rbp = getblk(vp, loffset, blksize, |
d32579c3 MD |
483 | GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO, |
484 | 0); | |
cb1fa82f MD |
485 | #if defined(CLUSTERDEBUG) |
486 | if (rcluster) { | |
487 | kprintf("read-ahead %016jx rbp=%p ", | |
488 | loffset, rbp); | |
489 | } | |
490 | #endif | |
b77cfc40 MD |
491 | if (rbp == NULL) |
492 | goto no_read_ahead; | |
bfda7080 | 493 | if ((rbp->b_flags & B_CACHE)) { |
984263bc | 494 | bqrelse(rbp); |
bfda7080 SS |
495 | goto no_read_ahead; |
496 | } | |
497 | ||
ac7ffc8a | 498 | /* |
cb1fa82f MD |
499 | * If BMAP is not supported or has an issue, we still do |
500 | * (maxra) read-ahead, but we do not try to use rbuild. | |
ac7ffc8a | 501 | */ |
cb1fa82f MD |
502 | error = VOP_BMAP(vp, loffset, &doffset, |
503 | &burstbytes, NULL, BUF_CMD_READ); | |
504 | if (error || doffset == NOOFFSET) { | |
505 | nblks = 1; | |
506 | doffset = NOOFFSET; | |
507 | } else { | |
508 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); | |
509 | if (nblks > burstbytes / blksize) | |
510 | nblks = burstbytes / blksize; | |
bfda7080 | 511 | } |
ae8e83e6 | 512 | rbp->b_cmd = BUF_CMD_READ; |
ae8e83e6 | 513 | |
364c022c | 514 | if (nblks > 1) { |
bfda7080 | 515 | rbp = cluster_rbuild(vp, filesize, loffset, |
e92ca23a | 516 | doffset, blksize, |
cb1fa82f | 517 | nblks, rbp, &sr); |
984263bc | 518 | } else { |
bfda7080 | 519 | rbp->b_bio2.bio_offset = doffset; |
cb1fa82f MD |
520 | if (--sr == 0) |
521 | cluster_setram(rbp); | |
bfda7080 | 522 | } |
364c022c | 523 | |
9c93755a MD |
524 | rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
525 | rbp->b_flags |= bflags; | |
10f3fee5 | 526 | |
bfda7080 SS |
527 | if ((rbp->b_flags & B_CLUSTER) == 0) |
528 | vfs_busy_pages(vp, rbp); | |
ae8e83e6 | 529 | BUF_KERNPROC(rbp); |
6b84c93e MD |
530 | loffset += rbp->b_bufsize; |
531 | maxra -= rbp->b_bufsize / blksize; | |
bfda7080 | 532 | vn_strategy(vp, &rbp->b_bio1); |
ae8e83e6 | 533 | /* rbp invalid now */ |
984263bc | 534 | } |
bfda7080 | 535 | |
ae8e83e6 MD |
536 | /* |
537 | * Wait for our original buffer to complete its I/O. reqbp will | |
538 | * be NULL if the original buffer was B_CACHE. We are returning | |
539 | * (*bpp) which is the same as reqbp when reqbp != NULL. | |
540 | */ | |
541 | no_read_ahead: | |
542 | if (reqbp) { | |
543 | KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC); | |
544 | error = biowait(&reqbp->b_bio1, "clurd"); | |
cb1fa82f MD |
545 | } else { |
546 | error = 0; | |
ae8e83e6 MD |
547 | } |
548 | return (error); | |
984263bc MD |
549 | } |
550 | ||
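The cluster_read() compatibility inline referenced in the header comment above lives in a header rather than in this file; an assumed shape (sketch, not a quote of the real declaration):

```c
/*
 * Editorial sketch: cluster_read() zeroes *bpp and forwards to
 * cluster_readx(), per the header comment above.  The exact parameter
 * list is an assumption.
 */
static __inline int
cluster_read(struct vnode *vp, off_t filesize, off_t loffset, int blksize,
	     int bflags, size_t minreq, size_t maxreq, struct buf **bpp)
{
	*bpp = NULL;
	return (cluster_readx(vp, filesize, loffset, blksize, bflags,
			      minreq, maxreq, bpp));
}
```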
dbb11a6e MD |
551 | /* |
552 | * This replaces breadcb(), providing an asynchronous read of the requested | |
553 | * buffer with a callback, plus an asynchronous read-ahead within the | |
554 | * specified bounds. | |
555 | * | |
556 | * The callback must check whether BIO_DONE is set in the bio and issue | |
557 | * bpdone(bp, 0) if it isn't. The callback is responsible for clearing
558 | * BIO_DONE and disposing of the I/O (bqrelse()ing it). | |
559 | * | |
560 | * filesize - read-ahead @ blksize will not cross this boundary | |
561 | * loffset - loffset for returned *bpp | |
562 | * blksize - blocksize for returned *bpp and read-ahead bps | |
563 | * minreq - minimum (not a hard minimum) in bytes, typically reflects | |
564 | * a higher level uio resid. | |
565 | * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
566 | * bpp - return buffer (*bpp) for (loffset,blksize) | |
567 | */ | |
568 | void | |
9c93755a MD |
569 | cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset, int blksize, |
570 | int bflags, size_t minreq, size_t maxreq, | |
571 | void (*func)(struct bio *), void *arg) | |
dbb11a6e MD |
572 | { |
573 | struct buf *bp, *rbp, *reqbp; | |
574 | off_t origoffset; | |
575 | off_t doffset; | |
576 | int i; | |
577 | int maxra; | |
578 | int maxrbuild; | |
cb1fa82f | 579 | int sr; |
d32579c3 | 580 | int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0; |
cb1fa82f MD |
581 | |
582 | sr = 0; | |
dbb11a6e MD |
583 | |
584 | /* | |
585 | * Calculate the desired read-ahead in blksize'd blocks (maxra). | |
586 | * To do this we calculate maxreq. | |
587 | * | |
588 | * maxreq typically starts out as a sequential heuristic. If the | |
589 | * high level uio/resid is bigger (minreq), we pop maxreq up to | |
590 | * minreq. This represents the case where random I/O is being | |
591 | * performed by userland issuing big read()'s.
592 | * | |
593 | * Then we limit maxreq to max_readahead to ensure it is a reasonable | |
594 | * value. | |
595 | * | |
596 | * Finally we must ensure that (loffset + maxreq) does not cross the | |
597 | * boundary (filesize) for the current blocksize. If we allowed it | |
598 | * to cross we could end up with buffers past the boundary with the | |
599 | * wrong block size (HAMMER large-data areas use mixed block sizes). | |
600 | * minreq is also absolutely limited to filesize. | |
601 | */ | |
602 | if (maxreq < minreq) | |
603 | maxreq = minreq; | |
604 | /* minreq not used beyond this point */ | |
605 | ||
606 | if (maxreq > max_readahead) { | |
607 | maxreq = max_readahead; | |
608 | if (maxreq > 16 * 1024 * 1024) | |
609 | maxreq = 16 * 1024 * 1024; | |
610 | } | |
611 | if (maxreq < blksize) | |
612 | maxreq = blksize; | |
613 | if (loffset + maxreq > filesize) { | |
614 | if (loffset > filesize) | |
615 | maxreq = 0; | |
616 | else | |
617 | maxreq = filesize - loffset; | |
618 | } | |
619 | ||
620 | maxra = (int)(maxreq / blksize); | |
621 | ||
622 | /* | |
623 | * Get the requested block. | |
624 | */ | |
d32579c3 | 625 | reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0); |
dbb11a6e MD |
626 | origoffset = loffset; |
627 | ||
628 | /* | |
629 | * Calculate the maximum cluster size for a single I/O, used | |
630 | * by cluster_rbuild(). | |
631 | */ | |
632 | maxrbuild = vmaxiosize(vp) / blksize; | |
633 | ||
634 | /* | |
635 | * If it is in the cache, then check to see if the reads have been
636 | * sequential. If they have, then try some read-ahead, otherwise | |
637 | * back off on prospective read-aheads.
638 | */ | |
639 | if (bp->b_flags & B_CACHE) { | |
640 | /* | |
641 | * Setup for func() call whether we do read-ahead or not. | |
642 | */ | |
643 | bp->b_bio1.bio_caller_info1.ptr = arg; | |
644 | bp->b_bio1.bio_flags |= BIO_DONE; | |
645 | ||
646 | /* | |
647 | * Not sequential, do not do any read-ahead | |
648 | */ | |
649 | if (maxra <= 1) | |
650 | goto no_read_ahead; | |
651 | ||
652 | /* | |
653 | * No read-ahead mark, do not do any read-ahead | |
654 | * yet. | |
655 | */ | |
656 | if ((bp->b_flags & B_RAM) == 0) | |
657 | goto no_read_ahead; | |
658 | bp->b_flags &= ~B_RAM; | |
659 | ||
660 | /* | |
661 | * We hit a read-ahead-mark, figure out how much read-ahead | |
662 | * to do (maxra) and where to start (loffset). | |
663 | * | |
664 | * Shortcut the scan. Typically the way this works is that | |
665 | * we've built up all the blocks in between except for the
666 | * last in previous iterations, so if the second-to-last | |
667 | * block is present we just skip ahead to it. | |
668 | * | |
669 | * This algorithm has O(1) cpu in the steady state no | |
670 | * matter how large maxra is. | |
671 | */ | |
672 | if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST)) | |
673 | i = maxra - 1; | |
674 | else | |
675 | i = 1; | |
676 | while (i < maxra) { | |
677 | if (findblk(vp, loffset + i * blksize, | |
678 | FINDBLK_TEST) == NULL) { | |
679 | break; | |
680 | } | |
681 | ++i; | |
682 | } | |
683 | ||
684 | /* | |
685 | * We got everything or everything is in the cache, no | |
686 | * point continuing. | |
687 | */ | |
688 | if (i >= maxra) | |
689 | goto no_read_ahead; | |
690 | ||
691 | /* | |
692 | * Calculate where to start the read-ahead and how much | |
693 | * to do. Generally speaking we want to read-ahead by | |
694 | * (maxra) when we've found a read-ahead mark. We do | |
695 | * not want to reduce maxra here as it will cause | |
696 | * successive read-ahead I/O's to be smaller and smaller. | |
697 | * | |
698 | * However, we have to make sure we don't break the | |
699 | * filesize limitation for the clustered operation. | |
700 | */ | |
701 | loffset += i * blksize; | |
702 | bp = NULL; | |
703 | /* leave reqbp intact to force function callback */ | |
704 | ||
705 | if (loffset >= filesize) | |
706 | goto no_read_ahead; | |
707 | if (loffset + maxra * blksize > filesize) { | |
708 | maxreq = filesize - loffset; | |
709 | maxra = (int)(maxreq / blksize); | |
710 | } | |
cb1fa82f | 711 | sr = 1; |
dbb11a6e | 712 | } else { |
cb1fa82f MD |
713 | /* |
714 | * bp is not valid, no prior cluster in progress so get a | |
715 | * full cluster read-ahead going. | |
716 | */ | |
dbb11a6e MD |
717 | __debugvar off_t firstread = bp->b_loffset; |
718 | int nblks; | |
cb1fa82f | 719 | int error; |
dbb11a6e MD |
720 | |
721 | /* | |
722 | * Set-up synchronous read for bp. | |
723 | */ | |
9c93755a MD |
724 | bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA); |
725 | bp->b_flags |= bflags; | |
dbb11a6e MD |
726 | bp->b_cmd = BUF_CMD_READ; |
727 | bp->b_bio1.bio_done = func; | |
728 | bp->b_bio1.bio_caller_info1.ptr = arg; | |
729 | BUF_KERNPROC(bp); | |
730 | reqbp = NULL; /* don't func() reqbp, it's running async */ | |
731 | ||
732 | KASSERT(firstread != NOOFFSET, | |
733 | ("cluster_read: no buffer offset")); | |
734 | ||
735 | /* | |
736 | * nblks is our cluster_rbuild request size, limited | |
737 | * primarily by the device. | |
738 | */ | |
cb1fa82f MD |
739 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); |
740 | ||
741 | /* | |
742 | * Set RAM half-way through the full-cluster. | |
743 | */ | |
744 | sr = (maxra + 1) / 2; | |
dbb11a6e MD |
745 | |
746 | if (nblks > 1) { | |
747 | int burstbytes; | |
748 | ||
cb1fa82f MD |
749 | error = VOP_BMAP(vp, loffset, &doffset, |
750 | &burstbytes, NULL, BUF_CMD_READ); | |
751 | if (error) | |
dbb11a6e MD |
752 | goto single_block_read; |
753 | if (nblks > burstbytes / blksize) | |
754 | nblks = burstbytes / blksize; | |
755 | if (doffset == NOOFFSET) | |
756 | goto single_block_read; | |
757 | if (nblks <= 1) | |
758 | goto single_block_read; | |
759 | ||
760 | bp = cluster_rbuild(vp, filesize, loffset, | |
cb1fa82f | 761 | doffset, blksize, nblks, bp, &sr); |
dbb11a6e MD |
762 | loffset += bp->b_bufsize; |
763 | maxra -= bp->b_bufsize / blksize; | |
764 | } else { | |
765 | single_block_read: | |
766 | /* | |
767 | * If it isn't in the cache, then get a chunk from | |
768 | * disk if sequential, otherwise just get the block. | |
769 | */ | |
dbb11a6e MD |
770 | loffset += blksize; |
771 | --maxra; | |
772 | } | |
773 | } | |
774 | ||
775 | /* | |
776 | * If bp != NULL then B_CACHE was *NOT* set and bp must be issued. | |
777 | * bp will either be an asynchronous cluster buf or an asynchronous | |
778 | * single-buf. | |
779 | * | |
780 | * NOTE: Once an async cluster buf is issued bp becomes invalid. | |
781 | */ | |
782 | if (bp) { | |
783 | #if defined(CLUSTERDEBUG) | |
784 | if (rcluster) | |
785 | kprintf("S(%012jx,%d,%d)\n", | |
786 | (intmax_t)bp->b_loffset, bp->b_bcount, maxra); | |
787 | #endif | |
788 | if ((bp->b_flags & B_CLUSTER) == 0) | |
789 | vfs_busy_pages(vp, bp); | |
9c93755a MD |
790 | bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
791 | bp->b_flags |= bflags; | |
dbb11a6e MD |
792 | vn_strategy(vp, &bp->b_bio1); |
793 | /* bp invalid now */ | |
794 | bp = NULL; | |
795 | } | |
796 | ||
cb1fa82f MD |
797 | #if defined(CLUSTERDEBUG) |
798 | if (rcluster) | |
799 | kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n", | |
800 | loffset, blksize, maxra, sr); | |
801 | #endif | |
802 | ||
dbb11a6e MD |
803 | /* |
804 | * If we have been doing sequential I/O, then do some read-ahead. | |
805 | * The code above us should have positioned us at the next likely | |
806 | * offset. | |
807 | * | |
808 | * Only mess with buffers which we can immediately lock. HAMMER | |
809 | * will do device-readahead irrespective of what the blocks | |
810 | * represent. | |
811 | */ | |
812 | while (maxra > 0) { | |
813 | int burstbytes; | |
cb1fa82f | 814 | int error; |
dbb11a6e MD |
815 | int nblks; |
816 | ||
817 | rbp = getblk(vp, loffset, blksize, | |
d32579c3 MD |
818 | GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO, |
819 | 0); | |
dbb11a6e MD |
820 | if (rbp == NULL) |
821 | goto no_read_ahead; | |
822 | if ((rbp->b_flags & B_CACHE)) { | |
823 | bqrelse(rbp); | |
824 | goto no_read_ahead; | |
825 | } | |
826 | ||
827 | /* | |
cb1fa82f MD |
828 | * If BMAP is not supported or has an issue, we still do |
829 | * (maxra) read-ahead, but we do not try to use rbuild. | |
dbb11a6e | 830 | */ |
cb1fa82f MD |
831 | error = VOP_BMAP(vp, loffset, &doffset, |
832 | &burstbytes, NULL, BUF_CMD_READ); | |
833 | if (error || doffset == NOOFFSET) { | |
834 | nblks = 1; | |
835 | doffset = NOOFFSET; | |
836 | } else { | |
837 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); | |
838 | if (nblks > burstbytes / blksize) | |
839 | nblks = burstbytes / blksize; | |
dbb11a6e | 840 | } |
dbb11a6e | 841 | rbp->b_cmd = BUF_CMD_READ; |
dbb11a6e MD |
842 | |
843 | if (nblks > 1) { | |
844 | rbp = cluster_rbuild(vp, filesize, loffset, | |
845 | doffset, blksize, | |
cb1fa82f | 846 | nblks, rbp, &sr); |
dbb11a6e MD |
847 | } else { |
848 | rbp->b_bio2.bio_offset = doffset; | |
cb1fa82f MD |
849 | if (--sr == 0) |
850 | cluster_setram(rbp); | |
dbb11a6e MD |
851 | } |
852 | ||
9c93755a MD |
853 | rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
854 | rbp->b_flags |= bflags; | |
dbb11a6e MD |
855 | |
856 | if ((rbp->b_flags & B_CLUSTER) == 0) | |
857 | vfs_busy_pages(vp, rbp); | |
858 | BUF_KERNPROC(rbp); | |
859 | loffset += rbp->b_bufsize; | |
860 | maxra -= rbp->b_bufsize / blksize; | |
861 | vn_strategy(vp, &rbp->b_bio1); | |
862 | /* rbp invalid now */ | |
863 | } | |
864 | ||
865 | /* | |
866 | * If reqbp is non-NULL it had B_CACHE set and we issue the | |
867 | * function callback synchronously. | |
868 | * | |
869 | * Note that we may start additional asynchronous I/O before doing | |
870 | * the func() callback for the B_CACHE case.
871 | */ | |
872 | no_read_ahead: | |
873 | if (reqbp) | |
874 | func(&reqbp->b_bio1); | |
875 | } | |
876 | ||
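A minimal completion callback satisfying the contract in the cluster_readcb() header comment could look like the hypothetical consumer below (example_read_done and the consumption step are assumptions, not code from this file):

```c
/*
 * Editorial sketch.  Finish the I/O if BIO_DONE is not already set,
 * clear BIO_DONE, then dispose of the buffer with bqrelse(), exactly
 * as the header comment requires.
 */
static void
example_read_done(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;

	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(bp, 0);
	bio->bio_flags &= ~BIO_DONE;
	/* ... consume bp->b_data, check B_ERROR ... */
	bqrelse(bp);
}
```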
984263bc MD |
877 | /* |
878 | * If blocks are contiguous on disk, use this to provide clustered | |
879 | * read ahead. We will read as many blocks as possible sequentially | |
880 | * and then parcel them up into logical blocks in the buffer hash table. | |
ae8e83e6 MD |
881 | * |
882 | * This function either returns a cluster buf or it returns fbp. fbp is | |
883 | * already expected to be set up as a synchronous or asynchronous request. | |
884 | * | |
885 | * If a cluster buf is returned it will always be async. | |
cb1fa82f MD |
886 | * |
887 | * (*srp) counts down original blocks to determine where B_RAM should be set. | |
888 | * Set B_RAM when *srp drops to 0. If (*srp) starts at 0, B_RAM will not be | |
889 | * set on any buffer. Make sure B_RAM is cleared on any other buffers to | |
890 | * prevent degenerate read-aheads from being generated. | |
984263bc MD |
891 | */ |
892 | static struct buf * | |
ae8e83e6 | 893 | cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset, |
cb1fa82f | 894 | int blksize, int run, struct buf *fbp, int *srp) |
984263bc MD |
895 | { |
896 | struct buf *bp, *tbp; | |
54078292 MD |
897 | off_t boffset; |
898 | int i, j; | |
2ec4b00d | 899 | int maxiosize = vmaxiosize(vp); |
984263bc | 900 | |
984263bc MD |
901 | /* |
902 | * avoid a division | |
903 | */ | |
e92ca23a | 904 | while (loffset + run * blksize > filesize) { |
984263bc MD |
905 | --run; |
906 | } | |
907 | ||
6260e485 | 908 | tbp = fbp; |
54078292 | 909 | tbp->b_bio2.bio_offset = doffset; |
8158299a | 910 | if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) { |
cb1fa82f MD |
911 | if (--*srp == 0) |
912 | cluster_setram(tbp); | |
913 | else | |
914 | cluster_clrram(tbp); | |
984263bc | 915 | return tbp; |
10f3fee5 | 916 | } |
984263bc | 917 | |
d84f6fa1 MD |
918 | /* |
919 | * Get a pbuf, limit cluster I/O on a per-device basis. If | |
920 | * doing cluster I/O for a file, limit cluster I/O on a | |
921 | * per-mount basis. | |
922 | */ | |
923 | if (vp->v_type == VCHR || vp->v_type == VBLK) | |
924 | bp = trypbuf_kva(&vp->v_pbuf_count); | |
925 | else | |
926 | bp = trypbuf_kva(&vp->v_mount->mnt_pbuf_count); | |
927 | ||
928 | if (bp == NULL) | |
984263bc MD |
929 | return tbp; |
930 | ||
931 | /* | |
932 | * We are synthesizing a buffer out of vm_page_t's, but | |
933 | * if the block size is not page aligned then the starting | |
934 | * address may not be either. Inherit the b_data offset | |
935 | * from the original buffer. | |
936 | */ | |
d84f6fa1 | 937 | bp->b_vp = vp; |
984263bc | 938 | bp->b_data = (char *)((vm_offset_t)bp->b_data | |
d32579c3 MD |
939 | ((vm_offset_t)tbp->b_data & PAGE_MASK)); |
940 | bp->b_flags |= B_CLUSTER | B_VMIO | B_KVABIO; | |
10f3fee5 | 941 | bp->b_cmd = BUF_CMD_READ; |
ae8e83e6 | 942 | bp->b_bio1.bio_done = cluster_callback; /* default to async */ |
81b5c339 MD |
943 | bp->b_bio1.bio_caller_info1.cluster_head = NULL; |
944 | bp->b_bio1.bio_caller_info2.cluster_tail = NULL; | |
54078292 | 945 | bp->b_loffset = loffset; |
e92ca23a | 946 | bp->b_bio2.bio_offset = doffset; |
81b5c339 MD |
947 | KASSERT(bp->b_loffset != NOOFFSET, |
948 | ("cluster_rbuild: no buffer offset")); | |
984263bc | 949 | |
984263bc MD |
950 | bp->b_bcount = 0; |
951 | bp->b_bufsize = 0; | |
54f51aeb | 952 | bp->b_xio.xio_npages = 0; |
984263bc | 953 | |
e92ca23a | 954 | for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) { |
10f3fee5 | 955 | if (i) { |
54f51aeb | 956 | if ((bp->b_xio.xio_npages * PAGE_SIZE) + |
e92ca23a | 957 | round_page(blksize) > maxiosize) { |
984263bc MD |
958 | break; |
959 | } | |
960 | ||
961 | /* | |
962 | * Shortcut some checks and try to avoid buffers that | |
963 | * would block in the lock. The same checks have to | |
964 | * be made again after we officially get the buffer. | |
965 | */ | |
b77cfc40 | 966 | tbp = getblk(vp, loffset + i * blksize, blksize, |
d32579c3 MD |
967 | GETBLK_SZMATCH | |
968 | GETBLK_NOWAIT | | |
969 | GETBLK_KVABIO, | |
970 | 0); | |
b77cfc40 MD |
971 | if (tbp == NULL) |
972 | break; | |
973 | for (j = 0; j < tbp->b_xio.xio_npages; j++) { | |
974 | if (tbp->b_xio.xio_pages[j]->valid) | |
984263bc MD |
975 | break; |
976 | } | |
b77cfc40 MD |
977 | if (j != tbp->b_xio.xio_npages) { |
978 | bqrelse(tbp); | |
979 | break; | |
980 | } | |
984263bc MD |
981 | |
982 | /* | |
983 | * Stop scanning if the buffer is fully valid
984 | * (marked B_CACHE), or locked (may be doing a | |
985 | * background write), or if the buffer is not | |
986 | * VMIO backed. The clustering code can only deal | |
987 | * with VMIO-backed buffers. | |
988 | */ | |
989 | if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || | |
27bc0cb1 MD |
990 | (tbp->b_flags & B_VMIO) == 0 || |
991 | (LIST_FIRST(&tbp->b_dep) != NULL && | |
992 | buf_checkread(tbp)) | |
993 | ) { | |
984263bc MD |
994 | bqrelse(tbp); |
995 | break; | |
996 | } | |
997 | ||
998 | /* | |
999 | * The buffer must be completely invalid in order to | |
1000 | * take part in the cluster. If it is partially valid | |
1001 | * then we stop. | |
1002 | */ | |
54f51aeb HP |
1003 | for (j = 0; j < tbp->b_xio.xio_npages; j++) {
1004 | if (tbp->b_xio.xio_pages[j]->valid) | |
984263bc MD |
1005 | break; |
1006 | } | |
54f51aeb | 1007 | if (j != tbp->b_xio.xio_npages) { |
984263bc MD |
1008 | bqrelse(tbp); |
1009 | break; | |
1010 | } | |
1011 | ||
b86460bf MD |
1012 | /* |
1013 | * Depress the priority of buffers not explicitly | |
1014 | * requested. | |
1015 | */ | |
e92ca23a | 1016 | /* tbp->b_flags |= B_AGE; */ |
b86460bf | 1017 | |
984263bc | 1018 | /* |
984263bc MD |
1019 | * Set the block number if it isn't set, otherwise |
1020 | * if it is make sure it matches the block number we | |
1021 | * expect. | |
1022 | */ | |
54078292 MD |
1023 | if (tbp->b_bio2.bio_offset == NOOFFSET) { |
1024 | tbp->b_bio2.bio_offset = boffset; | |
1025 | } else if (tbp->b_bio2.bio_offset != boffset) { | |
984263bc MD |
1026 | brelse(tbp); |
1027 | break; | |
1028 | } | |
1029 | } | |
ae8e83e6 | 1030 | |
cb1fa82f MD |
1031 | /* |
1032 | * Set B_RAM if (*srp) is 1. B_RAM is only set on one buffer | |
1033 | * in the cluster, including potentially the first buffer | |
1034 | * once we start streaming the read-aheads. | |
1035 | */ | |
1036 | if (--*srp == 0) | |
1037 | cluster_setram(tbp); | |
1038 | else | |
1039 | cluster_clrram(tbp); | |
1040 | ||
984263bc | 1041 | /* |
ae8e83e6 MD |
1042 | * The passed-in tbp (i == 0) will already be set up for |
1043 | * async or sync operation. All other tbp's acquired in
1044 | * our loop are set up for async operation. | |
984263bc | 1045 | */ |
10f3fee5 | 1046 | tbp->b_cmd = BUF_CMD_READ; |
984263bc | 1047 | BUF_KERNPROC(tbp); |
81b5c339 | 1048 | cluster_append(&bp->b_bio1, tbp); |
54078292 | 1049 | for (j = 0; j < tbp->b_xio.xio_npages; ++j) { |
984263bc | 1050 | vm_page_t m; |
b12defdc | 1051 | |
54f51aeb | 1052 | m = tbp->b_xio.xio_pages[j]; |
b12defdc | 1053 | vm_page_busy_wait(m, FALSE, "clurpg"); |
984263bc | 1054 | vm_page_io_start(m); |
b12defdc | 1055 | vm_page_wakeup(m); |
984263bc | 1056 | vm_object_pip_add(m->object, 1); |
54f51aeb | 1057 | if ((bp->b_xio.xio_npages == 0) || |
c1f5cf51 | 1058 | (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) { |
54f51aeb HP |
1059 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
1060 | bp->b_xio.xio_npages++; | |
984263bc | 1061 | } |
ca88a24a | 1062 | if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) { |
54f51aeb | 1063 | tbp->b_xio.xio_pages[j] = bogus_page; |
ca88a24a MD |
1064 | tbp->b_flags |= B_HASBOGUS; |
1065 | } | |
984263bc MD |
1066 | } |
1067 | /* | |
1068 | * XXX shouldn't this be += size for both, like in | |
1069 | * cluster_wbuild()? | |
1070 | * | |
1071 | * Don't inherit tbp->b_bufsize as it may be larger due to | |
1072 | * a non-page-aligned size. Instead just aggregate using | |
1073 | * 'size'. | |
1074 | */ | |
e92ca23a MD |
1075 | if (tbp->b_bcount != blksize) |
1076 | kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize); | |
1077 | if (tbp->b_bufsize != blksize) | |
1078 | kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize); | |
1079 | bp->b_bcount += blksize; | |
1080 | bp->b_bufsize += blksize; | |
984263bc MD |
1081 | } |
1082 | ||
1083 | /* | |
1084 | * Fully valid pages in the cluster are already good and do not need | |
1085 | * to be re-read from disk. Replace the page with bogus_page.
1086 | */ | |
54f51aeb HP |
1087 | for (j = 0; j < bp->b_xio.xio_npages; j++) { |
1088 | if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) == | |
984263bc | 1089 | VM_PAGE_BITS_ALL) { |
54f51aeb | 1090 | bp->b_xio.xio_pages[j] = bogus_page; |
ca88a24a | 1091 | bp->b_flags |= B_HASBOGUS; |
984263bc MD |
1092 | } |
1093 | } | |
312dcd01 | 1094 | if (bp->b_bufsize > bp->b_kvasize) { |
54078292 | 1095 | panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)", |
984263bc | 1096 | bp->b_bufsize, bp->b_kvasize); |
312dcd01 | 1097 | } |
d32579c3 MD |
1098 | pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data), |
1099 | (vm_page_t *)bp->b_xio.xio_pages, | |
1100 | bp->b_xio.xio_npages); | |
ae8e83e6 | 1101 | BUF_KERNPROC(bp); |
984263bc MD |
1102 | return (bp); |
1103 | } | |
1104 | ||
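Worked example of the (*srp) countdown described in the header comment (editorial aside): entering cluster_rbuild() with *srp == 3 for a five-block cluster, the decrement reaches zero on the third component buffer, so B_RAM is set there and explicitly cleared on the other four; entering with *srp == 0, the counter only goes negative and B_RAM is never placed.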
1105 | /* | |
1106 | * Cleanup after a clustered read or write. | |
1107 | * This is complicated by the fact that any of the buffers might have | |
1108 | * extra memory (if there were no empty buffer headers at allocbuf time) | |
1109 | * that we will need to shift around. | |
81b5c339 MD |
1110 | * |
1111 | * The returned bio is &bp->b_bio1 | |
984263bc | 1112 | */ |
59b728a7 | 1113 | static void |
81b5c339 | 1114 | cluster_callback(struct bio *bio) |
984263bc | 1115 | { |
81b5c339 MD |
1116 | struct buf *bp = bio->bio_buf; |
1117 | struct buf *tbp; | |
c3c895a6 | 1118 | struct buf *next; |
d84f6fa1 | 1119 | struct vnode *vp; |
984263bc | 1120 | int error = 0; |
c3c895a6 | 1121 | int bpflags; |
984263bc MD |
1122 | |
1123 | /* | |
9a71d53f MD |
1124 | * Must propagate errors to all the components. A short read (EOF)
1125 | * is a critical error. | |
984263bc | 1126 | */ |
9a71d53f | 1127 | if (bp->b_flags & B_ERROR) { |
984263bc | 1128 | error = bp->b_error; |
9a71d53f MD |
1129 | } else if (bp->b_bcount != bp->b_bufsize) { |
1130 | panic("cluster_callback: unexpected EOF on cluster %p!", bio); | |
1131 | } | |
984263bc | 1132 | |
d32579c3 MD |
1133 | pmap_qremove_noinval(trunc_page((vm_offset_t) bp->b_data), |
1134 | bp->b_xio.xio_npages); | |
c3c895a6 MD |
1135 | |
1136 | /* | |
1137 | * Retrieve the cluster head and dispose of the cluster buffer. | |
1138 | * The vp is only valid while we hold one or more cluster elements,
1139 | * so we have to do this before disposing of them. | |
1140 | */ | |
1141 | tbp = bio->bio_caller_info1.cluster_head; | |
1142 | bio->bio_caller_info1.cluster_head = NULL; | |
1143 | bpflags = bp->b_flags; | |
1144 | vp = bp->b_vp; | |
1145 | bp->b_vp = NULL; | |
1146 | ||
1147 | if (vp->v_type == VCHR || vp->v_type == VBLK) | |
1148 | relpbuf(bp, &vp->v_pbuf_count); | |
1149 | else | |
1150 | relpbuf(bp, &vp->v_mount->mnt_pbuf_count); | |
1151 | bp = NULL; /* SAFETY */ | |
1152 | ||
984263bc MD |
1153 | /* |
1154 | * Move memory from the large cluster buffer into the component | |
81b5c339 MD |
1155 | * buffers and mark IO as done on these. Since the memory map |
1156 | * is the same, no actual copying is required. | |
c3c895a6 MD |
1157 | * |
1158 | * (And we already disposed of the larger cluster buffer) | |
984263bc | 1159 | */ |
c3c895a6 MD |
1160 | while (tbp) { |
1161 | next = tbp->b_cluster_next; | |
984263bc | 1162 | if (error) { |
3b2afb67 | 1163 | tbp->b_flags |= B_ERROR | B_IOISSUED; |
984263bc MD |
1164 | tbp->b_error = error; |
1165 | } else { | |
1166 | tbp->b_dirtyoff = tbp->b_dirtyend = 0; | |
9c93755a MD |
1167 | tbp->b_flags &= ~(B_ERROR | B_INVAL); |
1168 | if (tbp->b_cmd == BUF_CMD_READ) { | |
1169 | tbp->b_flags = (tbp->b_flags & ~B_NOTMETA) | | |
c3c895a6 | 1170 | (bpflags & B_NOTMETA); |
9c93755a | 1171 | } |
3b2afb67 | 1172 | tbp->b_flags |= B_IOISSUED; |
c3c895a6 | 1173 | |
984263bc MD |
1174 | /* |
1175 | * XXX the bdwrite()/bqrelse() issued during | |
1176 | * cluster building clears B_RELBUF (see bqrelse() | |
1177 | * comment). If direct I/O was specified, we have | |
1178 | * to restore it here to allow the buffer and VM | |
1179 | * to be freed. | |
1180 | */ | |
1181 | if (tbp->b_flags & B_DIRECT) | |
1182 | tbp->b_flags |= B_RELBUF; | |
ffd3e597 MD |
1183 | |
1184 | /* | |
1185 | * XXX I think biodone() below will do this, but do | |
1186 | * it here anyway for consistency. | |
1187 | */ | |
1188 | if (tbp->b_cmd == BUF_CMD_WRITE) | |
1189 | bundirty(tbp); | |
984263bc | 1190 | } |
81b5c339 | 1191 | biodone(&tbp->b_bio1); |
c3c895a6 | 1192 | tbp = next; |
984263bc | 1193 | } |
984263bc MD |
1194 | } |
1195 | ||
1196 | /* | |
504ea70e | 1197 | * Implement modified write build for cluster. |
984263bc | 1198 | * |
504ea70e MD |
1199 | * write_behind = 0 write behind disabled |
1200 | * write_behind = 1 write behind normal (default) | |
1201 | * write_behind = 2 write behind backed-off | |
984263bc | 1202 | * |
504ea70e MD |
1203 | * In addition, write_behind is only activated for files that have |
1204 | * grown past a certain size (default 10MB). Otherwise temporary files | |
1205 | * wind up generating a lot of unnecessary disk I/O. | |
984263bc | 1206 | */ |
984263bc | 1207 | static __inline int |
e92ca23a | 1208 | cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len) |
984263bc MD |
1209 | { |
1210 | int r = 0; | |
1211 | ||
1212 | switch(write_behind) { | |
1213 | case 2: | |
54078292 | 1214 | if (start_loffset < len) |
984263bc | 1215 | break; |
54078292 | 1216 | start_loffset -= len; |
984263bc MD |
1217 | /* fall through */ |
1218 | case 1: | |
504ea70e MD |
1219 | if (vp->v_filesize >= write_behind_minfilesize) { |
1220 | r = cluster_wbuild(vp, NULL, blksize, | |
1221 | start_loffset, len); | |
1222 | } | |
984263bc MD |
1223 | /* fall through */ |
1224 | default: | |
1225 | /* fall through */ | |
1226 | break; | |
1227 | } | |
1228 | return(r); | |
1229 | } | |
1230 | ||
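Worked example of the modes above (editorial aside): with vfs.write_behind=1 a call covering [start_loffset, start_loffset + len) flushes that window directly; with vfs.write_behind=2 it first backs off by len and flushes [start_loffset - len, start_loffset) instead. In either mode nothing is issued until the file has grown past vfs.write_behind_minfilesize (10MB by default).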
1231 | /* | |
1232 | * Do clustered write for FFS. | |
1233 | * | |
1234 | * Four cases:
1235 | * 1. Write is not sequential (write asynchronously) | |
1236 | * Write is sequential: | |
1237 | * 2. beginning of cluster - begin cluster | |
1238 | * 3. middle of a cluster - add to cluster | |
1239 | * 4. end of a cluster - asynchronously write cluster | |
38a4b308 MD |
1240 | * |
1241 | * WARNING! vnode fields are not locked and must ONLY be used heuristically. | |
984263bc MD |
1242 | */ |
1243 | void | |
e92ca23a | 1244 | cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount) |
984263bc MD |
1245 | { |
1246 | struct vnode *vp; | |
54078292 | 1247 | off_t loffset; |
984263bc | 1248 | int maxclen, cursize; |
984263bc | 1249 | int async; |
38a4b308 MD |
1250 | cluster_cache_t dummy; |
1251 | cluster_cache_t *cc; | |
984263bc MD |
1252 | |
1253 | vp = bp->b_vp; | |
e92ca23a | 1254 | if (vp->v_type == VREG) |
984263bc | 1255 | async = vp->v_mount->mnt_flag & MNT_ASYNC; |
e92ca23a | 1256 | else |
984263bc | 1257 | async = 0; |
54078292 | 1258 | loffset = bp->b_loffset; |
81b5c339 MD |
1259 | KASSERT(bp->b_loffset != NOOFFSET, |
1260 | ("cluster_write: no buffer offset")); | |
984263bc | 1261 | |
38a4b308 MD |
1262 | cc = cluster_getcache(&dummy, vp, loffset); |
1263 | ||
1264 | /* | |
1265 | * Initialize vnode to beginning of file. | |
1266 | */ | |
54078292 | 1267 | if (loffset == 0) |
38a4b308 | 1268 | cc->v_lasta = cc->v_clen = cc->v_cstart = cc->v_lastw = 0; |
984263bc | 1269 | |
cf297f2c | 1270 | if (cc->v_clen == 0 || loffset != cc->v_lastw || |
d9a07a60 | 1271 | (bp->b_bio2.bio_offset != NOOFFSET && |
cf297f2c | 1272 | (bp->b_bio2.bio_offset != cc->v_lasta))) { |
d9a07a60 MD |
1273 | /* |
1274 | * Next block is not logically sequential, or, if physical | |
1275 | * block offsets are available, not physically sequential. | |
1276 | * | |
1277 | * If physical block offsets are not available we only | |
1278 | * get here if we weren't logically sequential. | |
1279 | */ | |
2ec4b00d | 1280 | maxclen = vmaxiosize(vp); |
38a4b308 | 1281 | if (cc->v_clen != 0) { |
984263bc MD |
1282 | /* |
1283 | * Next block is not sequential. | |
1284 | * | |
1285 | * If we are not writing at end of file, the process | |
1286 | * seeked to another point in the file since its last | |
1287 | * write, or we have reached our maximum cluster size, | |
1288 | * then push the previous cluster. Otherwise try | |
1289 | * reallocating to make it sequential. | |
1290 | * | |
1291 | * Change to algorithm: only push previous cluster if | |
1292 | * it was sequential from the point of view of the | |
1293 | * seqcount heuristic, otherwise leave the buffer | |
1294 | * intact so we can potentially optimize the I/O | |
1295 | * later on in the buf_daemon or update daemon | |
1296 | * flush. | |
1297 | */ | |
cf297f2c | 1298 | cursize = cc->v_lastw - cc->v_cstart; |
9de13b88 | 1299 | if (bp->b_loffset + blksize < filesize || |
cf297f2c | 1300 | loffset != cc->v_lastw || |
38a4b308 | 1301 | cc->v_clen <= cursize) { |
984263bc | 1302 | if (!async && seqcount > 0) { |
e92ca23a | 1303 | cluster_wbuild_wb(vp, blksize, |
38a4b308 | 1304 | cc->v_cstart, cursize); |
984263bc MD |
1305 | } |
1306 | } else { | |
1307 | struct buf **bpp, **endbp; | |
1308 | struct cluster_save *buflist; | |
1309 | ||
38a4b308 MD |
1310 | buflist = cluster_collectbufs(cc, vp, |
1311 | bp, blksize); | |
984263bc | 1312 | endbp = &buflist->bs_children |
cf297f2c | 1313 | [buflist->bs_nchildren - 1]; |
984263bc MD |
1314 | if (VOP_REALLOCBLKS(vp, buflist)) { |
1315 | /* | |
1316 | * Failed, push the previous cluster | |
1317 | * if *really* writing sequentially | |
1318 | * in the logical file (seqcount > 1), | |
1319 | * otherwise delay it in the hopes that | |
1320 | * the low level disk driver can | |
1321 | * optimize the write ordering. | |
38a4b308 MD |
1322 | * |
1323 | * NOTE: We do not brelse the last | |
1324 | * element which is bp, and we | |
1325 | * do not return here. | |
984263bc MD |
1326 | */ |
1327 | for (bpp = buflist->bs_children; | |
1328 | bpp < endbp; bpp++) | |
1329 | brelse(*bpp); | |
efda3bd0 | 1330 | kfree(buflist, M_SEGMENT); |
984263bc MD |
1331 | if (seqcount > 1) { |
1332 | cluster_wbuild_wb(vp, | |
38a4b308 | 1333 | blksize, cc->v_cstart, |
984263bc MD |
1334 | cursize); |
1335 | } | |
1336 | } else { | |
1337 | /* | |
1338 | * Succeeded, keep building cluster. | |
1339 | */ | |
1340 | for (bpp = buflist->bs_children; | |
1341 | bpp <= endbp; bpp++) | |
1342 | bdwrite(*bpp); | |
efda3bd0 | 1343 | kfree(buflist, M_SEGMENT); |
cf297f2c MD |
1344 | cc->v_lastw = loffset + blksize; |
1345 | cc->v_lasta = bp->b_bio2.bio_offset + | |
1346 | blksize; | |
38a4b308 | 1347 | cluster_putcache(cc); |
984263bc MD |
1348 | return; |
1349 | } | |
1350 | } | |
1351 | } | |
d9a07a60 | 1352 | |
984263bc MD |
1353 | /* |
1354 | * Consider beginning a cluster. If at end of file, make | |
1355 | * cluster as large as possible, otherwise find size of | |
1356 | * existing cluster. | |
1357 | */ | |
1358 | if ((vp->v_type == VREG) && | |
9de13b88 | 1359 | bp->b_loffset + blksize < filesize && |
54078292 | 1360 | (bp->b_bio2.bio_offset == NOOFFSET) && |
e92ca23a | 1361 | (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) || |
54078292 | 1362 | bp->b_bio2.bio_offset == NOOFFSET)) { |
b642a6c1 | 1363 | bdwrite(bp); |
38a4b308 | 1364 | cc->v_clen = 0; |
cf297f2c MD |
1365 | cc->v_lasta = bp->b_bio2.bio_offset + blksize; |
1366 | cc->v_cstart = loffset; | |
1367 | cc->v_lastw = loffset + blksize; | |
38a4b308 | 1368 | cluster_putcache(cc); |
984263bc MD |
1369 | return; |
1370 | } | |
e92ca23a | 1371 | if (maxclen > blksize) |
cf297f2c | 1372 | cc->v_clen = maxclen; |
54078292 | 1373 | else |
cf297f2c | 1374 | cc->v_clen = blksize; |
38a4b308 | 1375 | if (!async && cc->v_clen == 0) { /* I/O not contiguous */ |
cf297f2c | 1376 | cc->v_cstart = loffset; |
b642a6c1 | 1377 | bdwrite(bp); |
984263bc | 1378 | } else { /* Wait for rest of cluster */ |
38a4b308 | 1379 | cc->v_cstart = loffset; |
984263bc MD |
1380 | bdwrite(bp); |
1381 | } | |
38a4b308 | 1382 | } else if (loffset == cc->v_cstart + cc->v_clen) { |
984263bc MD |
1383 | /* |
1384 | * At end of cluster, write it out if seqcount tells us we | |
1385 | * are operating sequentially, otherwise let the buf or | |
1386 | * update daemon handle it. | |
1387 | */ | |
1388 | bdwrite(bp); | |
1389 | if (seqcount > 1) | |
38a4b308 MD |
1390 | cluster_wbuild_wb(vp, blksize, cc->v_cstart, |
1391 | cc->v_clen + blksize); | |
1392 | cc->v_clen = 0; | |
cf297f2c | 1393 | cc->v_cstart = loffset; |
e91e64c7 | 1394 | } else if (vm_paging_severe() && |
b642a6c1 | 1395 | bp->b_loffset + blksize < filesize) { |
984263bc | 1396 | /* |
b642a6c1 MD |
1397 | * We are low on memory, get it going NOW. However, do not |
1398 | * try to push out a partial block at the end of the file | |
1399 | * as this could lead to extremely non-optimal write activity. | |
984263bc MD |
1400 | */ |
1401 | bawrite(bp); | |
1402 | } else { | |
1403 | /* | |
1404 | * In the middle of a cluster, so just delay the I/O for now. | |
1405 | */ | |
1406 | bdwrite(bp); | |
1407 | } | |
cf297f2c MD |
1408 | cc->v_lastw = loffset + blksize; |
1409 | cc->v_lasta = bp->b_bio2.bio_offset + blksize; | |
38a4b308 | 1410 | cluster_putcache(cc); |
984263bc MD |
1411 | } |
1412 | ||
9de13b88 MD |
1413 | /* |
1414 | * This is the clustered version of bawrite(). It works similarly to | |
1415 | * cluster_write() except I/O on the buffer is guaranteed to occur. | |
1416 | */ | |
1417 | int | |
1418 | cluster_awrite(struct buf *bp) | |
1419 | { | |
1420 | int total; | |
1421 | ||
1422 | /* | |
1423 | * Don't bother if it isn't clusterable. | |
1424 | */ | |
1425 | if ((bp->b_flags & B_CLUSTEROK) == 0 || | |
1426 | bp->b_vp == NULL || | |
1427 | (bp->b_vp->v_flag & VOBJBUF) == 0) { | |
1428 | total = bp->b_bufsize; | |
1429 | bawrite(bp); | |
1430 | return (total); | |
1431 | } | |
1432 | ||
1433 | total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize, | |
1434 | bp->b_loffset, vmaxiosize(bp->b_vp)); | |
d9a07a60 MD |
1435 | |
1436 | /* | |
1437 | * If bp is still non-NULL then cluster_wbuild() did not initiate | |
1438 | * I/O on it and we must do so here to provide the API guarantee. | |
1439 | */ | |
9de13b88 MD |
1440 | if (bp) |
1441 | bawrite(bp); | |
1442 | ||
1443 | return total; | |
1444 | } | |
984263bc MD |
1445 | |
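Since cluster_awrite() guarantees that I/O is initiated, a caller must treat the buffer as consumed; a hypothetical sketch (example_flush is not from this file):

```c
/*
 * Editorial sketch.  Hand the delayed-write buffer to cluster_awrite()
 * and forget it; the return value is the number of bytes for which
 * write I/O was initiated.
 */
static int
example_flush(struct buf *bp)
{
	return (cluster_awrite(bp));	/* consumes bp */
}
```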
1446 | /* | |
1447 | * This is an awful lot like cluster_rbuild...wish they could be combined. | |
1448 | * It scans the range [start_loffset, start_loffset + bytes) for
1449 | * dirty, clusterable delayed-write buffers and writes them out in
1450 | * maximal contiguous runs.
9de13b88 MD |
1451 | * |
1452 | * cluster_wbuild() normally does not guarantee anything. If bpp is | |
1453 | * non-NULL and cluster_wbuild() is able to incorporate it into the | |
1454 | * I/O it will set *bpp to NULL, otherwise it will leave it alone and | |
1455 | * the caller must dispose of *bpp. | |
984263bc | 1456 | */ |
9de13b88 MD |
1457 | static int |
1458 | cluster_wbuild(struct vnode *vp, struct buf **bpp, | |
1459 | int blksize, off_t start_loffset, int bytes) | |
984263bc MD |
1460 | { |
1461 | struct buf *bp, *tbp; | |
e43a034f | 1462 | int i, j; |
984263bc | 1463 | int totalwritten = 0; |
9de13b88 | 1464 | int must_initiate; |
2ec4b00d | 1465 | int maxiosize = vmaxiosize(vp); |
984263bc | 1466 | |
54078292 | 1467 | while (bytes > 0) { |
984263bc | 1468 | /* |
9de13b88 MD |
1469 | * If the buffer matches the passed locked & removed buffer |
1470 | * we used the passed buffer (which might not be B_DELWRI). | |
1471 | * | |
1472 | * Otherwise locate the buffer and determine if it is | |
1473 | * compatible. | |
984263bc | 1474 | */ |
9de13b88 MD |
1475 | if (bpp && (*bpp)->b_loffset == start_loffset) { |
1476 | tbp = *bpp; | |
1477 | *bpp = NULL; | |
1478 | bpp = NULL; | |
1479 | } else { | |
d32579c3 MD |
1480 | tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK | |
1481 | FINDBLK_KVABIO); | |
9de13b88 MD |
1482 | if (tbp == NULL || |
1483 | (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != | |
1484 | B_DELWRI || | |
1485 | (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) { | |
1486 | if (tbp) | |
1487 | BUF_UNLOCK(tbp); | |
1488 | start_loffset += blksize; | |
1489 | bytes -= blksize; | |
1490 | continue; | |
1491 | } | |
1492 | bremfree(tbp); | |
984263bc | 1493 | } |
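                /*
                 * Either way we now hold a locked buffer with no I/O
                 * currently in progress on it.
                 */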
                KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

                /*
                 * Extra memory in the buffer, punt on this buffer.
                 * XXX we could handle this in most cases, but we would
                 * have to push the extra memory down to after our max
                 * possible cluster size and then potentially pull it back
                 * up if the cluster was terminated prematurely--too much
                 * hassle.
                 */
                if ((tbp->b_flags & B_CLUSTEROK) == 0 ||
                    tbp->b_bcount != tbp->b_bufsize ||
                    tbp->b_bcount != blksize ||
                    bytes == blksize) {
                        totalwritten += tbp->b_bufsize;
                        bawrite(tbp);
                        start_loffset += blksize;
                        bytes -= blksize;
                        continue;
                }

                /*
                 * Get a pbuf, limit cluster I/O on a per-device basis.  If
                 * doing cluster I/O for a file, limit cluster I/O on a
                 * per-mount basis.
                 *
                 * HAMMER and other filesystems may attempt to queue a massive
                 * amount of write I/O, and using trypbuf() here easily results
                 * in a situation where the I/O stream becomes non-clustered.
                 */
                if (vp->v_type == VCHR || vp->v_type == VBLK)
                        bp = getpbuf_kva(&vp->v_pbuf_count);
                else
                        bp = getpbuf_kva(&vp->v_mount->mnt_pbuf_count);

                /*
                 * Set up the pbuf.  Track our append point with b_bcount
                 * and b_bufsize.  b_bufsize is not used by the device but
                 * our caller uses it to loop clusters and we use it to
                 * detect a premature EOF on the block device.
                 */
                bp->b_bcount = 0;
                bp->b_bufsize = 0;
                bp->b_xio.xio_npages = 0;
                bp->b_loffset = tbp->b_loffset;
                bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
                bp->b_vp = vp;

                /*
                 * We are synthesizing a buffer out of vm_page_t's, but
                 * if the block size is not page aligned then the starting
                 * address may not be either.  Inherit the b_data offset
                 * from the original buffer.
                 */
                bp->b_data = (char *)((vm_offset_t)bp->b_data |
                                      ((vm_offset_t)tbp->b_data & PAGE_MASK));
                bp->b_flags &= ~(B_ERROR | B_NOTMETA);
                bp->b_flags |= B_CLUSTER | B_BNOCLIP | B_KVABIO |
                               (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT |
                                                B_NOTMETA));
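                /*
                 * Start with an empty cluster list.  Child buffers are
                 * chained onto bio1 via cluster_append() and completed
                 * by cluster_callback() when the cluster I/O finishes.
                 */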
                bp->b_bio1.bio_caller_info1.cluster_head = NULL;
                bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

                /*
                 * From this location in the file, scan forward to see
                 * if there are buffers with adjacent data that need to
                 * be written as well.
                 *
                 * IO *must* be initiated on index 0 at this point
                 * (particularly when called from cluster_awrite()).
                 */
                for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
                        if (i == 0) {
                                must_initiate = 1;
                        } else {
                                /*
                                 * Not first buffer.
                                 */
                                must_initiate = 0;
                                tbp = findblk(vp, start_loffset,
                                              FINDBLK_NBLOCK | FINDBLK_KVABIO);
                                /*
                                 * Buffer not found or could not be locked
                                 * non-blocking.
                                 */
                                if (tbp == NULL)
                                        break;

                                /*
                                 * If it IS in core, but has different
                                 * characteristics, then don't cluster
                                 * with it.
                                 */
                                if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
                                    != (B_DELWRI | B_CLUSTEROK |
                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
                                    (tbp->b_flags & B_LOCKED)
                                ) {
                                        BUF_UNLOCK(tbp);
                                        break;
                                }

                                /*
                                 * Check that the combined cluster
                                 * would make sense with regard to pages
                                 * and would not be too large.
                                 *
                                 * WARNING! buf_checkwrite() must be the last
                                 *          check made.  If it returns 0 then
                                 *          we must initiate the I/O.
                                 */
                                if ((tbp->b_bcount != blksize) ||
                                    ((bp->b_bio2.bio_offset + i) !=
                                      tbp->b_bio2.bio_offset) ||
                                    ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
                                      (maxiosize / PAGE_SIZE)) ||
                                    (LIST_FIRST(&tbp->b_dep) &&
                                     buf_checkwrite(tbp))
                                ) {
                                        BUF_UNLOCK(tbp);
                                        break;
                                }
                                if (LIST_FIRST(&tbp->b_dep))
                                        must_initiate = 1;
                                /*
                                 * Ok, it's passed all the tests,
                                 * so remove it from the free list
                                 * and mark it busy. We will use it.
                                 */
                                bremfree(tbp);
                                KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
                        }

                        /*
                         * If the IO is via the VM then we do some
                         * special VM hackery (yuck).  Since the buffer's
                         * block size may not be page-aligned it is possible
                         * for a page to be shared between two buffers.  We
                         * have to get rid of the duplication when building
                         * the cluster.
                         */
                        if (tbp->b_flags & B_VMIO) {
                                vm_page_t m;

                                /*
                                 * Try to avoid deadlocks with the VM system.
                                 * However, we cannot abort the I/O if
                                 * must_initiate is non-zero.
                                 */
                                if (must_initiate == 0) {
                                        for (j = 0;
                                             j < tbp->b_xio.xio_npages;
                                             ++j) {
                                                m = tbp->b_xio.xio_pages[j];
                                                if (m->busy_count &
                                                    PBUSY_LOCKED) {
                                                        bqrelse(tbp);
                                                        goto finishcluster;
                                                }
                                        }
                                }

                                for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
                                        m = tbp->b_xio.xio_pages[j];
                                        vm_page_busy_wait(m, FALSE, "clurpg");
                                        vm_page_io_start(m);
                                        vm_page_wakeup(m);
                                        vm_object_pip_add(m->object, 1);
                                        if ((bp->b_xio.xio_npages == 0) ||
                                            (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
                                                bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
                                                bp->b_xio.xio_npages++;
                                        }
                                }
                        }
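                        /*
                         * Advance the cluster's append point past the
                         * buffer just absorbed.
                         */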
                        bp->b_bcount += blksize;
                        bp->b_bufsize += blksize;

                        /*
                         * NOTE: see bwrite/bawrite code for why we no longer
                         *       undirty tbp here.
                         *
                         *       bundirty(tbp); REMOVED
                         */
                        tbp->b_flags &= ~B_ERROR;
                        tbp->b_cmd = BUF_CMD_WRITE;
                        BUF_KERNPROC(tbp);
                        cluster_append(&bp->b_bio1, tbp);

                        /*
                         * check for latent dependencies to be handled
                         */
                        if (LIST_FIRST(&tbp->b_dep) != NULL)
                                buf_start(tbp);
                }
        finishcluster:
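                /*
                 * Map the collected page list into the pbuf's KVA.  The
                 * _noinval variant avoids an immediate TLB invalidation;
                 * B_KVABIO is set, so consumers synchronize the mapping
                 * themselves before touching the data.
                 */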
                pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
                                    (vm_page_t *)bp->b_xio.xio_pages,
                                    bp->b_xio.xio_npages);
                if (bp->b_bufsize > bp->b_kvasize) {
                        panic("cluster_wbuild: b_bufsize(%d) "
                              "> b_kvasize(%d)\n",
                              bp->b_bufsize, bp->b_kvasize);
                }
                totalwritten += bp->b_bufsize;
                bp->b_dirtyoff = 0;
                bp->b_dirtyend = bp->b_bufsize;
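                /*
                 * Dispatch the cluster as a single write; cluster_callback()
                 * finishes off the chained child buffers on completion.
                 */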
                bp->b_bio1.bio_done = cluster_callback;
                bp->b_cmd = BUF_CMD_WRITE;

                vfs_busy_pages(vp, bp);
                bsetrunningbufspace(bp, bp->b_bufsize);
                BUF_KERNPROC(bp);
                vn_strategy(vp, &bp->b_bio1);

                bytes -= i;
        }
        return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed-in.
 *
 * Only pre-existing buffers whose block size matches blksize are collected.
 * (This is primarily because HAMMER1 uses varying block sizes and we don't
 * want to override its choices.)
 *
 * This code will not try to collect buffers that it cannot lock, otherwise
 * it might deadlock against SMP-friendly filesystems.
 */
static struct cluster_save *
cluster_collectbufs(cluster_cache_t *cc, struct vnode *vp,
                    struct buf *last_bp, int blksize)
{
        struct cluster_save *buflist;
        struct buf *bp;
        off_t loffset;
        int i, len;
        int j;
        int k;

        len = (int)(cc->v_lastw - cc->v_cstart) / blksize;
        KKASSERT(len > 0);
        buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
                          M_SEGMENT, M_WAITOK);
        buflist->bs_nchildren = 0;
        buflist->bs_children = (struct buf **)(buflist + 1);
        for (loffset = cc->v_cstart, i = 0, j = 0;
             i < len;
             (loffset += blksize), i++) {
                bp = getcacheblk(vp, loffset, last_bp->b_bcount,
                                 GETBLK_SZMATCH | GETBLK_NOWAIT);
                buflist->bs_children[i] = bp;
                if (bp == NULL) {
                        j = i + 1;
                } else if (bp->b_bio2.bio_offset == NOOFFSET) {
                        VOP_BMAP(bp->b_vp, bp->b_loffset,
                                 &bp->b_bio2.bio_offset,
                                 NULL, NULL, BUF_CMD_WRITE);
                }
        }

        /*
         * Get rid of gaps.  Only the contiguous run of buffers trailing
         * the last hole can be clustered, so release everything collected
         * before that run and compact the survivors down to index 0.
         */
        for (k = 0; k < j; ++k) {
                if (buflist->bs_children[k]) {
                        bqrelse(buflist->bs_children[k]);
                        buflist->bs_children[k] = NULL;
                }
        }
        if (j != 0) {
                if (j != i) {
                        bcopy(buflist->bs_children + j,
                              buflist->bs_children + 0,
                              sizeof(buflist->bs_children[0]) * (i - j));
                }
                i -= j;
        }
        buflist->bs_children[i] = bp = last_bp;
        if (bp->b_bio2.bio_offset == NOOFFSET) {
                VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
                         NULL, NULL, BUF_CMD_WRITE);
        }
        buflist->bs_nchildren = i + 1;
        return (buflist);
}

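/*
 * Append tbp to the cluster list hanging off the given bio.
 * bio_caller_info1 holds the list head and bio_caller_info2 the tail;
 * cluster_callback() walks this list to complete the child buffers.
 */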
void
cluster_append(struct bio *bio, struct buf *tbp)
{
        tbp->b_cluster_next = NULL;
        if (bio->bio_caller_info1.cluster_head == NULL) {
                bio->bio_caller_info1.cluster_head = tbp;
                bio->bio_caller_info2.cluster_tail = tbp;
        } else {
                bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
                bio->bio_caller_info2.cluster_tail = tbp;
        }
}

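/*
 * Set the read-ahead marker on the buffer and its first backing page.
 * Hitting a B_RAM/PG_RAM marked buffer tells the cluster read code that
 * the next read-ahead should be issued.
 */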
static
void
cluster_setram(struct buf *bp)
{
        bp->b_flags |= B_RAM;
        if (bp->b_xio.xio_npages)
                vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}

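/*
 * Clear the read-ahead marker from the buffer and its first backing page.
 */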
static
void
cluster_clrram(struct buf *bp)
{
        bp->b_flags &= ~B_RAM;
        if (bp->b_xio.xio_npages)
                vm_page_flag_clear(bp->b_xio.xio_pages[0], PG_RAM);
}