/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>
#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <machine/limits.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
			     int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
			off_t doffset, int blksize, int run,
			struct buf *fbp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
static int max_readahead = 2 * 1024 * 1024;
SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0, "");

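/*
 * bogus_page is substituted for component pages that are already fully
 * valid, so a clustered device read does not overwrite their contents.
 */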
extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * This replaces bread.
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typically
 *		  ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
int
cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
	     int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra;
	int maxrbuild;

	error = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed by userland issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	*bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			return 0;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			return 0;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
		 */
		bp->b_flags &= ~B_RAM;

		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
				    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			return 0;
		maxra -= i;
		loffset += i * blksize;
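		/*
		 * The requested block was already valid in the cache, so
		 * everything from here on is pure read-ahead and nothing
		 * needs to be waited on.
		 */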
		reqbp = bp = NULL;
	} else {
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;

		/*
		 * Set-up synchronous read for bp.
		 */
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = biodone_sync;
		bp->b_bio1.bio_flags |= BIO_SYNC;

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			cluster_setram(bp);
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If B_CACHE was not set issue bp.  bp will either be an
	 * asynchronous cluster buf or a synchronous single-buf.
	 * If it is a single buf it will be the same as reqbp.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		error = 0;
		/* bp invalid now */
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (error == 0 && maxra > 0) {
		int burstbytes;
		int tmp_error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * An error from the read-ahead bmap has nothing to do
		 * with the caller's original request.
		 */
		tmp_error = VOP_BMAP(vp, loffset, &doffset,
				     &burstbytes, NULL, BUF_CMD_READ);
		if (tmp_error || doffset == NOOFFSET) {
			rbp->b_flags |= B_INVAL;
			brelse(rbp);
			rbp = NULL;
			goto no_read_ahead;
		}
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;
		if (nblks > burstbytes / blksize)
			nblks = burstbytes / blksize;

		/*
		 * rbp: async read
		 */
		rbp->b_cmd = BUF_CMD_READ;
		/*rbp->b_flags |= B_AGE*/;
		cluster_setram(rbp);

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp);
		} else {
			rbp->b_bio2.bio_offset = doffset;
		}

#if defined(CLUSTERDEBUG)
		if (rcluster) {
			if (bp) {
				kprintf("A+(%012jx,%d,%jd) "
					"doff=%012jx minr=%zd ra=%d\n",
				    (intmax_t)loffset, rbp->b_bcount,
				    (intmax_t)(loffset - origoffset),
				    (intmax_t)doffset, minreq, maxra);
			} else {
				kprintf("A-(%012jx,%d,%jd) "
					"doff=%012jx minr=%zd ra=%d\n",
				    (intmax_t)rbp->b_loffset, rbp->b_bcount,
				    (intmax_t)(loffset - origoffset),
				    (intmax_t)doffset, minreq, maxra);
			}
		}
#endif
		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * Wait for our original buffer to complete its I/O.  reqbp will
	 * be NULL if the original buffer was B_CACHE.  We are returning
	 * (*bpp) which is the same as reqbp when reqbp != NULL.
	 */
no_read_ahead:
	if (reqbp) {
		KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
		error = biowait(&reqbp->b_bio1, "clurd");
	}
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * This function either returns a cluster buf or it returns fbp.  fbp is
 * already expected to be set up as a synchronous or asynchronous request.
 *
 * If a cluster buf is returned it will always be async.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
	       int blksize, int run, struct buf *fbp)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		return tbp;
	}

	bp = trypbuf_kva(&cluster_pbuf_freecnt);
	if (bp == NULL) {
		return tbp;
	}

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_CLUSTER | B_VMIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;		/* default to async */
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
				     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if (i == 1 || i == (run - 1))
				cluster_setram(tbp);

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * The passed-in tbp (i == 0) will already be set up for
		 * async or sync operation.  All other tbp's acquired in
		 * our loop are set up for async operation.
		 */
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;
			m = tbp->b_xio.xio_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_xio.xio_pages[j] = bogus_page;
		}
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'blksize'.
		 */
		if (tbp->b_bcount != blksize)
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
		if (tbp->b_bufsize != blksize)
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		    bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	BUF_KERNPROC(bp);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
		     bp->b_xio.xio_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 */
	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
		bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR | B_IODEBUG;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR|B_INVAL);
			tbp->b_flags |= B_IODEBUG;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		biodone(&tbp->b_bio1);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 *	cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *		write_behind = 0	write behind disabled
 *		write_behind = 1	write behind normal (default)
 *		write_behind = 2	write behind backed-off
 */

static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

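	/*
	 * write_behind 1 pushes the requested range immediately.
	 * write_behind 2 first backs the range off by one cluster
	 * length (len), so the most recently completed cluster is
	 * not pushed right away.  Any other value disables the push.
	 */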
	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		r = cluster_wbuild(vp, blksize, start_loffset, len);
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (loffset == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

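	/*
	 * v_lastw and v_lasta record the logical and device offsets of the
	 * last buffer handed to cluster_write().  v_cstart is the logical
	 * offset at which the cluster currently being collected began and
	 * v_clen is the maximum additional length it may grow to.
	 */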
	if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
	    bp->b_bio2.bio_offset == NOOFFSET ||
	    (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
		maxclen = vmaxiosize(vp);
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + blksize;
			if (bp->b_loffset + blksize != filesize ||
			    loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp, blksize);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					vp->v_lastw = loffset;
					vp->v_lasta = bp->b_bio2.bio_offset;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize != filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_bio2.bio_offset;
			vp->v_cstart = loffset + blksize;
			vp->v_lastw = loffset;
			return;
		}
		if (maxclen > blksize)
			vp->v_clen = maxclen - blksize;
		else
			vp->v_clen = 0;
		if (!async && vp->v_clen == 0) { /* I/O not contiguous */
			vp->v_cstart = loffset + blksize;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, vp->v_cstart,
					  vp->v_clen + blksize);
		vp->v_clen = 0;
		vp->v_cstart = loffset + blksize;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = loffset;
	vp->v_lasta = bp->b_bio2.bio_offset;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
		if (tbp == NULL ||
		    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI ||
		    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
			if (tbp)
				BUF_UNLOCK(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}
		bremfree(tbp);
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != blksize) ||
		    (bytes == blksize) ||
		    ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_CLUSTER | B_BNOCLIP |
			(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
			if (i != 0) { /* If not the first buffer */
				tbp = findblk(vp, start_loffset,
					      FINDBLK_NBLOCK);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED) ||
				    (LIST_FIRST(&tbp->b_dep) &&
				     buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != blksize) ||
				  ((bp->b_bio2.bio_offset + i) !=
				    tbp->b_bio2.bio_offset) ||
				  ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
				    (maxiosize / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			} /* end of code for non-first buffers only */

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					  (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			bundirty(tbp);
			tbp->b_flags &= ~B_ERROR;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic(
			    "cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bp->b_runningbufspace = bp->b_bufsize;
		if (bp->b_runningbufspace) {
			runningbufspace += bp->b_runningbufspace;
			++runningbufcount;
		}
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);

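		/*
		 * 'i' is the number of bytes accumulated into the cluster
		 * buffer just issued; start_loffset was already advanced
		 * past them inside the scan loop above.
		 */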
		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;

	len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
			  M_SEGMENT, M_WAITOK);
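	/*
	 * The bs_children pointer array lives in the same allocation,
	 * immediately after the cluster_save header.
	 */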
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (loffset = vp->v_cstart, i = 0; i < len; (loffset += blksize), i++) {
		(void) bread(vp, loffset, last_bp->b_bcount, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
				 &bp->b_bio2.bio_offset,
				 NULL, NULL, BUF_CMD_WRITE);
		}
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
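/*
 * Append tbp to a cluster's chain of component buffers.  The chain is
 * a singly-linked list rooted in the cluster bio's caller_info fields.
 */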
void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}

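/*
 * Set the read-ahead mark on a buffer (B_RAM) and, if the buffer has
 * backing pages, on its first page (PG_RAM) as well.
 */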
static
void
cluster_setram (struct buf *bp)
{
	bp->b_flags |= B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}