kernel - Reduce impact of write_behind on small/temporary files
1 /*-
2  * Copyright (c) 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  *      Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *      This product includes software developed by the University of
18  *      California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *      @(#)vfs_cluster.c       8.7 (Berkeley) 2/13/94
36  * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
37  * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
38  */
39
40 #include "opt_debug_cluster.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/buf.h>
47 #include <sys/vnode.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/resourcevar.h>
51 #include <sys/vmmeter.h>
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_page.h>
55 #include <sys/sysctl.h>
56
57 #include <sys/buf2.h>
58 #include <vm/vm_page2.h>
59
60 #include <machine/limits.h>
61
62 #if defined(CLUSTERDEBUG)
63 #include <sys/sysctl.h>
64 static int      rcluster = 0;
65 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
66 #endif
67
68 static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
69
70 static struct cluster_save *
71         cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
72                             int blksize);
73 static struct buf *
74         cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
75                             off_t doffset, int blksize, int run, 
76                             struct buf *fbp);
77 static void cluster_callback (struct bio *);
78 static void cluster_setram (struct buf *);
79 static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
80                             off_t start_loffset, int bytes);
81
82 static int write_behind = 1;
83 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
84     "Cluster write-behind setting");
85 static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
86 SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
87     &write_behind_minfilesize, 0, "Cluster write-behind minimum file size");
88 static int max_readahead = 2 * 1024 * 1024;
89 SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
90     "Limit in bytes for desired cluster read-ahead");
91
92 extern vm_page_t        bogus_page;
93
94 extern int cluster_pbuf_freecnt;
95
96 /*
97  * This replaces bread.
98  *
99  * filesize     - read-ahead @ blksize will not cross this boundary
100  * loffset      - loffset for returned *bpp
101  * blksize      - blocksize for returned *bpp and read-ahead bps
102  * minreq       - minimum (not a hard minimum) in bytes, typically reflects
103  *                a higher level uio resid.
104  * maxreq       - maximum (sequential heuristic) in bytes (highest typically ~2MB)
105  * bpp          - return buffer (*bpp) for (loffset,blksize)
106  */
107 int
108 cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
109              int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
110 {
111         struct buf *bp, *rbp, *reqbp;
112         off_t origoffset;
113         off_t doffset;
114         int error;
115         int i;
116         int maxra;
117         int maxrbuild;
118
119         error = 0;
120
121         /*
122          * Calculate the desired read-ahead in blksize'd blocks (maxra).
123          * To do this we calculate maxreq.
124          *
125          * maxreq typically starts out as a sequential heuristic.  If the
126          * high level uio/resid is bigger (minreq), we pop maxreq up to
127          * minreq.  This represents the case where userland is performing
128          * random I/O by issuing big read()'s.
129          *
130          * Then we limit maxreq to max_readahead to ensure it is a reasonable
131          * value.
132          *
133          * Finally we must ensure that (loffset + maxreq) does not cross the
134          * boundary (filesize) for the current blocksize.  If we allowed it
135          * to cross we could end up with buffers past the boundary with the
136          * wrong block size (HAMMER large-data areas use mixed block sizes).
137          * minreq is also absolutely limited to filesize.
138          */
139         if (maxreq < minreq)
140                 maxreq = minreq;
141         /* minreq not used beyond this point */
142
143         if (maxreq > max_readahead) {
144                 maxreq = max_readahead;
145                 if (maxreq > 16 * 1024 * 1024)
146                         maxreq = 16 * 1024 * 1024;
147         }
148         if (maxreq < blksize)
149                 maxreq = blksize;
150         if (loffset + maxreq > filesize) {
151                 if (loffset > filesize)
152                         maxreq = 0;
153                 else
154                         maxreq = filesize - loffset;
155         }
156
157         maxra = (int)(maxreq / blksize);
158
159         /*
160          * Get the requested block.
161          */
162         if (*bpp)
163                 reqbp = bp = *bpp;
164         else
165                 *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
166         origoffset = loffset;
167
168         /*
169          * Calculate the maximum cluster size for a single I/O, used
170          * by cluster_rbuild().
171          */
172         maxrbuild = vmaxiosize(vp) / blksize;
173
174         /*
175          * if it is in the cache, then check to see if the reads have been
176          * sequential.  If they have, then try some read-ahead, otherwise
177          * back-off on prospective read-aheads.
178          */
179         if (bp->b_flags & B_CACHE) {
180                 /*
181                  * Not sequential, do not do any read-ahead
182                  */
183                 if (maxra <= 1)
184                         return 0;
185
186                 /*
187                  * No read-ahead mark, do not do any read-ahead
188                  * yet.
189                  */
190                 if ((bp->b_flags & B_RAM) == 0)
191                         return 0;
192
193                 /*
194                  * We hit a read-ahead-mark, figure out how much read-ahead
195                  * to do (maxra) and where to start (loffset).
196                  *
197                  * Shortcut the scan.  Typically the way this works is that
198                  * we've built up all the blocks in between except for the
199                  * last in previous iterations, so if the second-to-last
200                  * block is present we just skip ahead to it.
201                  *
202                  * This algorithm has O(1) cpu in the steady state no
203                  * matter how large maxra is.
204                  */
205                 bp->b_flags &= ~B_RAM;
206
207                 if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
208                         i = maxra - 1;
209                 else
210                         i = 1;
211                 while (i < maxra) {
212                         if (findblk(vp, loffset + i * blksize,
213                                     FINDBLK_TEST) == NULL) {
214                                 break;
215                         }
216                         ++i;
217                 }
218
219                 /*
220                  * We got everything or everything is in the cache, no
221                  * point continuing.
222                  */
223                 if (i >= maxra)
224                         return 0;
225
226                 /*
227                  * Calculate where to start the read-ahead and how much
228                  * to do.  Generally speaking we want to read-ahead by
229                  * (maxra) when we've found a read-ahead mark.  We do
230                  * not want to reduce maxra here as it will cause
231                  * successive read-ahead I/O's to be smaller and smaller.
232                  *
233                  * However, we have to make sure we don't break the
234                  * filesize limitation for the clustered operation.
235                  */
236                 loffset += i * blksize;
237                 reqbp = bp = NULL;
238
239                 if (loffset >= filesize)
240                         return 0;
241                 if (loffset + maxra * blksize > filesize) {
242                         maxreq = filesize - loffset;
243                         maxra = (int)(maxreq / blksize);
244                 }
245         } else {
246                 __debugvar off_t firstread = bp->b_loffset;
247                 int nblks;
248
249                 /*
250                  * Set-up synchronous read for bp.
251                  */
252                 bp->b_cmd = BUF_CMD_READ;
253                 bp->b_bio1.bio_done = biodone_sync;
254                 bp->b_bio1.bio_flags |= BIO_SYNC;
255
256                 KASSERT(firstread != NOOFFSET, 
257                         ("cluster_read: no buffer offset"));
258
259                 /*
260                  * nblks is our cluster_rbuild request size, limited
261                  * primarily by the device.
262                  */
263                 if ((nblks = maxra) > maxrbuild)
264                         nblks = maxrbuild;
265
266                 if (nblks > 1) {
267                         int burstbytes;
268
269                         error = VOP_BMAP(vp, loffset, &doffset,
270                                          &burstbytes, NULL, BUF_CMD_READ);
271                         if (error)
272                                 goto single_block_read;
273                         if (nblks > burstbytes / blksize)
274                                 nblks = burstbytes / blksize;
275                         if (doffset == NOOFFSET)
276                                 goto single_block_read;
277                         if (nblks <= 1)
278                                 goto single_block_read;
279
280                         bp = cluster_rbuild(vp, filesize, loffset,
281                                             doffset, blksize, nblks, bp);
282                         loffset += bp->b_bufsize;
283                         maxra -= bp->b_bufsize / blksize;
284                 } else {
285 single_block_read:
286                         /*
287                          * If it isn't in the cache, then get a chunk from
288                          * disk if sequential, otherwise just get the block.
289                          */
290                         cluster_setram(bp);
291                         loffset += blksize;
292                         --maxra;
293                 }
294         }
295
296         /*
297          * If B_CACHE was not set issue bp.  bp will either be an
298          * asynchronous cluster buf or a synchronous single-buf.
299          * If it is a single buf it will be the same as reqbp.
300          *
301          * NOTE: Once an async cluster buf is issued bp becomes invalid.
302          */
303         if (bp) {
304 #if defined(CLUSTERDEBUG)
305                 if (rcluster)
306                         kprintf("S(%012jx,%d,%d)\n",
307                             (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
308 #endif
309                 if ((bp->b_flags & B_CLUSTER) == 0)
310                         vfs_busy_pages(vp, bp);
311                 bp->b_flags &= ~(B_ERROR|B_INVAL);
312                 vn_strategy(vp, &bp->b_bio1);
313                 error = 0;
314                 /* bp invalid now */
315         }
316
317         /*
318          * If we have been doing sequential I/O, then do some read-ahead.
319          * The code above us should have positioned us at the next likely
320          * offset.
321          *
322          * Only mess with buffers which we can immediately lock.  HAMMER
323          * will do device-readahead irrespective of what the blocks
324          * represent.
325          */
326         while (error == 0 && maxra > 0) {
327                 int burstbytes;
328                 int tmp_error;
329                 int nblks;
330
331                 rbp = getblk(vp, loffset, blksize,
332                              GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
333                 if (rbp == NULL)
334                         goto no_read_ahead;
335                 if ((rbp->b_flags & B_CACHE)) {
336                         bqrelse(rbp);
337                         goto no_read_ahead;
338                 }
339
340                 /*
341                  * An error from the read-ahead bmap has nothing to do
342                  * with the caller's original request.
343                  */
344                 tmp_error = VOP_BMAP(vp, loffset, &doffset,
345                                      &burstbytes, NULL, BUF_CMD_READ);
346                 if (tmp_error || doffset == NOOFFSET) {
347                         rbp->b_flags |= B_INVAL;
348                         brelse(rbp);
349                         rbp = NULL;
350                         goto no_read_ahead;
351                 }
352                 if ((nblks = maxra) > maxrbuild)
353                         nblks = maxrbuild;
354                 if (nblks > burstbytes / blksize)
355                         nblks = burstbytes / blksize;
356
357                 /*
358                  * rbp: async read
359                  */
360                 rbp->b_cmd = BUF_CMD_READ;
361                 /*rbp->b_flags |= B_AGE*/;
362                 cluster_setram(rbp);
363
364                 if (nblks > 1) {
365                         rbp = cluster_rbuild(vp, filesize, loffset,
366                                              doffset, blksize, 
367                                              nblks, rbp);
368                 } else {
369                         rbp->b_bio2.bio_offset = doffset;
370                 }
371
372 #if defined(CLUSTERDEBUG)
373                 if (rcluster) {
374                         if (bp) {
375                                 kprintf("A+(%012jx,%d,%jd) "
376                                         "doff=%012jx minr=%zd ra=%d\n",
377                                     (intmax_t)loffset, rbp->b_bcount,
378                                     (intmax_t)(loffset - origoffset),
379                                     (intmax_t)doffset, minreq, maxra);
380                         } else {
381                                 kprintf("A-(%012jx,%d,%jd) "
382                                         "doff=%012jx minr=%zd ra=%d\n",
383                                     (intmax_t)rbp->b_loffset, rbp->b_bcount,
384                                     (intmax_t)(loffset - origoffset),
385                                     (intmax_t)doffset, minreq, maxra);
386                         }
387                 }
388 #endif
389                 rbp->b_flags &= ~(B_ERROR|B_INVAL);
390
391                 if ((rbp->b_flags & B_CLUSTER) == 0)
392                         vfs_busy_pages(vp, rbp);
393                 BUF_KERNPROC(rbp);
394                 loffset += rbp->b_bufsize;
395                 maxra -= rbp->b_bufsize / blksize;
396                 vn_strategy(vp, &rbp->b_bio1);
397                 /* rbp invalid now */
398         }
399
400         /*
401          * Wait for our original buffer to complete its I/O.  reqbp will
402          * be NULL if the original buffer was B_CACHE.  We are returning
403          * (*bpp) which is the same as reqbp when reqbp != NULL.
404          */
405 no_read_ahead:
406         if (reqbp) {
407                 KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
408                 error = biowait(&reqbp->b_bio1, "clurd");
409         }
410         return (error);
411 }
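/*
 * Illustrative sketch only (not part of the original file): how a
 * filesystem's VOP_READ path might call cluster_readx() for one logical
 * block and copy the data out to userland.  The helper name
 * example_fs_read_block(), the use of uio_resid as minreq and the error
 * handling are assumptions; real callers derive blksize and filesize from
 * their own metadata.
 */
static int
example_fs_read_block(struct vnode *vp, struct uio *uio,
                      off_t filesize, int blksize)
{
        struct buf *bp = NULL;
        off_t loffset = uio->uio_offset - (uio->uio_offset % blksize);
        int offset = (int)(uio->uio_offset - loffset);
        size_t xfer;
        int error;

        /* minreq reflects the caller's resid, maxreq the read-ahead cap */
        error = cluster_readx(vp, filesize, loffset, blksize,
                              uio->uio_resid, max_readahead, &bp);
        if (error) {
                if (bp)
                        brelse(bp);
                return (error);
        }

        /* copy out no more than the rest of this block or the resid */
        xfer = (size_t)(blksize - offset);
        if (xfer > uio->uio_resid)
                xfer = uio->uio_resid;
        error = uiomove(bp->b_data + offset, xfer, uio);
        bqrelse(bp);            /* keep the buffer cached for re-use */
        return (error);
}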
412
413 /*
414  * If blocks are contiguous on disk, use this to provide clustered
415  * read ahead.  We will read as many blocks as possible sequentially
416  * and then parcel them up into logical blocks in the buffer hash table.
417  *
418  * This function either returns a cluster buf or it returns fbp.  fbp is
419  * already expected to be set up as a synchronous or asynchronous request.
420  *
421  * If a cluster buf is returned it will always be async.
422  */
423 static struct buf *
424 cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
425                int blksize, int run, struct buf *fbp)
426 {
427         struct buf *bp, *tbp;
428         off_t boffset;
429         int i, j;
430         int maxiosize = vmaxiosize(vp);
431
432         /*
433          * avoid a division
434          */
435         while (loffset + run * blksize > filesize) {
436                 --run;
437         }
438
439         tbp = fbp;
440         tbp->b_bio2.bio_offset = doffset;
441         if ((tbp->b_flags & B_MALLOC) ||
442             ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
443                 return tbp;
444         }
445
446         bp = trypbuf_kva(&cluster_pbuf_freecnt);
447         if (bp == NULL) {
448                 return tbp;
449         }
450
451         /*
452          * We are synthesizing a buffer out of vm_page_t's, but
453          * if the block size is not page aligned then the starting
454          * address may not be either.  Inherit the b_data offset
455          * from the original buffer.
456          */
457         bp->b_data = (char *)((vm_offset_t)bp->b_data |
458             ((vm_offset_t)tbp->b_data & PAGE_MASK));
459         bp->b_flags |= B_CLUSTER | B_VMIO;
460         bp->b_cmd = BUF_CMD_READ;
461         bp->b_bio1.bio_done = cluster_callback;         /* default to async */
462         bp->b_bio1.bio_caller_info1.cluster_head = NULL;
463         bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
464         bp->b_loffset = loffset;
465         bp->b_bio2.bio_offset = doffset;
466         KASSERT(bp->b_loffset != NOOFFSET,
467                 ("cluster_rbuild: no buffer offset"));
468
469         bp->b_bcount = 0;
470         bp->b_bufsize = 0;
471         bp->b_xio.xio_npages = 0;
472
473         for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
474                 if (i) {
475                         if ((bp->b_xio.xio_npages * PAGE_SIZE) +
476                             round_page(blksize) > maxiosize) {
477                                 break;
478                         }
479
480                         /*
481                          * Shortcut some checks and try to avoid buffers that
482                          * would block in the lock.  The same checks have to
483                          * be made again after we officially get the buffer.
484                          */
485                         tbp = getblk(vp, loffset + i * blksize, blksize,
486                                      GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
487                         if (tbp == NULL)
488                                 break;
489                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
490                                 if (tbp->b_xio.xio_pages[j]->valid)
491                                         break;
492                         }
493                         if (j != tbp->b_xio.xio_npages) {
494                                 bqrelse(tbp);
495                                 break;
496                         }
497
498                         /*
499                          * Stop scanning if the buffer is fully valid
500                          * (marked B_CACHE), or locked (may be doing a
501                          * background write), or if the buffer is not
502                          * VMIO backed.  The clustering code can only deal
503                          * with VMIO-backed buffers.
504                          */
505                         if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
506                             (tbp->b_flags & B_VMIO) == 0 ||
507                             (LIST_FIRST(&tbp->b_dep) != NULL &&
508                              buf_checkread(tbp))
509                         ) {
510                                 bqrelse(tbp);
511                                 break;
512                         }
513
514                         /*
515                          * The buffer must be completely invalid in order to
516                          * take part in the cluster.  If it is partially valid
517                          * then we stop.
518                          */
519                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
520                                 if (tbp->b_xio.xio_pages[j]->valid)
521                                         break;
522                         }
523                         if (j != tbp->b_xio.xio_npages) {
524                                 bqrelse(tbp);
525                                 break;
526                         }
527
528                         /*
529                          * Set a read-ahead mark as appropriate.  Always
530                          * set the read-ahead mark at (run - 1).  It is
531                          * unclear why we were also setting it at i == 1.
532                          */
533                         if (/*i == 1 ||*/ i == (run - 1))
534                                 cluster_setram(tbp);
535
536                         /*
537                          * Depress the priority of buffers not explicitly
538                          * requested.
539                          */
540                         /* tbp->b_flags |= B_AGE; */
541
542                         /*
543                          * Set the block number if it isn't set, otherwise
544                          * if it is make sure it matches the block number we
545                          * expect.
546                          */
547                         if (tbp->b_bio2.bio_offset == NOOFFSET) {
548                                 tbp->b_bio2.bio_offset = boffset;
549                         } else if (tbp->b_bio2.bio_offset != boffset) {
550                                 brelse(tbp);
551                                 break;
552                         }
553                 }
554
555                 /*
556                  * The passed-in tbp (i == 0) will already be set up for
557                  * async or sync operation.  All other tbp's acquired in
558                  * our loop are set up for async operation.
559                  */
560                 tbp->b_cmd = BUF_CMD_READ;
561                 BUF_KERNPROC(tbp);
562                 cluster_append(&bp->b_bio1, tbp);
563                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
564                         vm_page_t m;
565
566                         m = tbp->b_xio.xio_pages[j];
567                         vm_page_busy_wait(m, FALSE, "clurpg");
568                         vm_page_io_start(m);
569                         vm_page_wakeup(m);
570                         vm_object_pip_add(m->object, 1);
571                         if ((bp->b_xio.xio_npages == 0) ||
572                                 (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
573                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
574                                 bp->b_xio.xio_npages++;
575                         }
576                         if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
577                                 tbp->b_xio.xio_pages[j] = bogus_page;
578                 }
579                 /*
580                  * XXX shouldn't this be += size for both, like in 
581                  * cluster_wbuild()?
582                  *
583                  * Don't inherit tbp->b_bufsize as it may be larger due to
584                  * a non-page-aligned size.  Instead just aggregate using
585                  * 'size'.
586                  */
587                 if (tbp->b_bcount != blksize)
588                     kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
589                 if (tbp->b_bufsize != blksize)
590                     kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
591                 bp->b_bcount += blksize;
592                 bp->b_bufsize += blksize;
593         }
594
595         /*
596          * Fully valid pages in the cluster are already good and do not need
597          * to be re-read from disk.  Replace the page with bogus_page
598          */
599         for (j = 0; j < bp->b_xio.xio_npages; j++) {
600                 if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
601                     VM_PAGE_BITS_ALL) {
602                         bp->b_xio.xio_pages[j] = bogus_page;
603                 }
604         }
605         if (bp->b_bufsize > bp->b_kvasize) {
606                 panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
607                     bp->b_bufsize, bp->b_kvasize);
608         }
609         pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
610                 (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
611         BUF_KERNPROC(bp);
612         return (bp);
613 }
614
615 /*
616  * Cleanup after a clustered read or write.
617  * This is complicated by the fact that any of the buffers might have
618  * extra memory (if there were no empty buffer headers at allocbuf time)
619  * that we will need to shift around.
620  *
621  * The returned bio is &bp->b_bio1
622  */
623 void
624 cluster_callback(struct bio *bio)
625 {
626         struct buf *bp = bio->bio_buf;
627         struct buf *tbp;
628         int error = 0;
629
630         /*
631          * Must propagate errors to all the components.  A short read (EOF)
632          * is a critical error.
633          */
634         if (bp->b_flags & B_ERROR) {
635                 error = bp->b_error;
636         } else if (bp->b_bcount != bp->b_bufsize) {
637                 panic("cluster_callback: unexpected EOF on cluster %p!", bio);
638         }
639
640         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
641         /*
642          * Move memory from the large cluster buffer into the component
643          * buffers and mark IO as done on these.  Since the memory map
644          * is the same, no actual copying is required.
645          */
646         while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
647                 bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
648                 if (error) {
649                         tbp->b_flags |= B_ERROR | B_IODEBUG;
650                         tbp->b_error = error;
651                 } else {
652                         tbp->b_dirtyoff = tbp->b_dirtyend = 0;
653                         tbp->b_flags &= ~(B_ERROR|B_INVAL);
654                         tbp->b_flags |= B_IODEBUG;
655                         /*
656                          * XXX the bdwrite()/bqrelse() issued during
657                          * cluster building clears B_RELBUF (see bqrelse()
658                          * comment).  If direct I/O was specified, we have
659                          * to restore it here to allow the buffer and VM
660                          * to be freed.
661                          */
662                         if (tbp->b_flags & B_DIRECT)
663                                 tbp->b_flags |= B_RELBUF;
664                 }
665                 biodone(&tbp->b_bio1);
666         }
667         relpbuf(bp, &cluster_pbuf_freecnt);
668 }
669
670 /*
671  * Implement modified write build for cluster.
672  *
673  *      write_behind = 0        write behind disabled
674  *      write_behind = 1        write behind normal (default)
675  *      write_behind = 2        write behind backed-off
676  *
677  * In addition, write_behind is only activated for files that have
678  * grown past a certain size (default 10MB).  Otherwise temporary files
679  * wind up generating a lot of unnecessary disk I/O.
680  */
681 static __inline int
682 cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
683 {
684         int r = 0;
685
686         switch(write_behind) {
687         case 2:
688                 if (start_loffset < len)
689                         break;
690                 start_loffset -= len;
691                 /* fall through */
692         case 1:
693                 if (vp->v_filesize >= write_behind_minfilesize) {
694                         r = cluster_wbuild(vp, NULL, blksize,
695                                            start_loffset, len);
696                 }
697                 /* fall through */
698         default:
699                 /* fall through */
700                 break;
701         }
702         return(r);
703 }
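/*
 * Tuning note (illustrative, not part of the original file): the knobs
 * declared near the top of this file can be adjusted at run time with
 * sysctl(8), for example:
 *
 *      sysctl vfs.write_behind=2                       # backed-off mode
 *      sysctl vfs.write_behind=0                       # disable entirely
 *      sysctl vfs.write_behind_minfilesize=33554432    # raise threshold to 32MB
 *
 * The values above are examples only; the compiled-in defaults are
 * write_behind=1 and a 10MB minimum file size.
 */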
704
705 /*
706  * Do clustered write for FFS.
707  *
708  * Four cases:
709  *      1. Write is not sequential (write asynchronously)
710  *      Write is sequential:
711  *      2.      beginning of cluster - begin cluster
712  *      3.      middle of a cluster - add to cluster
713  *      4.      end of a cluster - asynchronously write cluster
714  */
715 void
716 cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
717 {
718         struct vnode *vp;
719         off_t loffset;
720         int maxclen, cursize;
721         int async;
722
723         vp = bp->b_vp;
724         if (vp->v_type == VREG)
725                 async = vp->v_mount->mnt_flag & MNT_ASYNC;
726         else
727                 async = 0;
728         loffset = bp->b_loffset;
729         KASSERT(bp->b_loffset != NOOFFSET, 
730                 ("cluster_write: no buffer offset"));
731
732         /* Initialize vnode to beginning of file. */
733         if (loffset == 0)
734                 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
735
736         if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
737             bp->b_bio2.bio_offset == NOOFFSET ||
738             (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
739                 maxclen = vmaxiosize(vp);
740                 if (vp->v_clen != 0) {
741                         /*
742                          * Next block is not sequential.
743                          *
744                          * If we are not writing at end of file, the process
745                          * seeked to another point in the file since its last
746                          * write, or we have reached our maximum cluster size,
747                          * then push the previous cluster. Otherwise try
748                          * reallocating to make it sequential.
749                          *
750                          * Change to algorithm: only push previous cluster if
751                          * it was sequential from the point of view of the
752                          * seqcount heuristic, otherwise leave the buffer 
753                          * intact so we can potentially optimize the I/O
754                          * later on in the buf_daemon or update daemon
755                          * flush.
756                          */
757                         cursize = vp->v_lastw - vp->v_cstart + blksize;
758                         if (bp->b_loffset + blksize < filesize ||
759                             loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
760                                 if (!async && seqcount > 0) {
761                                         cluster_wbuild_wb(vp, blksize,
762                                                 vp->v_cstart, cursize);
763                                 }
764                         } else {
765                                 struct buf **bpp, **endbp;
766                                 struct cluster_save *buflist;
767
768                                 buflist = cluster_collectbufs(vp, bp, blksize);
769                                 endbp = &buflist->bs_children
770                                     [buflist->bs_nchildren - 1];
771                                 if (VOP_REALLOCBLKS(vp, buflist)) {
772                                         /*
773                                          * Failed, push the previous cluster
774                                          * if *really* writing sequentially
775                                          * in the logical file (seqcount > 1),
776                                          * otherwise delay it in the hopes that
777                                          * the low level disk driver can
778                                          * optimize the write ordering.
779                                          */
780                                         for (bpp = buflist->bs_children;
781                                              bpp < endbp; bpp++)
782                                                 brelse(*bpp);
783                                         kfree(buflist, M_SEGMENT);
784                                         if (seqcount > 1) {
785                                                 cluster_wbuild_wb(vp, 
786                                                     blksize, vp->v_cstart, 
787                                                     cursize);
788                                         }
789                                 } else {
790                                         /*
791                                          * Succeeded, keep building cluster.
792                                          */
793                                         for (bpp = buflist->bs_children;
794                                              bpp <= endbp; bpp++)
795                                                 bdwrite(*bpp);
796                                         kfree(buflist, M_SEGMENT);
797                                         vp->v_lastw = loffset;
798                                         vp->v_lasta = bp->b_bio2.bio_offset;
799                                         return;
800                                 }
801                         }
802                 }
803                 /*
804                  * Consider beginning a cluster. If at end of file, make
805                  * cluster as large as possible, otherwise find size of
806                  * existing cluster.
807                  */
808                 if ((vp->v_type == VREG) &&
809                     bp->b_loffset + blksize < filesize &&
810                     (bp->b_bio2.bio_offset == NOOFFSET) &&
811                     (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
812                      bp->b_bio2.bio_offset == NOOFFSET)) {
813                         bawrite(bp);
814                         vp->v_clen = 0;
815                         vp->v_lasta = bp->b_bio2.bio_offset;
816                         vp->v_cstart = loffset + blksize;
817                         vp->v_lastw = loffset;
818                         return;
819                 }
820                 if (maxclen > blksize)
821                         vp->v_clen = maxclen - blksize;
822                 else
823                         vp->v_clen = 0;
824                 if (!async && vp->v_clen == 0) { /* I/O not contiguous */
825                         vp->v_cstart = loffset + blksize;
826                         bawrite(bp);
827                 } else {        /* Wait for rest of cluster */
828                         vp->v_cstart = loffset;
829                         bdwrite(bp);
830                 }
831         } else if (loffset == vp->v_cstart + vp->v_clen) {
832                 /*
833                  * At end of cluster, write it out if seqcount tells us we
834                  * are operating sequentially, otherwise let the buf or
835                  * update daemon handle it.
836                  */
837                 bdwrite(bp);
838                 if (seqcount > 1)
839                         cluster_wbuild_wb(vp, blksize, vp->v_cstart,
840                                           vp->v_clen + blksize);
841                 vp->v_clen = 0;
842                 vp->v_cstart = loffset + blksize;
843         } else if (vm_page_count_severe()) {
844                 /*
845                  * We are low on memory, get it going NOW
846                  */
847                 bawrite(bp);
848         } else {
849                 /*
850                  * In the middle of a cluster, so just delay the I/O for now.
851                  */
852                 bdwrite(bp);
853         }
854         vp->v_lastw = loffset;
855         vp->v_lasta = bp->b_bio2.bio_offset;
856 }
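/*
 * Illustrative sketch only (assumption, not part of the original file):
 * how a filesystem's VOP_WRITE path might dispose of a fully written
 * logical block.  The helper name and the ioflag-based policy are
 * hypothetical; the cluster_write() call matches the function defined
 * above.
 */
static void
example_fs_finish_block(struct buf *bp, off_t new_filesize, int blksize,
                        int ioflag, int seqcount)
{
        if (ioflag & IO_SYNC) {
                bwrite(bp);             /* caller wants the data on disk now */
        } else if (vm_page_count_severe()) {
                bawrite(bp);            /* memory is tight, start I/O at once */
        } else {
                /* let the clustering heuristics decide (cases 1-4 above) */
                cluster_write(bp, new_filesize, blksize, seqcount);
        }
}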
857
858 /*
859  * This is the clustered version of bawrite().  It works similarly to
860  * cluster_write() except I/O on the buffer is guaranteed to occur.
861  */
862 int
863 cluster_awrite(struct buf *bp)
864 {
865         int total;
866
867         /*
868          * Don't bother if it isn't clusterable.
869          */
870         if ((bp->b_flags & B_CLUSTEROK) == 0 ||
871             bp->b_vp == NULL ||
872             (bp->b_vp->v_flag & VOBJBUF) == 0) {
873                 total = bp->b_bufsize;
874                 bawrite(bp);
875                 return (total);
876         }
877
878         total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
879                                bp->b_loffset, vmaxiosize(bp->b_vp));
880         if (bp)
881                 bawrite(bp);
882
883         return total;
884 }
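/*
 * Illustrative usage (assumption, not part of the original file): a
 * flusher that must initiate I/O immediately can call cluster_awrite()
 * where it would otherwise call bawrite(), and account for the total
 * bytes of write I/O actually started, including any neighboring dirty
 * buffers swept into the cluster.
 */
static int
example_flush_buffer(struct buf *bp)
{
        int started;

        started = cluster_awrite(bp);   /* bp is always consumed */
        return (started);               /* bytes of I/O initiated */
}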
885
886 /*
887  * This is an awful lot like cluster_rbuild...wish they could be combined.
888  * Starting at start_loffset, scan forward for up to 'bytes' worth of
889  * contiguous delayed-write (B_DELWRI) buffers belonging to vp and write
890  * them out as larger clustered I/Os whenever possible.
891  *
892  * cluster_wbuild() normally does not guarantee anything.  If bpp is
893  * non-NULL and cluster_wbuild() is able to incorporate it into the
894  * I/O it will set *bpp to NULL, otherwise it will leave it alone and
895  * the caller must dispose of *bpp.
896  */
897 static int
898 cluster_wbuild(struct vnode *vp, struct buf **bpp,
899                int blksize, off_t start_loffset, int bytes)
900 {
901         struct buf *bp, *tbp;
902         int i, j;
903         int totalwritten = 0;
904         int must_initiate;
905         int maxiosize = vmaxiosize(vp);
906
907         while (bytes > 0) {
908                 /*
909                  * If the buffer matches the passed locked & removed buffer
910                  * we use the passed buffer (which might not be B_DELWRI).
911                  *
912                  * Otherwise locate the buffer and determine if it is
913                  * compatible.
914                  */
915                 if (bpp && (*bpp)->b_loffset == start_loffset) {
916                         tbp = *bpp;
917                         *bpp = NULL;
918                         bpp = NULL;
919                 } else {
920                         tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
921                         if (tbp == NULL ||
922                             (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
923                              B_DELWRI ||
924                             (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
925                                 if (tbp)
926                                         BUF_UNLOCK(tbp);
927                                 start_loffset += blksize;
928                                 bytes -= blksize;
929                                 continue;
930                         }
931                         bremfree(tbp);
932                 }
933                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
934
935                 /*
936                  * Extra memory in the buffer, punt on this buffer.
937                  * XXX we could handle this in most cases, but we would
938                  * have to push the extra memory down to after our max
939                  * possible cluster size and then potentially pull it back
940                  * up if the cluster was terminated prematurely--too much
941                  * hassle.
942                  */
943                 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
944                     (tbp->b_bcount != tbp->b_bufsize) ||
945                     (tbp->b_bcount != blksize) ||
946                     (bytes == blksize) ||
947                     ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
948                         totalwritten += tbp->b_bufsize;
949                         bawrite(tbp);
950                         start_loffset += blksize;
951                         bytes -= blksize;
952                         continue;
953                 }
954
955                 /*
956                  * Set up the pbuf.  Track our append point with b_bcount
957                  * and b_bufsize.  b_bufsize is not used by the device but
958                  * our caller uses it to loop clusters and we use it to
959                  * detect a premature EOF on the block device.
960                  */
961                 bp->b_bcount = 0;
962                 bp->b_bufsize = 0;
963                 bp->b_xio.xio_npages = 0;
964                 bp->b_loffset = tbp->b_loffset;
965                 bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
966
967                 /*
968                  * We are synthesizing a buffer out of vm_page_t's, but
969                  * if the block size is not page aligned then the starting
970                  * address may not be either.  Inherit the b_data offset
971                  * from the original buffer.
972                  */
973                 bp->b_data = (char *)((vm_offset_t)bp->b_data |
974                     ((vm_offset_t)tbp->b_data & PAGE_MASK));
975                 bp->b_flags &= ~B_ERROR;
976                 bp->b_flags |= B_CLUSTER | B_BNOCLIP |
977                         (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
978                 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
979                 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
980
981                 /*
982                  * From this location in the file, scan forward to see
983                  * if there are buffers with adjacent data that need to
984                  * be written as well.
985                  *
986                  * IO *must* be initiated on index 0 at this point
987                  * (particularly when called from cluster_awrite()).
988                  */
989                 for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
990                         if (i == 0) {
991                                 must_initiate = 1;
992                         } else {
993                                 /*
994                                  * Not first buffer.
995                                  */
996                                 must_initiate = 0;
997                                 tbp = findblk(vp, start_loffset,
998                                               FINDBLK_NBLOCK);
999                                 /*
1000                                  * Buffer not found or could not be locked
1001                                  * non-blocking.
1002                                  */
1003                                 if (tbp == NULL)
1004                                         break;
1005
1006                                 /*
1007                                  * If it IS in core, but has different
1008                                  * characteristics, then don't cluster
1009                                  * with it.
1010                                  */
1011                                 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
1012                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
1013                                     != (B_DELWRI | B_CLUSTEROK |
1014                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
1015                                     (tbp->b_flags & B_LOCKED)
1016                                 ) {
1017                                         BUF_UNLOCK(tbp);
1018                                         break;
1019                                 }
1020
1021                                 /*
1022                                  * Check that the combined cluster
1023                                  * would make sense with regard to pages
1024                                  * and would not be too large
1025                                  *
1026                                  * WARNING! buf_checkwrite() must be the last
1027                                  *          check made.  If it returns 0 then
1028                                  *          we must initiate the I/O.
1029                                  */
1030                                 if ((tbp->b_bcount != blksize) ||
1031                                   ((bp->b_bio2.bio_offset + i) !=
1032                                     tbp->b_bio2.bio_offset) ||
1033                                   ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
1034                                     (maxiosize / PAGE_SIZE)) ||
1035                                   (LIST_FIRST(&tbp->b_dep) &&
1036                                    buf_checkwrite(tbp))
1037                                 ) {
1038                                         BUF_UNLOCK(tbp);
1039                                         break;
1040                                 }
1041                                 if (LIST_FIRST(&tbp->b_dep))
1042                                         must_initiate = 1;
1043                                 /*
1044                                  * Ok, it's passed all the tests,
1045                                  * so remove it from the free list
1046                                  * and mark it busy. We will use it.
1047                                  */
1048                                 bremfree(tbp);
1049                                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
1050                         }
1051
1052                         /*
1053                          * If the IO is via the VM then we do some
1054                          * special VM hackery (yuck).  Since the buffer's
1055                          * block size may not be page-aligned it is possible
1056                          * for a page to be shared between two buffers.  We
1057                          * have to get rid of the duplication when building
1058                          * the cluster.
1059                          */
1060                         if (tbp->b_flags & B_VMIO) {
1061                                 vm_page_t m;
1062
1063                                 /*
1064                                  * Try to avoid deadlocks with the VM system.
1065                                  * However, we cannot abort the I/O if
1066                                  * must_initiate is non-zero.
1067                                  */
1068                                 if (must_initiate == 0) {
1069                                         for (j = 0;
1070                                              j < tbp->b_xio.xio_npages;
1071                                              ++j) {
1072                                                 m = tbp->b_xio.xio_pages[j];
1073                                                 if (m->flags & PG_BUSY) {
1074                                                         bqrelse(tbp);
1075                                                         goto finishcluster;
1076                                                 }
1077                                         }
1078                                 }
1079                                         
1080                                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
1081                                         m = tbp->b_xio.xio_pages[j];
1082                                         vm_page_busy_wait(m, FALSE, "clurpg");
1083                                         vm_page_io_start(m);
1084                                         vm_page_wakeup(m);
1085                                         vm_object_pip_add(m->object, 1);
1086                                         if ((bp->b_xio.xio_npages == 0) ||
1087                                           (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
1088                                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
1089                                                 bp->b_xio.xio_npages++;
1090                                         }
1091                                 }
1092                         }
1093                         bp->b_bcount += blksize;
1094                         bp->b_bufsize += blksize;
1095
1096                         bundirty(tbp);
1097                         tbp->b_flags &= ~B_ERROR;
1098                         tbp->b_cmd = BUF_CMD_WRITE;
1099                         BUF_KERNPROC(tbp);
1100                         cluster_append(&bp->b_bio1, tbp);
1101
1102                         /*
1103                          * check for latent dependencies to be handled 
1104                          */
1105                         if (LIST_FIRST(&tbp->b_dep) != NULL)
1106                                 buf_start(tbp);
1107                 }
1108         finishcluster:
1109                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
1110                             (vm_page_t *)bp->b_xio.xio_pages,
1111                             bp->b_xio.xio_npages);
1112                 if (bp->b_bufsize > bp->b_kvasize) {
1113                         panic("cluster_wbuild: b_bufsize(%d) "
1114                               "> b_kvasize(%d)\n",
1115                               bp->b_bufsize, bp->b_kvasize);
1116                 }
1117                 totalwritten += bp->b_bufsize;
1118                 bp->b_dirtyoff = 0;
1119                 bp->b_dirtyend = bp->b_bufsize;
1120                 bp->b_bio1.bio_done = cluster_callback;
1121                 bp->b_cmd = BUF_CMD_WRITE;
1122
1123                 vfs_busy_pages(vp, bp);
1124                 bsetrunningbufspace(bp, bp->b_bufsize);
1125                 BUF_KERNPROC(bp);
1126                 vn_strategy(vp, &bp->b_bio1);
1127
1128                 bytes -= i;
1129         }
1130         return totalwritten;
1131 }
1132
1133 /*
1134  * Collect together all the buffers in a cluster.
1135  * Plus add one additional buffer.
1136  */
1137 static struct cluster_save *
1138 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
1139 {
1140         struct cluster_save *buflist;
1141         struct buf *bp;
1142         off_t loffset;
1143         int i, len;
1144
1145         len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
1146         buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1147                          M_SEGMENT, M_WAITOK);
1148         buflist->bs_nchildren = 0;
1149         buflist->bs_children = (struct buf **) (buflist + 1);
1150         for (loffset = vp->v_cstart, i = 0; i < len; (loffset += blksize), i++) {
1151                 (void) bread(vp, loffset, last_bp->b_bcount, &bp);
1152                 buflist->bs_children[i] = bp;
1153                 if (bp->b_bio2.bio_offset == NOOFFSET) {
1154                         VOP_BMAP(bp->b_vp, bp->b_loffset,
1155                                  &bp->b_bio2.bio_offset,
1156                                  NULL, NULL, BUF_CMD_WRITE);
1157                 }
1158         }
1159         buflist->bs_children[i] = bp = last_bp;
1160         if (bp->b_bio2.bio_offset == NOOFFSET) {
1161                 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1162                          NULL, NULL, BUF_CMD_WRITE);
1163         }
1164         buflist->bs_nchildren = i + 1;
1165         return (buflist);
1166 }
1167
1168 void
1169 cluster_append(struct bio *bio, struct buf *tbp)
1170 {
1171         tbp->b_cluster_next = NULL;
1172         if (bio->bio_caller_info1.cluster_head == NULL) {
1173                 bio->bio_caller_info1.cluster_head = tbp;
1174                 bio->bio_caller_info2.cluster_tail = tbp;
1175         } else {
1176                 bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
1177                 bio->bio_caller_info2.cluster_tail = tbp;
1178         }
1179 }
1180
1181 static
1182 void
1183 cluster_setram (struct buf *bp)
1184 {
1185         bp->b_flags |= B_RAM;
1186         if (bp->b_xio.xio_npages)
1187                 vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
1188 }