1 /*-
2  * Copyright (c) 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  *      Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *      @(#)vfs_cluster.c       8.7 (Berkeley) 2/13/94
32  * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
33  * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
34  */
35
36 #include "opt_debug_cluster.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/proc.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/malloc.h>
45 #include <sys/mount.h>
46 #include <sys/resourcevar.h>
47 #include <sys/vmmeter.h>
48 #include <vm/vm.h>
49 #include <vm/vm_object.h>
50 #include <vm/vm_page.h>
51 #include <sys/sysctl.h>
52
53 #include <sys/buf2.h>
54 #include <vm/vm_page2.h>
55
56 #include <machine/limits.h>
57
58 #if defined(CLUSTERDEBUG)
59 #include <sys/sysctl.h>
60 static int      rcluster = 0;
61 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
62 #endif
63
64 static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
65
66 static struct cluster_save *
67         cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
68                             int blksize);
69 static struct buf *
70         cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
71                             off_t doffset, int blksize, int run, 
72                             struct buf *fbp);
73 static void cluster_callback (struct bio *);
74 static void cluster_setram (struct buf *);
75 static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
76                             off_t start_loffset, int bytes);
77
78 static int write_behind = 1;
79 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
80     "Cluster write-behind setting");
81 static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
82 SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
83     &write_behind_minfilesize, 0, "Cluster write-behind minimum file size");
84 static int max_readahead = 2 * 1024 * 1024;
85 SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
86     "Limit in bytes for desired cluster read-ahead");
87
88 extern vm_page_t        bogus_page;
89
90 extern int cluster_pbuf_freecnt;
91
92 /*
93  * This replaces bread.
94  *
95  * filesize     - read-ahead @ blksize will not cross this boundary
96  * loffset      - loffset for returned *bpp
97  * blksize      - blocksize for returned *bpp and read-ahead bps
98  * minreq       - minimum (not a hard minimum) in bytes, typically reflects
99  *                a higher level uio resid.
100  * maxreq       - maximum (sequential heuristic) in bytes (highest typ ~2MB)
101  * bpp          - return buffer (*bpp) for (loffset,blksize)
102  */
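/*
 * Example: a filesystem reading 16KB blocks from a 1MB file would pass
 * filesize = 1MB (the boundary read-ahead must not cross), blksize = 16KB,
 * minreq as the caller's remaining uio resid, and maxreq from its sequential
 * access heuristic; maxreq is further capped by vfs.max_readahead.
 */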
103 int
104 cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
105              int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
106 {
107         struct buf *bp, *rbp, *reqbp;
108         off_t origoffset;
109         off_t doffset;
110         int error;
111         int i;
112         int maxra;
113         int maxrbuild;
114
115         error = 0;
116
117         /*
118          * Calculate the desired read-ahead in blksize'd blocks (maxra).
119          * To do this we calculate maxreq.
120          *
121          * maxreq typically starts out as a sequential heuristic.  If the
122          * high level uio/resid is bigger (minreq), we pop maxreq up to
123  * minreq.  This covers the case where userland is performing random
124  * I/O using big read()'s.
125          *
126          * Then we limit maxreq to max_readahead to ensure it is a reasonable
127          * value.
128          *
129          * Finally we must ensure that (loffset + maxreq) does not cross the
130          * boundary (filesize) for the current blocksize.  If we allowed it
131          * to cross we could end up with buffers past the boundary with the
132          * wrong block size (HAMMER large-data areas use mixed block sizes).
133          * minreq is also absolutely limited to filesize.
134          */
135         if (maxreq < minreq)
136                 maxreq = minreq;
137         /* minreq not used beyond this point */
138
139         if (maxreq > max_readahead) {
140                 maxreq = max_readahead;
141                 if (maxreq > 16 * 1024 * 1024)
142                         maxreq = 16 * 1024 * 1024;
143         }
144         if (maxreq < blksize)
145                 maxreq = blksize;
146         if (loffset + maxreq > filesize) {
147                 if (loffset > filesize)
148                         maxreq = 0;
149                 else
150                         maxreq = filesize - loffset;
151         }
152
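        /*
         * e.g. with the default vfs.max_readahead of 2MB and a 16KB blksize,
         * maxra computes to at most 128 read-ahead blocks.
         */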
153         maxra = (int)(maxreq / blksize);
154
155         /*
156          * Get the requested block.
157          */
158         if (*bpp)
159                 reqbp = bp = *bpp;
160         else
161                 *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
162         origoffset = loffset;
163
164         /*
165          * Calculate the maximum cluster size for a single I/O, used
166          * by cluster_rbuild().
167          */
168         maxrbuild = vmaxiosize(vp) / blksize;
169
170         /*
171          * if it is in the cache, then check to see if the reads have been
172          * sequential.  If they have, then try some read-ahead, otherwise
173          * back-off on prospective read-aheads.
174          */
175         if (bp->b_flags & B_CACHE) {
176                 /*
177                  * Not sequential, do not do any read-ahead
178                  */
179                 if (maxra <= 1)
180                         return 0;
181
182                 /*
183                  * No read-ahead mark, do not do any read-ahead
184                  * yet.
185                  */
186                 if ((bp->b_flags & B_RAM) == 0)
187                         return 0;
188
189                 /*
190                  * We hit a read-ahead-mark, figure out how much read-ahead
191                  * to do (maxra) and where to start (loffset).
192                  *
193                  * Shortcut the scan.  Typically the way this works is that
194                  * we've built up all the blocks in between except for the
195                  * last in previous iterations, so if the second-to-last
196                  * block is present we just skip ahead to it.
197                  *
198                  * This algorithm has O(1) cpu in the steady state no
199                  * matter how large maxra is.
200                  */
201                 bp->b_flags &= ~B_RAM;
202
203                 if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
204                         i = maxra - 1;
205                 else
206                         i = 1;
207                 while (i < maxra) {
208                         if (findblk(vp, loffset + i * blksize,
209                                     FINDBLK_TEST) == NULL) {
210                                 break;
211                         }
212                         ++i;
213                 }
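                /*
                 * e.g. if maxra is 32 and the previous pass instantiated
                 * blocks 1..30, the findblk() test at (maxra - 2) above lets
                 * the scan start at i = 31 instead of walking all 32 blocks.
                 */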
214
215                 /*
216                  * We got everything or everything is in the cache, no
217                  * point continuing.
218                  */
219                 if (i >= maxra)
220                         return 0;
221
222                 /*
223                  * Calculate where to start the read-ahead and how much
224                  * to do.  Generally speaking we want to read-ahead by
225                  * (maxra) when we've found a read-ahead mark.  We do
226                  * not want to reduce maxra here as it will cause
227                  * successive read-ahead I/O's to be smaller and smaller.
228                  *
229                  * However, we have to make sure we don't break the
230                  * filesize limitation for the clustered operation.
231                  */
232                 loffset += i * blksize;
233                 reqbp = bp = NULL;
234
235                 if (loffset >= filesize)
236                         return 0;
237                 if (loffset + maxra * blksize > filesize) {
238                         maxreq = filesize - loffset;
239                         maxra = (int)(maxreq / blksize);
240                 }
241         } else {
242                 __debugvar off_t firstread = bp->b_loffset;
243                 int nblks;
244
245                 /*
246                  * Set-up synchronous read for bp.
247                  */
248                 bp->b_cmd = BUF_CMD_READ;
249                 bp->b_bio1.bio_done = biodone_sync;
250                 bp->b_bio1.bio_flags |= BIO_SYNC;
251
252                 KASSERT(firstread != NOOFFSET, 
253                         ("cluster_read: no buffer offset"));
254
255                 /*
256                  * nblks is our cluster_rbuild request size, limited
257                  * primarily by the device.
258                  */
259                 if ((nblks = maxra) > maxrbuild)
260                         nblks = maxrbuild;
261
262                 if (nblks > 1) {
263                         int burstbytes;
264
265                         error = VOP_BMAP(vp, loffset, &doffset,
266                                          &burstbytes, NULL, BUF_CMD_READ);
267                         if (error)
268                                 goto single_block_read;
269                         if (nblks > burstbytes / blksize)
270                                 nblks = burstbytes / blksize;
271                         if (doffset == NOOFFSET)
272                                 goto single_block_read;
273                         if (nblks <= 1)
274                                 goto single_block_read;
275
276                         bp = cluster_rbuild(vp, filesize, loffset,
277                                             doffset, blksize, nblks, bp);
278                         loffset += bp->b_bufsize;
279                         maxra -= bp->b_bufsize / blksize;
280                 } else {
281 single_block_read:
282                         /*
283                          * If it isn't in the cache, then get a chunk from
284                          * disk if sequential, otherwise just get the block.
285                          */
286                         cluster_setram(bp);
287                         loffset += blksize;
288                         --maxra;
289                 }
290         }
291
292         /*
293          * If B_CACHE was not set issue bp.  bp will either be an
294          * asynchronous cluster buf or a synchronous single-buf.
295          * If it is a single buf it will be the same as reqbp.
296          *
297          * NOTE: Once an async cluster buf is issued bp becomes invalid.
298          */
299         if (bp) {
300 #if defined(CLUSTERDEBUG)
301                 if (rcluster)
302                         kprintf("S(%012jx,%d,%d)\n",
303                             (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
304 #endif
305                 if ((bp->b_flags & B_CLUSTER) == 0)
306                         vfs_busy_pages(vp, bp);
307                 bp->b_flags &= ~(B_ERROR|B_INVAL);
308                 vn_strategy(vp, &bp->b_bio1);
309                 error = 0;
310                 /* bp invalid now */
311         }
312
313         /*
314          * If we have been doing sequential I/O, then do some read-ahead.
315          * The code above us should have positioned us at the next likely
316          * offset.
317          *
318          * Only mess with buffers which we can immediately lock.  HAMMER
319          * will do device-readahead irrespective of what the blocks
320          * represent.
321          */
322         while (error == 0 && maxra > 0) {
323                 int burstbytes;
324                 int tmp_error;
325                 int nblks;
326
327                 rbp = getblk(vp, loffset, blksize,
328                              GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
329                 if (rbp == NULL)
330                         goto no_read_ahead;
331                 if ((rbp->b_flags & B_CACHE)) {
332                         bqrelse(rbp);
333                         goto no_read_ahead;
334                 }
335
336                 /*
337                  * An error from the read-ahead bmap has nothing to do
338                  * with the caller's original request.
339                  */
340                 tmp_error = VOP_BMAP(vp, loffset, &doffset,
341                                      &burstbytes, NULL, BUF_CMD_READ);
342                 if (tmp_error || doffset == NOOFFSET) {
343                         rbp->b_flags |= B_INVAL;
344                         brelse(rbp);
345                         rbp = NULL;
346                         goto no_read_ahead;
347                 }
348                 if ((nblks = maxra) > maxrbuild)
349                         nblks = maxrbuild;
350                 if (nblks > burstbytes / blksize)
351                         nblks = burstbytes / blksize;
352
353                 /*
354                  * rbp: async read
355                  */
356                 rbp->b_cmd = BUF_CMD_READ;
357                 /*rbp->b_flags |= B_AGE*/;
358                 cluster_setram(rbp);
359
360                 if (nblks > 1) {
361                         rbp = cluster_rbuild(vp, filesize, loffset,
362                                              doffset, blksize, 
363                                              nblks, rbp);
364                 } else {
365                         rbp->b_bio2.bio_offset = doffset;
366                 }
367
368 #if defined(CLUSTERDEBUG)
369                 if (rcluster) {
370                         if (bp) {
371                                 kprintf("A+(%012jx,%d,%jd) "
372                                         "doff=%012jx minr=%zd ra=%d\n",
373                                     (intmax_t)loffset, rbp->b_bcount,
374                                     (intmax_t)(loffset - origoffset),
375                                     (intmax_t)doffset, minreq, maxra);
376                         } else {
377                                 kprintf("A-(%012jx,%d,%jd) "
378                                         "doff=%012jx minr=%zd ra=%d\n",
379                                     (intmax_t)rbp->b_loffset, rbp->b_bcount,
380                                     (intmax_t)(loffset - origoffset),
381                                     (intmax_t)doffset, minreq, maxra);
382                         }
383                 }
384 #endif
385                 rbp->b_flags &= ~(B_ERROR|B_INVAL);
386
387                 if ((rbp->b_flags & B_CLUSTER) == 0)
388                         vfs_busy_pages(vp, rbp);
389                 BUF_KERNPROC(rbp);
390                 loffset += rbp->b_bufsize;
391                 maxra -= rbp->b_bufsize / blksize;
392                 vn_strategy(vp, &rbp->b_bio1);
393                 /* rbp invalid now */
394         }
395
396         /*
397          * Wait for our original buffer to complete its I/O.  reqbp will
398          * be NULL if the original buffer was B_CACHE.  We are returning
399          * (*bpp) which is the same as reqbp when reqbp != NULL.
400          */
401 no_read_ahead:
402         if (reqbp) {
403                 KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
404                 error = biowait(&reqbp->b_bio1, "clurd");
405         }
406         return (error);
407 }
408
409 /*
410  * If blocks are contiguous on disk, use this to provide clustered
411  * read ahead.  We will read as many blocks as possible sequentially
412  * and then parcel them up into logical blocks in the buffer hash table.
413  *
414  * This function either returns a cluster buf or it returns fbp.  fbp is
415  * already expected to be set up as a synchronous or asynchronous request.
416  *
417  * If a cluster buf is returned it will always be async.
418  */
419 static struct buf *
420 cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
421                int blksize, int run, struct buf *fbp)
422 {
423         struct buf *bp, *tbp;
424         off_t boffset;
425         int i, j;
426         int maxiosize = vmaxiosize(vp);
427
428         /*
429          * avoid a division
430          */
431         while (loffset + run * blksize > filesize) {
432                 --run;
433         }
434
435         tbp = fbp;
436         tbp->b_bio2.bio_offset = doffset;
437         if ((tbp->b_flags & B_MALLOC) ||
438             ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
439                 return tbp;
440         }
441
442         bp = trypbuf_kva(&cluster_pbuf_freecnt);
443         if (bp == NULL) {
444                 return tbp;
445         }
446
447         /*
448          * We are synthesizing a buffer out of vm_page_t's, but
449          * if the block size is not page aligned then the starting
450          * address may not be either.  Inherit the b_data offset
451          * from the original buffer.
452          */
453         bp->b_data = (char *)((vm_offset_t)bp->b_data |
454             ((vm_offset_t)tbp->b_data & PAGE_MASK));
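        /*
         * e.g. if tbp->b_data begins 2KB into its first page, bp->b_data is
         * given the same 2KB page offset so byte 0 of the cluster maps to
         * byte 0 of the original buffer once the pages are entered below.
         */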
455         bp->b_flags |= B_CLUSTER | B_VMIO;
456         bp->b_cmd = BUF_CMD_READ;
457         bp->b_bio1.bio_done = cluster_callback;         /* default to async */
458         bp->b_bio1.bio_caller_info1.cluster_head = NULL;
459         bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
460         bp->b_loffset = loffset;
461         bp->b_bio2.bio_offset = doffset;
462         KASSERT(bp->b_loffset != NOOFFSET,
463                 ("cluster_rbuild: no buffer offset"));
464
465         bp->b_bcount = 0;
466         bp->b_bufsize = 0;
467         bp->b_xio.xio_npages = 0;
468
469         for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
470                 if (i) {
471                         if ((bp->b_xio.xio_npages * PAGE_SIZE) +
472                             round_page(blksize) > maxiosize) {
473                                 break;
474                         }
475
476                         /*
477                          * Shortcut some checks and try to avoid buffers that
478                          * would block in the lock.  The same checks have to
479                          * be made again after we officially get the buffer.
480                          */
481                         tbp = getblk(vp, loffset + i * blksize, blksize,
482                                      GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
483                         if (tbp == NULL)
484                                 break;
485                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
486                                 if (tbp->b_xio.xio_pages[j]->valid)
487                                         break;
488                         }
489                         if (j != tbp->b_xio.xio_npages) {
490                                 bqrelse(tbp);
491                                 break;
492                         }
493
494                         /*
495                          * Stop scanning if the buffer is fully valid
496                          * (marked B_CACHE), or locked (may be doing a
497                          * background write), or if the buffer is not
498                          * VMIO backed.  The clustering code can only deal
499                          * with VMIO-backed buffers.
500                          */
501                         if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
502                             (tbp->b_flags & B_VMIO) == 0 ||
503                             (LIST_FIRST(&tbp->b_dep) != NULL &&
504                              buf_checkread(tbp))
505                         ) {
506                                 bqrelse(tbp);
507                                 break;
508                         }
509
510                         /*
511                          * The buffer must be completely invalid in order to
512                          * take part in the cluster.  If it is partially valid
513                          * then we stop.
514                          */
515                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
516                                 if (tbp->b_xio.xio_pages[j]->valid)
517                                         break;
518                         }
519                         if (j != tbp->b_xio.xio_npages) {
520                                 bqrelse(tbp);
521                                 break;
522                         }
523
524                         /*
525                          * Set a read-ahead mark as appropriate.  Always
526                          * set the read-ahead mark at (run - 1).  It is
527                          * unclear why we were also setting it at i == 1.
528                          */
529                         if (/*i == 1 ||*/ i == (run - 1))
530                                 cluster_setram(tbp);
531
532                         /*
533                          * Depress the priority of buffers not explicitly
534                          * requested.
535                          */
536                         /* tbp->b_flags |= B_AGE; */
537
538                         /*
539                          * Set the block number if it isn't set; if it is
540                          * already set, make sure it matches the block
541                          * number we expect.
542                          */
543                         if (tbp->b_bio2.bio_offset == NOOFFSET) {
544                                 tbp->b_bio2.bio_offset = boffset;
545                         } else if (tbp->b_bio2.bio_offset != boffset) {
546                                 brelse(tbp);
547                                 break;
548                         }
549                 }
550
551                 /*
552                  * The passed-in tbp (i == 0) will already be set up for
553                  * async or sync operation.  All other tbp's acquired in
554                  * our loop are set up for async operation.
555                  */
556                 tbp->b_cmd = BUF_CMD_READ;
557                 BUF_KERNPROC(tbp);
558                 cluster_append(&bp->b_bio1, tbp);
559                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
560                         vm_page_t m;
561
562                         m = tbp->b_xio.xio_pages[j];
563                         vm_page_busy_wait(m, FALSE, "clurpg");
564                         vm_page_io_start(m);
565                         vm_page_wakeup(m);
566                         vm_object_pip_add(m->object, 1);
567                         if ((bp->b_xio.xio_npages == 0) ||
568                                 (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
569                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
570                                 bp->b_xio.xio_npages++;
571                         }
572                         if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
573                                 tbp->b_xio.xio_pages[j] = bogus_page;
574                 }
575                 /*
576                  * XXX shouldn't this be += size for both, like in 
577                  * cluster_wbuild()?
578                  *
579                  * Don't inherit tbp->b_bufsize as it may be larger due to
580                  * a non-page-aligned size.  Instead just aggregate using
581                  * 'size'.
582                  */
583                 if (tbp->b_bcount != blksize)
584                     kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
585                 if (tbp->b_bufsize != blksize)
586                     kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
587                 bp->b_bcount += blksize;
588                 bp->b_bufsize += blksize;
589         }
590
591         /*
592          * Fully valid pages in the cluster are already good and do not need
593          * to be re-read from disk.  Replace the page with bogus_page
594          */
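        /*
         * The device transfer still covers the full range; bytes destined
         * for an already-valid page land in bogus_page and are discarded,
         * leaving the cached copy untouched.
         */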
595         for (j = 0; j < bp->b_xio.xio_npages; j++) {
596                 if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
597                     VM_PAGE_BITS_ALL) {
598                         bp->b_xio.xio_pages[j] = bogus_page;
599                 }
600         }
601         if (bp->b_bufsize > bp->b_kvasize) {
602                 panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
603                     bp->b_bufsize, bp->b_kvasize);
604         }
605         pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
606                 (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
607         BUF_KERNPROC(bp);
608         return (bp);
609 }
610
611 /*
612  * Cleanup after a clustered read or write.
613  * This is complicated by the fact that any of the buffers might have
614  * extra memory (if there were no empty buffer headers at allocbuf time)
615  * that we will need to shift around.
616  *
617  * The returned bio is &bp->b_bio1
618  */
619 void
620 cluster_callback(struct bio *bio)
621 {
622         struct buf *bp = bio->bio_buf;
623         struct buf *tbp;
624         int error = 0;
625
626         /*
627          * Must propagate errors to all the components.  A short read (EOF)
628          * is a critical error.
629          */
630         if (bp->b_flags & B_ERROR) {
631                 error = bp->b_error;
632         } else if (bp->b_bcount != bp->b_bufsize) {
633                 panic("cluster_callback: unexpected EOF on cluster %p!", bio);
634         }
635
636         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
637         /*
638          * Move memory from the large cluster buffer into the component
639          * buffers and mark IO as done on these.  Since the memory map
640          * is the same, no actual copying is required.
641          */
642         while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
643                 bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
644                 if (error) {
645                         tbp->b_flags |= B_ERROR | B_IODEBUG;
646                         tbp->b_error = error;
647                 } else {
648                         tbp->b_dirtyoff = tbp->b_dirtyend = 0;
649                         tbp->b_flags &= ~(B_ERROR|B_INVAL);
650                         tbp->b_flags |= B_IODEBUG;
651                         /*
652                          * XXX the bdwrite()/bqrelse() issued during
653                          * cluster building clears B_RELBUF (see bqrelse()
654                          * comment).  If direct I/O was specified, we have
655                          * to restore it here to allow the buffer and VM
656                          * to be freed.
657                          */
658                         if (tbp->b_flags & B_DIRECT)
659                                 tbp->b_flags |= B_RELBUF;
660                 }
661                 biodone(&tbp->b_bio1);
662         }
663         relpbuf(bp, &cluster_pbuf_freecnt);
664 }
665
666 /*
667  * Implement modified write build for cluster.
668  *
669  *      write_behind = 0        write behind disabled
670  *      write_behind = 1        write behind normal (default)
671  *      write_behind = 2        write behind backed-off
672  *
673  * In addition, write_behind is only activated for files that have
674  * grown past a certain size (default 10MB).  Otherwise temporary files
675  * wind up generating a lot of unnecessary disk I/O.
676  */
677 static __inline int
678 cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
679 {
680         int r = 0;
681
682         switch (write_behind) {
683         case 2:
684                 if (start_loffset < len)
685                         break;
686                 start_loffset -= len;
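                /* trail the writer by one cluster length before pushing */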
687                 /* fall through */
688         case 1:
689                 if (vp->v_filesize >= write_behind_minfilesize) {
690                         r = cluster_wbuild(vp, NULL, blksize,
691                                            start_loffset, len);
692                 }
693                 /* fall through */
694         default:
695                 /* fall through */
696                 break;
697         }
698         return(r);
699 }
700
701 /*
702  * Do clustered write for FFS.
703  *
704  * Four cases:
705  *      1. Write is not sequential (write asynchronously)
706  *      Write is sequential:
707  *      2.      beginning of cluster - begin cluster
708  *      3.      middle of a cluster - add to cluster
709  *      4.      end of a cluster - asynchronously write cluster
710  */
711 void
712 cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
713 {
714         struct vnode *vp;
715         off_t loffset;
716         int maxclen, cursize;
717         int async;
718
719         vp = bp->b_vp;
720         if (vp->v_type == VREG)
721                 async = vp->v_mount->mnt_flag & MNT_ASYNC;
722         else
723                 async = 0;
724         loffset = bp->b_loffset;
725         KASSERT(bp->b_loffset != NOOFFSET, 
726                 ("cluster_write: no buffer offset"));
727
728         /* Initialize vnode to beginning of file. */
729         if (loffset == 0)
730                 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
731
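        /*
         * v_cstart and v_clen track the logical start and maximum extent of
         * the cluster being accumulated; v_lastw and v_lasta remember the
         * last logical and physical offsets written, which is how sequential
         * writes are detected below.
         */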
732         if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
733             bp->b_bio2.bio_offset == NOOFFSET ||
734             (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
735                 maxclen = vmaxiosize(vp);
736                 if (vp->v_clen != 0) {
737                         /*
738                          * Next block is not sequential.
739                          *
740                          * If we are not writing at end of file, the process
741                          * seeked to another point in the file since its last
742                          * write, or we have reached our maximum cluster size,
743                          * then push the previous cluster. Otherwise try
744                          * reallocating to make it sequential.
745                          *
746                          * Change to algorithm: only push previous cluster if
747                          * it was sequential from the point of view of the
748                          * seqcount heuristic, otherwise leave the buffer 
749                          * intact so we can potentially optimize the I/O
750                          * later on in the buf_daemon or update daemon
751                          * flush.
752                          */
753                         cursize = vp->v_lastw - vp->v_cstart + blksize;
754                         if (bp->b_loffset + blksize < filesize ||
755                             loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
756                                 if (!async && seqcount > 0) {
757                                         cluster_wbuild_wb(vp, blksize,
758                                                 vp->v_cstart, cursize);
759                                 }
760                         } else {
761                                 struct buf **bpp, **endbp;
762                                 struct cluster_save *buflist;
763
764                                 buflist = cluster_collectbufs(vp, bp, blksize);
765                                 endbp = &buflist->bs_children
766                                     [buflist->bs_nchildren - 1];
767                                 if (VOP_REALLOCBLKS(vp, buflist)) {
768                                         /*
769                                          * Failed, push the previous cluster
770                                          * if *really* writing sequentially
771                                          * in the logical file (seqcount > 1),
772                                          * otherwise delay it in the hopes that
773                                          * the low level disk driver can
774                                          * optimize the write ordering.
775                                          */
776                                         for (bpp = buflist->bs_children;
777                                              bpp < endbp; bpp++)
778                                                 brelse(*bpp);
779                                         kfree(buflist, M_SEGMENT);
780                                         if (seqcount > 1) {
781                                                 cluster_wbuild_wb(vp, 
782                                                     blksize, vp->v_cstart, 
783                                                     cursize);
784                                         }
785                                 } else {
786                                         /*
787                                          * Succeeded, keep building cluster.
788                                          */
789                                         for (bpp = buflist->bs_children;
790                                              bpp <= endbp; bpp++)
791                                                 bdwrite(*bpp);
792                                         kfree(buflist, M_SEGMENT);
793                                         vp->v_lastw = loffset;
794                                         vp->v_lasta = bp->b_bio2.bio_offset;
795                                         return;
796                                 }
797                         }
798                 }
799                 /*
800                  * Consider beginning a cluster. If at end of file, make
801                  * cluster as large as possible, otherwise find size of
802                  * existing cluster.
803                  */
804                 if ((vp->v_type == VREG) &&
805                     bp->b_loffset + blksize < filesize &&
806                     (bp->b_bio2.bio_offset == NOOFFSET) &&
807                     (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
808                      bp->b_bio2.bio_offset == NOOFFSET)) {
809                         bdwrite(bp);
810                         vp->v_clen = 0;
811                         vp->v_lasta = bp->b_bio2.bio_offset;
812                         vp->v_cstart = loffset + blksize;
813                         vp->v_lastw = loffset;
814                         return;
815                 }
816                 if (maxclen > blksize)
817                         vp->v_clen = maxclen - blksize;
818                 else
819                         vp->v_clen = 0;
820                 if (!async && vp->v_clen == 0) { /* I/O not contiguous */
821                         vp->v_cstart = loffset + blksize;
822                         bdwrite(bp);
823                 } else {        /* Wait for rest of cluster */
824                         vp->v_cstart = loffset;
825                         bdwrite(bp);
826                 }
827         } else if (loffset == vp->v_cstart + vp->v_clen) {
828                 /*
829                  * At end of cluster, write it out if seqcount tells us we
830                  * are operating sequentially, otherwise let the buf or
831                  * update daemon handle it.
832                  */
833                 bdwrite(bp);
834                 if (seqcount > 1)
835                         cluster_wbuild_wb(vp, blksize, vp->v_cstart,
836                                           vp->v_clen + blksize);
837                 vp->v_clen = 0;
838                 vp->v_cstart = loffset + blksize;
839         } else if (vm_page_count_severe() &&
840                    bp->b_loffset + blksize < filesize) {
841                 /*
842                  * We are low on memory, get it going NOW.  However, do not
843                  * try to push out a partial block at the end of the file
844                  * as this could lead to extremely non-optimal write activity.
845                  */
846                 bawrite(bp);
847         } else {
848                 /*
849                  * In the middle of a cluster, so just delay the I/O for now.
850                  */
851                 bdwrite(bp);
852         }
853         vp->v_lastw = loffset;
854         vp->v_lasta = bp->b_bio2.bio_offset;
855 }
856
857 /*
858  * This is the clustered version of bawrite().  It works similarly to
859  * cluster_write() except I/O on the buffer is guaranteed to occur.
860  */
861 int
862 cluster_awrite(struct buf *bp)
863 {
864         int total;
865
866         /*
867          * Don't bother if it isn't clusterable.
868          */
869         if ((bp->b_flags & B_CLUSTEROK) == 0 ||
870             bp->b_vp == NULL ||
871             (bp->b_vp->v_flag & VOBJBUF) == 0) {
872                 total = bp->b_bufsize;
873                 bawrite(bp);
874                 return (total);
875         }
876
877         total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
878                                bp->b_loffset, vmaxiosize(bp->b_vp));
879         if (bp)
880                 bawrite(bp);
881
882         return total;
883 }
884
885 /*
886  * This is an awful lot like cluster_rbuild...wish they could be combined.
887  * Scan forward from start_loffset, gathering up to 'bytes' worth of
888  * compatible delayed-write buffers of size blksize and issuing them as
889  * one or more clustered writes.
890  *
891  * cluster_wbuild() normally does not guarantee anything.  If bpp is
892  * non-NULL and cluster_wbuild() is able to incorporate it into the
893  * I/O it will set *bpp to NULL, otherwise it will leave it alone and
894  * the caller must dispose of *bpp.
895  */
896 static int
897 cluster_wbuild(struct vnode *vp, struct buf **bpp,
898                int blksize, off_t start_loffset, int bytes)
899 {
900         struct buf *bp, *tbp;
901         int i, j;
902         int totalwritten = 0;
903         int must_initiate;
904         int maxiosize = vmaxiosize(vp);
905
906         while (bytes > 0) {
907                 /*
908                  * If the buffer matches the passed locked & removed buffer
909                  * we use the passed buffer (which might not be B_DELWRI).
910                  *
911                  * Otherwise locate the buffer and determine if it is
912                  * compatible.
913                  */
914                 if (bpp && (*bpp)->b_loffset == start_loffset) {
915                         tbp = *bpp;
916                         *bpp = NULL;
917                         bpp = NULL;
918                 } else {
919                         tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
920                         if (tbp == NULL ||
921                             (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
922                              B_DELWRI ||
923                             (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
924                                 if (tbp)
925                                         BUF_UNLOCK(tbp);
926                                 start_loffset += blksize;
927                                 bytes -= blksize;
928                                 continue;
929                         }
930                         bremfree(tbp);
931                 }
932                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
933
934                 /*
935                  * Extra memory in the buffer, punt on this buffer.
936                  * XXX we could handle this in most cases, but we would
937                  * have to push the extra memory down to after our max
938                  * possible cluster size and then potentially pull it back
939                  * up if the cluster was terminated prematurely--too much
940                  * hassle.
941                  */
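                /*
                 * Also punt when the buffer is not marked clusterable, when
                 * only a single block's worth of bytes remains, or when no
                 * pbuf KVA can be allocated; a plain bawrite() is used
                 * instead.
                 */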
942                 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
943                     (tbp->b_bcount != tbp->b_bufsize) ||
944                     (tbp->b_bcount != blksize) ||
945                     (bytes == blksize) ||
946                     ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
947                         totalwritten += tbp->b_bufsize;
948                         bawrite(tbp);
949                         start_loffset += blksize;
950                         bytes -= blksize;
951                         continue;
952                 }
953
954                 /*
955                  * Set up the pbuf.  Track our append point with b_bcount
956                  * and b_bufsize.  b_bufsize is not used by the device but
957                  * our caller uses it to loop clusters and we use it to
958                  * detect a premature EOF on the block device.
959                  */
960                 bp->b_bcount = 0;
961                 bp->b_bufsize = 0;
962                 bp->b_xio.xio_npages = 0;
963                 bp->b_loffset = tbp->b_loffset;
964                 bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
965
966                 /*
967                  * We are synthesizing a buffer out of vm_page_t's, but
968                  * if the block size is not page aligned then the starting
969                  * address may not be either.  Inherit the b_data offset
970                  * from the original buffer.
971                  */
972                 bp->b_data = (char *)((vm_offset_t)bp->b_data |
973                     ((vm_offset_t)tbp->b_data & PAGE_MASK));
974                 bp->b_flags &= ~B_ERROR;
975                 bp->b_flags |= B_CLUSTER | B_BNOCLIP |
976                         (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
977                 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
978                 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
979
980                 /*
981                  * From this location in the file, scan forward to see
982                  * if there are buffers with adjacent data that need to
983                  * be written as well.
984                  *
985                  * IO *must* be initiated on index 0 at this point
986                  * (particularly when called from cluster_awrite()).
987                  */
988                 for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
989                         if (i == 0) {
990                                 must_initiate = 1;
991                         } else {
992                                 /*
993                                  * Not first buffer.
994                                  */
995                                 must_initiate = 0;
996                                 tbp = findblk(vp, start_loffset,
997                                               FINDBLK_NBLOCK);
998                                 /*
999                                  * Buffer not found or could not be locked
1000                                  * non-blocking.
1001                                  */
1002                                 if (tbp == NULL)
1003                                         break;
1004
1005                                 /*
1006                                  * If it IS in core, but has different
1007                                  * characteristics, then don't cluster
1008                                  * with it.
1009                                  */
1010                                 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
1011                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
1012                                     != (B_DELWRI | B_CLUSTEROK |
1013                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
1014                                     (tbp->b_flags & B_LOCKED)
1015                                 ) {
1016                                         BUF_UNLOCK(tbp);
1017                                         break;
1018                                 }
1019
1020                                 /*
1021                                  * Check that the combined cluster
1022                                  * would make sense with regard to pages
1023                                  * and would not be too large
1024                                  *
1025                                  * WARNING! buf_checkwrite() must be the last
1026                                  *          check made.  If it returns 0 then
1027                                  *          we must initiate the I/O.
1028                                  */
1029                                 if ((tbp->b_bcount != blksize) ||
1030                                   ((bp->b_bio2.bio_offset + i) !=
1031                                     tbp->b_bio2.bio_offset) ||
1032                                   ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
1033                                     (maxiosize / PAGE_SIZE)) ||
1034                                   (LIST_FIRST(&tbp->b_dep) &&
1035                                    buf_checkwrite(tbp))
1036                                 ) {
1037                                         BUF_UNLOCK(tbp);
1038                                         break;
1039                                 }
1040                                 if (LIST_FIRST(&tbp->b_dep))
1041                                         must_initiate = 1;
1042                                 /*
1043                                  * Ok, it's passed all the tests,
1044                                  * so remove it from the free list
1045                                  * and mark it busy. We will use it.
1046                                  */
1047                                 bremfree(tbp);
1048                                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
1049                         }
1050
1051                         /*
1052                          * If the IO is via the VM then we do some
1053                          * special VM hackery (yuck).  Since the buffer's
1054                          * block size may not be page-aligned it is possible
1055                          * for a page to be shared between two buffers.  We
1056                          * have to get rid of the duplication when building
1057                          * the cluster.
1058                          */
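                        /*
                         * e.g. with 6KB buffers the tail of one buffer and
                         * the head of the next can share a page; the check
                         * against the last xio_pages[] entry below skips the
                         * duplicate.
                         */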
1059                         if (tbp->b_flags & B_VMIO) {
1060                                 vm_page_t m;
1061
1062                                 /*
1063                                  * Try to avoid deadlocks with the VM system.
1064                                  * However, we cannot abort the I/O if
1065                                  * must_initiate is non-zero.
1066                                  */
1067                                 if (must_initiate == 0) {
1068                                         for (j = 0;
1069                                              j < tbp->b_xio.xio_npages;
1070                                              ++j) {
1071                                                 m = tbp->b_xio.xio_pages[j];
1072                                                 if (m->flags & PG_BUSY) {
1073                                                         bqrelse(tbp);
1074                                                         goto finishcluster;
1075                                                 }
1076                                         }
1077                                 }
1078                                         
1079                                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
1080                                         m = tbp->b_xio.xio_pages[j];
1081                                         vm_page_busy_wait(m, FALSE, "clurpg");
1082                                         vm_page_io_start(m);
1083                                         vm_page_wakeup(m);
1084                                         vm_object_pip_add(m->object, 1);
1085                                         if ((bp->b_xio.xio_npages == 0) ||
1086                                           (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
1087                                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
1088                                                 bp->b_xio.xio_npages++;
1089                                         }
1090                                 }
1091                         }
1092                         bp->b_bcount += blksize;
1093                         bp->b_bufsize += blksize;
1094
1095                         bundirty(tbp);
1096                         tbp->b_flags &= ~B_ERROR;
1097                         tbp->b_cmd = BUF_CMD_WRITE;
1098                         BUF_KERNPROC(tbp);
1099                         cluster_append(&bp->b_bio1, tbp);
1100
1101                         /*
1102                          * check for latent dependencies to be handled 
1103                          */
1104                         if (LIST_FIRST(&tbp->b_dep) != NULL)
1105                                 buf_start(tbp);
1106                 }
1107         finishcluster:
1108                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
1109                             (vm_page_t *)bp->b_xio.xio_pages,
1110                             bp->b_xio.xio_npages);
1111                 if (bp->b_bufsize > bp->b_kvasize) {
1112                         panic("cluster_wbuild: b_bufsize(%d) "
1113                               "> b_kvasize(%d)\n",
1114                               bp->b_bufsize, bp->b_kvasize);
1115                 }
1116                 totalwritten += bp->b_bufsize;
1117                 bp->b_dirtyoff = 0;
1118                 bp->b_dirtyend = bp->b_bufsize;
1119                 bp->b_bio1.bio_done = cluster_callback;
1120                 bp->b_cmd = BUF_CMD_WRITE;
1121
1122                 vfs_busy_pages(vp, bp);
1123                 bsetrunningbufspace(bp, bp->b_bufsize);
1124                 BUF_KERNPROC(bp);
1125                 vn_strategy(vp, &bp->b_bio1);
1126
1127                 bytes -= i;
1128         }
1129         return totalwritten;
1130 }
1131
1132 /*
1133  * Collect together all the buffers in a cluster, plus add one
1134  * additional buffer passed-in.
1135  *
1136  * Only pre-existing buffers whose block size matches blksize are collected.
1137  * (this is primarily because HAMMER1 uses varying block sizes and we don't
1138  * want to override its choices).
1139  */
1140 static struct cluster_save *
1141 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
1142 {
1143         struct cluster_save *buflist;
1144         struct buf *bp;
1145         off_t loffset;
1146         int i, len;
1147         int j;
1148         int k;
1149
1150         len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
1151         buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1152                          M_SEGMENT, M_WAITOK);
1153         buflist->bs_nchildren = 0;
1154         buflist->bs_children = (struct buf **) (buflist + 1);
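        /*
         * i walks the candidate blocks, j records the index just past the
         * last block for which no buffer could be collected (a gap), and k
         * is used below to release the pre-gap buffers.
         */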
1155         for (loffset = vp->v_cstart, i = 0, j = 0;
1156              i < len;
1157              (loffset += blksize), i++) {
1158                 bp = getcacheblk(vp, loffset,
1159                                  last_bp->b_bcount, GETBLK_SZMATCH);
1160                 buflist->bs_children[i] = bp;
1161                 if (bp == NULL) {
1162                         j = i + 1;
1163                 } else if (bp->b_bio2.bio_offset == NOOFFSET) {
1164                         VOP_BMAP(bp->b_vp, bp->b_loffset,
1165                                  &bp->b_bio2.bio_offset,
1166                                  NULL, NULL, BUF_CMD_WRITE);
1167                 }
1168         }
1169
1170         /*
1171          * Get rid of gaps: keep only the contiguous run ending at last_bp
1172          */
1173         for (k = 0; k < j; ++k) {
1174                 if (buflist->bs_children[k]) {
1175                         bqrelse(buflist->bs_children[k]);
1176                         buflist->bs_children[k] = NULL;
1177                 }
1178         }
1179         if (j != 0) {
1180                 if (j != i) {
1181                         bcopy(buflist->bs_children + j,
1182                               buflist->bs_children + 0,
1183                               sizeof(buflist->bs_children[0]) * (i - j));
1184                 }
1185                 i -= j;
1186         }
1187         buflist->bs_children[i] = bp = last_bp;
1188         if (bp->b_bio2.bio_offset == NOOFFSET) {
1189                 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1190                          NULL, NULL, BUF_CMD_WRITE);
1191         }
1192         buflist->bs_nchildren = i + 1;
1193         return (buflist);
1194 }
1195
1196 void
1197 cluster_append(struct bio *bio, struct buf *tbp)
1198 {
1199         tbp->b_cluster_next = NULL;
1200         if (bio->bio_caller_info1.cluster_head == NULL) {
1201                 bio->bio_caller_info1.cluster_head = tbp;
1202                 bio->bio_caller_info2.cluster_tail = tbp;
1203         } else {
1204                 bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
1205                 bio->bio_caller_info2.cluster_tail = tbp;
1206         }
1207 }
1208
1209 static
1210 void
1211 cluster_setram (struct buf *bp)
1212 {
1213         bp->b_flags |= B_RAM;
1214         if (bp->b_xio.xio_npages)
1215                 vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
1216 }