1 /*-
2  * Copyright (c) 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  *      Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *      @(#)vfs_cluster.c       8.7 (Berkeley) 2/13/94
32  * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
33  * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
34  */
35
36 #include "opt_debug_cluster.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/proc.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/malloc.h>
45 #include <sys/mount.h>
46 #include <sys/resourcevar.h>
47 #include <sys/vmmeter.h>
48 #include <vm/vm.h>
49 #include <vm/vm_object.h>
50 #include <vm/vm_page.h>
51 #include <sys/sysctl.h>
52
53 #include <sys/buf2.h>
54 #include <vm/vm_page2.h>
55
56 #include <machine/limits.h>
57
58 #if defined(CLUSTERDEBUG)
59 #include <sys/sysctl.h>
60 static int      rcluster = 0;
61 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
62 #endif
63
64 static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
65
66 static struct cluster_save *
67         cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
68                             int blksize);
69 static struct buf *
70         cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
71                             off_t doffset, int blksize, int run, 
72                             struct buf *fbp);
73 static void cluster_callback (struct bio *);
74 static void cluster_setram (struct buf *);
75 static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
76                             off_t start_loffset, int bytes);
77
78 static int write_behind = 1;
79 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
80     "Cluster write-behind setting");
81 static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
82 SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
83     &write_behind_minfilesize, 0, "Cluster write-behind minimum file size");
84 static int max_readahead = 2 * 1024 * 1024;
85 SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
86     "Limit in bytes for desired cluster read-ahead");
87
88 extern vm_page_t        bogus_page;
89
90 extern int cluster_pbuf_freecnt;
91
92 /*
93  * This replaces bread.
94  *
95  * filesize     - read-ahead @ blksize will not cross this boundary
96  * loffset      - loffset for returned *bpp
97  * blksize      - blocksize for returned *bpp and read-ahead bps
98  * minreq       - minimum (not a hard minimum) in bytes, typically reflects
99  *                a higher level uio resid.
100  * maxreq       - maximum (sequential heuristic) in bytes (highest typ ~2MB)
101  * bpp          - return buffer (*bpp) for (loffset,blksize)
102  */
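/*
 * Example: a filesystem reading 16KB blocks from a 1MB file would pass
 * filesize = 1MB (the boundary read-ahead must not cross), blksize = 16KB,
 * minreq as the caller's remaining uio resid, and maxreq from its sequential
 * access heuristic; maxreq is further capped by vfs.max_readahead.
 */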
103 int
104 cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
105              int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
106 {
107         struct buf *bp, *rbp, *reqbp;
108         off_t origoffset;
109         off_t doffset;
110         int error;
111         int i;
112         int maxra;
113         int maxrbuild;
114
115         error = 0;
116
117         /*
118          * Calculate the desired read-ahead in blksize'd blocks (maxra).
119          * To do this we calculate maxreq.
120          *
121          * maxreq typically starts out as a sequential heuristic.  If the
122          * high level uio/resid is bigger (minreq), we pop maxreq up to
123  * minreq.  This covers the case where userland is performing random
124  * I/O using big read()'s.
125          *
126          * Then we limit maxreq to max_readahead to ensure it is a reasonable
127          * value.
128          *
129          * Finally we must ensure that (loffset + maxreq) does not cross the
130          * boundary (filesize) for the current blocksize.  If we allowed it
131          * to cross we could end up with buffers past the boundary with the
132          * wrong block size (HAMMER large-data areas use mixed block sizes).
133          * minreq is also absolutely limited to filesize.
134          */
135         if (maxreq < minreq)
136                 maxreq = minreq;
137         /* minreq not used beyond this point */
138
139         if (maxreq > max_readahead) {
140                 maxreq = max_readahead;
141                 if (maxreq > 16 * 1024 * 1024)
142                         maxreq = 16 * 1024 * 1024;
143         }
144         if (maxreq < blksize)
145                 maxreq = blksize;
146         if (loffset + maxreq > filesize) {
147                 if (loffset > filesize)
148                         maxreq = 0;
149                 else
150                         maxreq = filesize - loffset;
151         }
152
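        /*
         * e.g. with the default vfs.max_readahead of 2MB and a 16KB blksize,
         * maxra computes to at most 128 read-ahead blocks.
         */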
153         maxra = (int)(maxreq / blksize);
154
155         /*
156          * Get the requested block.
157          */
158         if (*bpp)
159                 reqbp = bp = *bpp;
160         else
161                 *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
162         origoffset = loffset;
163
164         /*
165          * Calculate the maximum cluster size for a single I/O, used
166          * by cluster_rbuild().
167          */
168         maxrbuild = vmaxiosize(vp) / blksize;
169
170         /*
171          * if it is in the cache, then check to see if the reads have been
172          * sequential.  If they have, then try some read-ahead, otherwise
173          * back-off on prospective read-aheads.
174          */
175         if (bp->b_flags & B_CACHE) {
176                 /*
177                  * Not sequential, do not do any read-ahead
178                  */
179                 if (maxra <= 1)
180                         return 0;
181
182                 /*
183                  * No read-ahead mark, do not do any read-ahead
184                  * yet.
185                  */
186                 if ((bp->b_flags & B_RAM) == 0)
187                         return 0;
188
189                 /*
190                  * We hit a read-ahead-mark, figure out how much read-ahead
191                  * to do (maxra) and where to start (loffset).
192                  *
193                  * Shortcut the scan.  Typically the way this works is that
194                  * we've built up all the blocks in between except for the
195                  * last in previous iterations, so if the second-to-last
196                  * block is present we just skip ahead to it.
197                  *
198                  * This algorithm has O(1) cpu in the steady state no
199                  * matter how large maxra is.
200                  */
201                 bp->b_flags &= ~B_RAM;
202
203                 if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
204                         i = maxra - 1;
205                 else
206                         i = 1;
207                 while (i < maxra) {
208                         if (findblk(vp, loffset + i * blksize,
209                                     FINDBLK_TEST) == NULL) {
210                                 break;
211                         }
212                         ++i;
213                 }
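                /*
                 * e.g. if maxra is 32 and the previous pass instantiated
                 * blocks 1..30, the findblk() test at (maxra - 2) above lets
                 * the scan start at i = 31 instead of walking all 32 blocks.
                 */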
214
215                 /*
216                  * We got everything or everything is in the cache, no
217                  * point continuing.
218                  */
219                 if (i >= maxra)
220                         return 0;
221
222                 /*
223                  * Calculate where to start the read-ahead and how much
224                  * to do.  Generally speaking we want to read-ahead by
225                  * (maxra) when we've found a read-ahead mark.  We do
226                  * not want to reduce maxra here as it will cause
227                  * successive read-ahead I/O's to be smaller and smaller.
228                  *
229                  * However, we have to make sure we don't break the
230                  * filesize limitation for the clustered operation.
231                  */
232                 loffset += i * blksize;
233                 reqbp = bp = NULL;
234
235                 if (loffset >= filesize)
236                         return 0;
237                 if (loffset + maxra * blksize > filesize) {
238                         maxreq = filesize - loffset;
239                         maxra = (int)(maxreq / blksize);
240                 }
241         } else {
242                 __debugvar off_t firstread = bp->b_loffset;
243                 int nblks;
244
245                 /*
246                  * Set-up synchronous read for bp.
247                  */
248                 bp->b_cmd = BUF_CMD_READ;
249                 bp->b_bio1.bio_done = biodone_sync;
250                 bp->b_bio1.bio_flags |= BIO_SYNC;
251
252                 KASSERT(firstread != NOOFFSET, 
253                         ("cluster_read: no buffer offset"));
254
255                 /*
256                  * nblks is our cluster_rbuild request size, limited
257                  * primarily by the device.
258                  */
259                 if ((nblks = maxra) > maxrbuild)
260                         nblks = maxrbuild;
261
262                 if (nblks > 1) {
263                         int burstbytes;
264
265                         error = VOP_BMAP(vp, loffset, &doffset,
266                                          &burstbytes, NULL, BUF_CMD_READ);
267                         if (error)
268                                 goto single_block_read;
269                         if (nblks > burstbytes / blksize)
270                                 nblks = burstbytes / blksize;
271                         if (doffset == NOOFFSET)
272                                 goto single_block_read;
273                         if (nblks <= 1)
274                                 goto single_block_read;
275
276                         bp = cluster_rbuild(vp, filesize, loffset,
277                                             doffset, blksize, nblks, bp);
278                         loffset += bp->b_bufsize;
279                         maxra -= bp->b_bufsize / blksize;
280                 } else {
281 single_block_read:
282                         /*
283                          * If it isn't in the cache, then get a chunk from
284                          * disk if sequential, otherwise just get the block.
285                          */
286                         cluster_setram(bp);
287                         loffset += blksize;
288                         --maxra;
289                 }
290         }
291
292         /*
293          * If B_CACHE was not set issue bp.  bp will either be an
294          * asynchronous cluster buf or a synchronous single-buf.
295          * If it is a single buf it will be the same as reqbp.
296          *
297          * NOTE: Once an async cluster buf is issued bp becomes invalid.
298          */
299         if (bp) {
300 #if defined(CLUSTERDEBUG)
301                 if (rcluster)
302                         kprintf("S(%012jx,%d,%d)\n",
303                             (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
304 #endif
305                 if ((bp->b_flags & B_CLUSTER) == 0)
306                         vfs_busy_pages(vp, bp);
307                 bp->b_flags &= ~(B_ERROR|B_INVAL);
308                 vn_strategy(vp, &bp->b_bio1);
309                 error = 0;
310                 /* bp invalid now */
311         }
312
313         /*
314          * If we have been doing sequential I/O, then do some read-ahead.
315          * The code above us should have positioned us at the next likely
316          * offset.
317          *
318          * Only mess with buffers which we can immediately lock.  HAMMER
319          * will do device-readahead irrespective of what the blocks
320          * represent.
321          */
322         while (error == 0 && maxra > 0) {
323                 int burstbytes;
324                 int tmp_error;
325                 int nblks;
326
327                 rbp = getblk(vp, loffset, blksize,
328                              GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
329                 if (rbp == NULL)
330                         goto no_read_ahead;
331                 if ((rbp->b_flags & B_CACHE)) {
332                         bqrelse(rbp);
333                         goto no_read_ahead;
334                 }
335
336                 /*
337                  * An error from the read-ahead bmap has nothing to do
338                  * with the caller's original request.
339                  */
340                 tmp_error = VOP_BMAP(vp, loffset, &doffset,
341                                      &burstbytes, NULL, BUF_CMD_READ);
342                 if (tmp_error || doffset == NOOFFSET) {
343                         rbp->b_flags |= B_INVAL;
344                         brelse(rbp);
345                         rbp = NULL;
346                         goto no_read_ahead;
347                 }
348                 if ((nblks = maxra) > maxrbuild)
349                         nblks = maxrbuild;
350                 if (nblks > burstbytes / blksize)
351                         nblks = burstbytes / blksize;
352
353                 /*
354                  * rbp: async read
355                  */
356                 rbp->b_cmd = BUF_CMD_READ;
357                 /*rbp->b_flags |= B_AGE*/;
358                 cluster_setram(rbp);
359
360                 if (nblks > 1) {
361                         rbp = cluster_rbuild(vp, filesize, loffset,
362                                              doffset, blksize, 
363                                              nblks, rbp);
364                 } else {
365                         rbp->b_bio2.bio_offset = doffset;
366                 }
367
368 #if defined(CLUSTERDEBUG)
369                 if (rcluster) {
370                         if (bp) {
371                                 kprintf("A+(%012jx,%d,%jd) "
372                                         "doff=%012jx minr=%zd ra=%d\n",
373                                     (intmax_t)loffset, rbp->b_bcount,
374                                     (intmax_t)(loffset - origoffset),
375                                     (intmax_t)doffset, minreq, maxra);
376                         } else {
377                                 kprintf("A-(%012jx,%d,%jd) "
378                                         "doff=%012jx minr=%zd ra=%d\n",
379                                     (intmax_t)rbp->b_loffset, rbp->b_bcount,
380                                     (intmax_t)(loffset - origoffset),
381                                     (intmax_t)doffset, minreq, maxra);
382                         }
383                 }
384 #endif
385                 rbp->b_flags &= ~(B_ERROR|B_INVAL);
386
387                 if ((rbp->b_flags & B_CLUSTER) == 0)
388                         vfs_busy_pages(vp, rbp);
389                 BUF_KERNPROC(rbp);
390                 loffset += rbp->b_bufsize;
391                 maxra -= rbp->b_bufsize / blksize;
392                 vn_strategy(vp, &rbp->b_bio1);
393                 /* rbp invalid now */
394         }
395
396         /*
397          * Wait for our original buffer to complete its I/O.  reqbp will
398          * be NULL if the original buffer was B_CACHE.  We are returning
399          * (*bpp) which is the same as reqbp when reqbp != NULL.
400          */
401 no_read_ahead:
402         if (reqbp) {
403                 KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
404                 error = biowait(&reqbp->b_bio1, "clurd");
405         }
406         return (error);
407 }
408
409 /*
410  * If blocks are contiguous on disk, use this to provide clustered
411  * read ahead.  We will read as many blocks as possible sequentially
412  * and then parcel them up into logical blocks in the buffer hash table.
413  *
414  * This function either returns a cluster buf or it returns fbp.  fbp is
415  * already expected to be set up as a synchronous or asynchronous request.
416  *
417  * If a cluster buf is returned it will always be async.
418  */
419 static struct buf *
420 cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
421                int blksize, int run, struct buf *fbp)
422 {
423         struct buf *bp, *tbp;
424         off_t boffset;
425         int i, j;
426         int maxiosize = vmaxiosize(vp);
427
428         /*
429          * avoid a division
430          */
431         while (loffset + run * blksize > filesize) {
432                 --run;
433         }
434
435         tbp = fbp;
436         tbp->b_bio2.bio_offset = doffset;
437         if ((tbp->b_flags & B_MALLOC) ||
438             ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
439                 return tbp;
440         }
441
442         bp = trypbuf_kva(&cluster_pbuf_freecnt);
443         if (bp == NULL) {
444                 return tbp;
445         }
446
447         /*
448          * We are synthesizing a buffer out of vm_page_t's, but
449          * if the block size is not page aligned then the starting
450          * address may not be either.  Inherit the b_data offset
451          * from the original buffer.
452          */
453         bp->b_data = (char *)((vm_offset_t)bp->b_data |
454             ((vm_offset_t)tbp->b_data & PAGE_MASK));
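        /*
         * e.g. if tbp->b_data begins 2KB into its first page, bp->b_data is
         * given the same 2KB page offset so byte 0 of the cluster maps to
         * byte 0 of the original buffer once the pages are entered below.
         */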
455         bp->b_flags |= B_CLUSTER | B_VMIO;
456         bp->b_cmd = BUF_CMD_READ;
457         bp->b_bio1.bio_done = cluster_callback;         /* default to async */
458         bp->b_bio1.bio_caller_info1.cluster_head = NULL;
459         bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
460         bp->b_loffset = loffset;
461         bp->b_bio2.bio_offset = doffset;
462         KASSERT(bp->b_loffset != NOOFFSET,
463                 ("cluster_rbuild: no buffer offset"));
464
465         bp->b_bcount = 0;
466         bp->b_bufsize = 0;
467         bp->b_xio.xio_npages = 0;
468
469         for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
470                 if (i) {
471                         if ((bp->b_xio.xio_npages * PAGE_SIZE) +
472                             round_page(blksize) > maxiosize) {
473                                 break;
474                         }
475
476                         /*
477                          * Shortcut some checks and try to avoid buffers that
478                          * would block in the lock.  The same checks have to
479                          * be made again after we officially get the buffer.
480                          */
481                         tbp = getblk(vp, loffset + i * blksize, blksize,
482                                      GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
483                         if (tbp == NULL)
484                                 break;
485                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
486                                 if (tbp->b_xio.xio_pages[j]->valid)
487                                         break;
488                         }
489                         if (j != tbp->b_xio.xio_npages) {
490                                 bqrelse(tbp);
491                                 break;
492                         }
493
494                         /*
495                          * Stop scanning if the buffer is fully valid
496                          * (marked B_CACHE), or locked (may be doing a
497                          * background write), or if the buffer is not
498                          * VMIO backed.  The clustering code can only deal
499                          * with VMIO-backed buffers.
500                          */
501                         if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
502                             (tbp->b_flags & B_VMIO) == 0 ||
503                             (LIST_FIRST(&tbp->b_dep) != NULL &&
504                              buf_checkread(tbp))
505                         ) {
506                                 bqrelse(tbp);
507                                 break;
508                         }
509
510                         /*
511                          * The buffer must be completely invalid in order to
512                          * take part in the cluster.  If it is partially valid
513                          * then we stop.
514                          */
515                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
516                                 if (tbp->b_xio.xio_pages[j]->valid)
517                                         break;
518                         }
519                         if (j != tbp->b_xio.xio_npages) {
520                                 bqrelse(tbp);
521                                 break;
522                         }
523
524                         /*
525                          * Set a read-ahead mark as appropriate.  Always
526                          * set the read-ahead mark at (run - 1).  It is
527                          * unclear why we were also setting it at i == 1.
528                          */
529                         if (/*i == 1 ||*/ i == (run - 1))
530                                 cluster_setram(tbp);
531
532                         /*
533                          * Depress the priority of buffers not explicitly
534                          * requested.
535                          */
536                         /* tbp->b_flags |= B_AGE; */
537
538                         /*
539                          * Set the block number if it isn't set; if it is
540                          * already set, make sure it matches the block
541                          * number we expect.
542                          */
543                         if (tbp->b_bio2.bio_offset == NOOFFSET) {
544                                 tbp->b_bio2.bio_offset = boffset;
545                         } else if (tbp->b_bio2.bio_offset != boffset) {
546                                 brelse(tbp);
547                                 break;
548                         }
549                 }
550
551                 /*
552                  * The passed-in tbp (i == 0) will already be set up for
553                  * async or sync operation.  All other tbp's acquired in
554                  * our loop are set up for async operation.
555                  */
556                 tbp->b_cmd = BUF_CMD_READ;
557                 BUF_KERNPROC(tbp);
558                 cluster_append(&bp->b_bio1, tbp);
559                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
560                         vm_page_t m;
561
562                         m = tbp->b_xio.xio_pages[j];
563                         vm_page_busy_wait(m, FALSE, "clurpg");
564                         vm_page_io_start(m);
565                         vm_page_wakeup(m);
566                         vm_object_pip_add(m->object, 1);
567                         if ((bp->b_xio.xio_npages == 0) ||
568                                 (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
569                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
570                                 bp->b_xio.xio_npages++;
571                         }
572                         if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
573                                 tbp->b_xio.xio_pages[j] = bogus_page;
574                 }
575                 /*
576                  * XXX shouldn't this be += size for both, like in 
577                  * cluster_wbuild()?
578                  *
579                  * Don't inherit tbp->b_bufsize as it may be larger due to
580                  * a non-page-aligned size.  Instead just aggregate using
581                  * 'size'.
582                  */
583                 if (tbp->b_bcount != blksize)
584                     kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
585                 if (tbp->b_bufsize != blksize)
586                     kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
587                 bp->b_bcount += blksize;
588                 bp->b_bufsize += blksize;
589         }
590
591         /*
592          * Fully valid pages in the cluster are already good and do not need
593          * to be re-read from disk.  Replace the page with bogus_page
594          */
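        /*
         * The device transfer still covers the full range; bytes destined
         * for an already-valid page land in bogus_page and are discarded,
         * leaving the cached copy untouched.
         */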
595         for (j = 0; j < bp->b_xio.xio_npages; j++) {
596                 if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
597                     VM_PAGE_BITS_ALL) {
598                         bp->b_xio.xio_pages[j] = bogus_page;
599                 }
600         }
601         if (bp->b_bufsize > bp->b_kvasize) {
602                 panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
603                     bp->b_bufsize, bp->b_kvasize);
604         }
605         pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
606                 (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
607         BUF_KERNPROC(bp);
608         return (bp);
609 }
610
611 /*
612  * Cleanup after a clustered read or write.
613  * This is complicated by the fact that any of the buffers might have
614  * extra memory (if there were no empty buffer headers at allocbuf time)
615  * that we will need to shift around.
616  *
617  * The returned bio is &bp->b_bio1
618  */
619 void
620 cluster_callback(struct bio *bio)
621 {
622         struct buf *bp = bio->bio_buf;
623         struct buf *tbp;
624         int error = 0;
625
626         /*
627          * Must propagate errors to all the components.  A short read (EOF)
628          * is a critical error.
629          */
630         if (bp->b_flags & B_ERROR) {
631                 error = bp->b_error;
632         } else if (bp->b_bcount != bp->b_bufsize) {
633                 panic("cluster_callback: unexpected EOF on cluster %p!", bio);
634         }
635
636         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
637         /*
638          * Move memory from the large cluster buffer into the component
639          * buffers and mark IO as done on these.  Since the memory map
640          * is the same, no actual copying is required.
641          */
642         while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
643                 bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
644                 if (error) {
645                         tbp->b_flags |= B_ERROR | B_IODEBUG;
646                         tbp->b_error = error;
647                 } else {
648                         tbp->b_dirtyoff = tbp->b_dirtyend = 0;
649                         tbp->b_flags &= ~(B_ERROR|B_INVAL);
650                         tbp->b_flags |= B_IODEBUG;
651                         /*
652                          * XXX the bdwrite()/bqrelse() issued during
653                          * cluster building clears B_RELBUF (see bqrelse()
654                          * comment).  If direct I/O was specified, we have
655                          * to restore it here to allow the buffer and VM
656                          * to be freed.
657                          */
658                         if (tbp->b_flags & B_DIRECT)
659                                 tbp->b_flags |= B_RELBUF;
660                 }
661                 biodone(&tbp->b_bio1);
662         }
663         relpbuf(bp, &cluster_pbuf_freecnt);
664 }
665
666 /*
667  * Implement modified write build for cluster.
668  *
669  *      write_behind = 0        write behind disabled
670  *      write_behind = 1        write behind normal (default)
671  *      write_behind = 2        write behind backed-off
672  *
673  * In addition, write_behind is only activated for files that have
674  * grown past a certain size (default 10MB).  Otherwise temporary files
675  * wind up generating a lot of unnecessary disk I/O.
676  */
677 static __inline int
678 cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
679 {
680         int r = 0;
681
682         switch (write_behind) {
683         case 2:
684                 if (start_loffset < len)
685                         break;
686                 start_loffset -= len;
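                /* trail the writer by one cluster length before pushing */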
687                 /* fall through */
688         case 1:
689                 if (vp->v_filesize >= write_behind_minfilesize) {
690                         r = cluster_wbuild(vp, NULL, blksize,
691                                            start_loffset, len);
692                 }
693                 /* fall through */
694         default:
695                 /* fall through */
696                 break;
697         }
698         return(r);
699 }
700
701 /*
702  * Do clustered write for FFS.
703  *
704  * Four cases:
705  *      1. Write is not sequential (write asynchronously)
706  *      Write is sequential:
707  *      2.      beginning of cluster - begin cluster
708  *      3.      middle of a cluster - add to cluster
709  *      4.      end of a cluster - asynchronously write cluster
710  */
711 void
712 cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
713 {
714         struct vnode *vp;
715         off_t loffset;
716         int maxclen, cursize;
717         int async;
718
719         vp = bp->b_vp;
720         if (vp->v_type == VREG)
721                 async = vp->v_mount->mnt_flag & MNT_ASYNC;
722         else
723                 async = 0;
724         loffset = bp->b_loffset;
725         KASSERT(bp->b_loffset != NOOFFSET, 
726                 ("cluster_write: no buffer offset"));
727
728         /* Initialize vnode to beginning of file. */
729         if (loffset == 0)
730                 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
731
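        /*
         * v_cstart and v_clen track the logical start and maximum extent of
         * the cluster being accumulated; v_lastw and v_lasta remember the
         * last logical and physical offsets written, which is how sequential
         * writes are detected below.
         */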
732         if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
733             bp->b_bio2.bio_offset == NOOFFSET ||
734             (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
735                 maxclen = vmaxiosize(vp);
736                 if (vp->v_clen != 0) {
737                         /*
738                          * Next block is not sequential.
739                          *
740                          * If we are not writing at end of file, the process
741                          * seeked to another point in the file since its last
742                          * write, or we have reached our maximum cluster size,
743                          * then push the previous cluster. Otherwise try
744                          * reallocating to make it sequential.
745                          *
746                          * Change to algorithm: only push previous cluster if
747                          * it was sequential from the point of view of the
748                          * seqcount heuristic, otherwise leave the buffer 
749                          * intact so we can potentially optimize the I/O
750                          * later on in the buf_daemon or update daemon
751                          * flush.
752                          */
753                         cursize = vp->v_lastw - vp->v_cstart + blksize;
754                         if (bp->b_loffset + blksize < filesize ||
755                             loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
756                                 if (!async && seqcount > 0) {
757                                         cluster_wbuild_wb(vp, blksize,
758                                                 vp->v_cstart, cursize);
759                                 }
760                         } else {
761                                 struct buf **bpp, **endbp;
762                                 struct cluster_save *buflist;
763
764                                 buflist = cluster_collectbufs(vp, bp, blksize);
765                                 endbp = &buflist->bs_children
766                                     [buflist->bs_nchildren - 1];
767                                 if (VOP_REALLOCBLKS(vp, buflist)) {
768                                         /*
769                                          * Failed, push the previous cluster
770                                          * if *really* writing sequentially
771                                          * in the logical file (seqcount > 1),
772                                          * otherwise delay it in the hopes that
773                                          * the low level disk driver can
774                                          * optimize the write ordering.
775                                          */
776                                         for (bpp = buflist->bs_children;
777                                              bpp < endbp; bpp++)
778                                                 brelse(*bpp);
779                                         kfree(buflist, M_SEGMENT);
780                                         if (seqcount > 1) {
781                                                 cluster_wbuild_wb(vp, 
782                                                     blksize, vp->v_cstart, 
783                                                     cursize);
784                                         }
785                                 } else {
786                                         /*
787                                          * Succeeded, keep building cluster.
788                                          */
789                                         for (bpp = buflist->bs_children;
790                                              bpp <= endbp; bpp++)
791                                                 bdwrite(*bpp);
792                                         kfree(buflist, M_SEGMENT);
793                                         vp->v_lastw = loffset;
794                                         vp->v_lasta = bp->b_bio2.bio_offset;
795                                         return;
796                                 }
797                         }
798                 }
799                 /*
800                  * Consider beginning a cluster. If at end of file, make
801                  * cluster as large as possible, otherwise find size of
802                  * existing cluster.
803                  */
804                 if ((vp->v_type == VREG) &&
805                     bp->b_loffset + blksize < filesize &&
806                     (bp->b_bio2.bio_offset == NOOFFSET) &&
807                     (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
808                      bp->b_bio2.bio_offset == NOOFFSET)) {
809                         bdwrite(bp);
810                         vp->v_clen = 0;
811                         vp->v_lasta = bp->b_bio2.bio_offset;
812                         vp->v_cstart = loffset + blksize;
813                         vp->v_lastw = loffset;
814                         return;
815                 }
816                 if (maxclen > blksize)
817                         vp->v_clen = maxclen - blksize;
818                 else
819                         vp->v_clen = 0;
820                 if (!async && vp->v_clen == 0) { /* I/O not contiguous */
821                         vp->v_cstart = loffset + blksize;
822                         bdwrite(bp);
823                 } else {        /* Wait for rest of cluster */
824                         vp->v_cstart = loffset;
825                         bdwrite(bp);
826                 }
827         } else if (loffset == vp->v_cstart + vp->v_clen) {
828                 /*
829                  * At end of cluster, write it out if seqcount tells us we
830                  * are operating sequentially, otherwise let the buf or
831                  * update daemon handle it.
832                  */
833                 bdwrite(bp);
834                 if (seqcount > 1)
835                         cluster_wbuild_wb(vp, blksize, vp->v_cstart,
836                                           vp->v_clen + blksize);
837                 vp->v_clen = 0;
838                 vp->v_cstart = loffset + blksize;
839         } else if (vm_page_count_severe() &&
840                    bp->b_loffset + blksize < filesize) {
841                 /*
842                  * We are low on memory, get it going NOW.  However, do not
843                  * try to push out a partial block at the end of the file
844                  * as this could lead to extremely non-optimal write activity.
845                  */
846                 bawrite(bp);
847         } else {
848                 /*
849                  * In the middle of a cluster, so just delay the I/O for now.
850                  */
851                 bdwrite(bp);
852         }
853         vp->v_lastw = loffset;
854         vp->v_lasta = bp->b_bio2.bio_offset;
855 }
856
857 /*
858  * This is the clustered version of bawrite().  It works similarly to
859  * cluster_write() except I/O on the buffer is guaranteed to occur.
860  */
861 int
862 cluster_awrite(struct buf *bp)
863 {
864         int total;
865
866         /*
867          * Don't bother if it isn't clusterable.
868          */
869         if ((bp->b_flags & B_CLUSTEROK) == 0 ||
870             bp->b_vp == NULL ||
871             (bp->b_vp->v_flag & VOBJBUF) == 0) {
872                 total = bp->b_bufsize;
873                 bawrite(bp);
874                 return (total);
875         }
876
877         total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
878                                bp->b_loffset, vmaxiosize(bp->b_vp));
879         if (bp)
880                 bawrite(bp);
881
882         return total;
883 }
884
885 /*
886  * This is an awful lot like cluster_rbuild...wish they could be combined.
887  * Scan forward from start_loffset, gathering up to 'bytes' worth of
888  * compatible delayed-write buffers of size blksize and issuing them as
889  * one or more clustered writes.
890  *
891  * cluster_wbuild() normally does not guarantee anything.  If bpp is
892  * non-NULL and cluster_wbuild() is able to incorporate it into the
893  * I/O it will set *bpp to NULL, otherwise it will leave it alone and
894  * the caller must dispose of *bpp.
895  */
896 static int
897 cluster_wbuild(struct vnode *vp, struct buf **bpp,
898                int blksize, off_t start_loffset, int bytes)
899 {
900         struct buf *bp, *tbp;
901         int i, j;
902         int totalwritten = 0;
903         int must_initiate;
904         int maxiosize = vmaxiosize(vp);
905
906         while (bytes > 0) {
907                 /*
908                  * If the buffer matches the passed locked & removed buffer
909                  * we use the passed buffer (which might not be B_DELWRI).
910                  *
911                  * Otherwise locate the buffer and determine if it is
912                  * compatible.
913                  */
914                 if (bpp && (*bpp)->b_loffset == start_loffset) {
915                         tbp = *bpp;
916                         *bpp = NULL;
917                         bpp = NULL;
918                 } else {
919                         tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
920                         if (tbp == NULL ||
921                             (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
922                              B_DELWRI ||
923                             (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
924                                 if (tbp)
925                                         BUF_UNLOCK(tbp);
926                                 start_loffset += blksize;
927                                 bytes -= blksize;
928                                 continue;
929                         }
930                         bremfree(tbp);
931                 }
932                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
933
934                 /*
935                  * Extra memory in the buffer, punt on this buffer.
936                  * XXX we could handle this in most cases, but we would
937                  * have to push the extra memory down to after our max
938                  * possible cluster size and then potentially pull it back
939                  * up if the cluster was terminated prematurely--too much
940                  * hassle.
941                  */
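                /*
                 * Also punt when the buffer is not marked clusterable, when
                 * only a single block's worth of bytes remains, or when no
                 * pbuf KVA can be allocated; a plain bawrite() is used
                 * instead.
                 */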
942                 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
943                     (tbp->b_bcount != tbp->b_bufsize) ||
944                     (tbp->b_bcount != blksize) ||
945                     (bytes == blksize) ||
946                     ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
947                         totalwritten += tbp->b_bufsize;
948                         bawrite(tbp);
949                         start_loffset += blksize;
950                         bytes -= blksize;
951                         continue;
952                 }
953
954                 /*
955                  * Set up the pbuf.  Track our append point with b_bcount
956                  * and b_bufsize.  b_bufsize is not used by the device but
957                  * our caller uses it to loop clusters and we use it to
958                  * detect a premature EOF on the block device.
959                  */
960                 bp->b_bcount = 0;
961                 bp->b_bufsize = 0;
962                 bp->b_xio.xio_npages = 0;
963                 bp->b_loffset = tbp->b_loffset;
964                 bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
965
966                 /*
967                  * We are synthesizing a buffer out of vm_page_t's, but
968                  * if the block size is not page aligned then the starting
969                  * address may not be either.  Inherit the b_data offset
970                  * from the original buffer.
971                  */
972                 bp->b_data = (char *)((vm_offset_t)bp->b_data |
973                     ((vm_offset_t)tbp->b_data & PAGE_MASK));
974                 bp->b_flags &= ~B_ERROR;
975                 bp->b_flags |= B_CLUSTER | B_BNOCLIP |
976                         (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
977                 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
978                 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
979
980                 /*
981                  * From this location in the file, scan forward to see
982                  * if there are buffers with adjacent data that need to
983                  * be written as well.
984                  *
985                  * IO *must* be initiated on index 0 at this point
986                  * (particularly when called from cluster_awrite()).
987                  */
988                 for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
989                         if (i == 0) {
990                                 must_initiate = 1;
991                         } else {
992                                 /*
993                                  * Not first buffer.
994                                  */
995                                 must_initiate = 0;
996                                 tbp = findblk(vp, start_loffset,
997                                               FINDBLK_NBLOCK);
998                                 /*
999                                  * Buffer not found or could not be locked
1000                                  * non-blocking.
1001                                  */
1002                                 if (tbp == NULL)
1003                                         break;
1004
1005                                 /*
1006                                  * If it IS in core, but has different
1007                                  * characteristics, then don't cluster
1008                                  * with it.
1009                                  */
1010                                 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
1011                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
1012                                     != (B_DELWRI | B_CLUSTEROK |
1013                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
1014                                     (tbp->b_flags & B_LOCKED)
1015                                 ) {
1016                                         BUF_UNLOCK(tbp);
1017                                         break;
1018                                 }
1019
1020                                 /*
1021                                  * Check that the combined cluster
1022                                  * would make sense with regard to pages
1023                                  * and would not be too large
1024                                  *
1025                                  * WARNING! buf_checkwrite() must be the last
1026                                  *          check made.  If it returns 0 then
1027                                  *          we must initiate the I/O.
1028                                  */
1029                                 if ((tbp->b_bcount != blksize) ||
1030                                   ((bp->b_bio2.bio_offset + i) !=
1031                                     tbp->b_bio2.bio_offset) ||
1032                                   ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
1033                                     (maxiosize / PAGE_SIZE)) ||
1034                                   (LIST_FIRST(&tbp->b_dep) &&
1035                                    buf_checkwrite(tbp))
1036                                 ) {
1037                                         BUF_UNLOCK(tbp);
1038                                         break;
1039                                 }
1040                                 if (LIST_FIRST(&tbp->b_dep))
1041                                         must_initiate = 1;
1042                                 /*
1043                                  * Ok, it's passed all the tests,
1044                                  * so remove it from the free list
1045                                  * and mark it busy. We will use it.
1046                                  */
1047                                 bremfree(tbp);
1048                                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
1049                         }
1050
1051                         /*
1052                          * If the IO is via the VM then we do some
1053                          * special VM hackery (yuck).  Since the buffer's
1054                          * block size may not be page-aligned it is possible
1055                          * for a page to be shared between two buffers.  We
1056                          * have to get rid of the duplication when building
1057                          * the cluster.
1058                          */
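                        /*
                         * e.g. with 6KB buffers the tail of one buffer and
                         * the head of the next can share a page; the check
                         * against the last xio_pages[] entry below skips the
                         * duplicate.
                         */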
1059                         if (tbp->b_flags & B_VMIO) {
1060                                 vm_page_t m;
1061
1062                                 /*
1063                                  * Try to avoid deadlocks with the VM system.
1064                                  * However, we cannot abort the I/O if
1065                                  * must_initiate is non-zero.
1066                                  */
1067                                 if (must_initiate == 0) {
1068                                         for (j = 0;
1069                                              j < tbp->b_xio.xio_npages;
1070                                              ++j) {
1071                                                 m = tbp->b_xio.xio_pages[j];
1072                                                 if (m->flags & PG_BUSY) {
1073                                                         bqrelse(tbp);
1074                                                         goto finishcluster;
1075                                                 }
1076                                         }
1077                                 }
1078                                         
1079                                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
1080                                         m = tbp->b_xio.xio_pages[j];
1081                                         vm_page_busy_wait(m, FALSE, "clurpg");
1082                                         vm_page_io_start(m);
1083                                         vm_page_wakeup(m);
1084                                         vm_object_pip_add(m->object, 1);
1085                                         if ((bp->b_xio.xio_npages == 0) ||
1086                                           (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
1087                                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
1088                                                 bp->b_xio.xio_npages++;
1089                                         }
1090                                 }
1091                         }
1092                         bp->b_bcount += blksize;
1093                         bp->b_bufsize += blksize;
1094
1095                         bundirty(tbp);
1096                         tbp->b_flags &= ~B_ERROR;
1097                         tbp->b_cmd = BUF_CMD_WRITE;
1098                         BUF_KERNPROC(tbp);
1099                         cluster_append(&bp->b_bio1, tbp);
1100
1101                         /*
1102                          * check for latent dependencies to be handled 
1103                          */
1104                         if (LIST_FIRST(&tbp->b_dep) != NULL)
1105                                 buf_start(tbp);
1106                 }
1107         finishcluster:
1108                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
1109                             (vm_page_t *)bp->b_xio.xio_pages,
1110                             bp->b_xio.xio_npages);
1111                 if (bp->b_bufsize > bp->b_kvasize) {
1112                         panic("cluster_wbuild: b_bufsize(%d) "
1113                               "> b_kvasize(%d)\n",
1114                               bp->b_bufsize, bp->b_kvasize);
1115                 }
1116                 totalwritten += bp->b_bufsize;
1117                 bp->b_dirtyoff = 0;
1118                 bp->b_dirtyend = bp->b_bufsize;
1119                 bp->b_bio1.bio_done = cluster_callback;
1120                 bp->b_cmd = BUF_CMD_WRITE;
1121
1122                 vfs_busy_pages(vp, bp);
1123                 bsetrunningbufspace(bp, bp->b_bufsize);
1124                 BUF_KERNPROC(bp);
1125                 vn_strategy(vp, &bp->b_bio1);
1126
1127                 bytes -= i;
1128         }
1129         return totalwritten;
1130 }
1131
1132 /*
1133  * Collect together all the buffers in a cluster, plus add one
1134  * additional buffer passed-in.
1135  *
1136  * Only pre-existing buffers whose block size matches blksize are collected.
1137  * (this is primarily because HAMMER1 uses varying block sizes and we don't
1138  * want to override its choices).
1139  */
1140 static struct cluster_save *
1141 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
1142 {
1143         struct cluster_save *buflist;
1144         struct buf *bp;
1145         off_t loffset;
1146         int i, len;
1147         int j;
1148         int k;
1149
1150         len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
1151         buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1152                          M_SEGMENT, M_WAITOK);
1153         buflist->bs_nchildren = 0;
1154         buflist->bs_children = (struct buf **) (buflist + 1);
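        /*
         * i walks the candidate blocks, j records the index just past the
         * last block for which no buffer could be collected (a gap), and k
         * is used below to release the pre-gap buffers.
         */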
1155         for (loffset = vp->v_cstart, i = 0, j = 0;
1156              i < len;
1157              (loffset += blksize), i++) {
1158                 bp = getcacheblk(vp, loffset,
1159                                  last_bp->b_bcount, GETBLK_SZMATCH);
1160                 buflist->bs_children[i] = bp;
1161                 if (bp == NULL) {
1162                         j = i + 1;
1163                 } else if (bp->b_bio2.bio_offset == NOOFFSET) {
1164                         VOP_BMAP(bp->b_vp, bp->b_loffset,
1165                                  &bp->b_bio2.bio_offset,
1166                                  NULL, NULL, BUF_CMD_WRITE);
1167                 }
1168         }
1169
1170         /*
1171          * Get rid of gaps: keep only the contiguous run ending at last_bp
1172          */
1173         for (k = 0; k < j; ++k) {
1174                 if (buflist->bs_children[k]) {
1175                         bqrelse(buflist->bs_children[k]);
1176                         buflist->bs_children[k] = NULL;
1177                 }
1178         }
1179         if (j != 0) {
1180                 if (j != i) {
1181                         bcopy(buflist->bs_children + j,
1182                               buflist->bs_children + 0,
1183                               sizeof(buflist->bs_children[0]) * (i - j));
1184                 }
1185                 i -= j;
1186         }
1187         buflist->bs_children[i] = bp = last_bp;
1188         if (bp->b_bio2.bio_offset == NOOFFSET) {
1189                 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1190                          NULL, NULL, BUF_CMD_WRITE);
1191         }
1192         buflist->bs_nchildren = i + 1;
1193         return (buflist);
1194 }
1195
1196 void
1197 cluster_append(struct bio *bio, struct buf *tbp)
1198 {
1199         tbp->b_cluster_next = NULL;
1200         if (bio->bio_caller_info1.cluster_head == NULL) {
1201                 bio->bio_caller_info1.cluster_head = tbp;
1202                 bio->bio_caller_info2.cluster_tail = tbp;
1203         } else {
1204                 bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
1205                 bio->bio_caller_info2.cluster_tail = tbp;
1206         }
1207 }
1208
1209 static
1210 void
1211 cluster_setram (struct buf *bp)
1212 {
1213         bp->b_flags |= B_RAM;
1214         if (bp->b_xio.xio_npages)
1215                 vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
1216 }