[dragonfly.git] / sys / kern / vfs_cluster.c
1 /*-
2  * Copyright (c) 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  *      Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *      This product includes software developed by the University of
18  *      California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *      @(#)vfs_cluster.c       8.7 (Berkeley) 2/13/94
36  * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
37  * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
38  */
39
40 #include "opt_debug_cluster.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/buf.h>
47 #include <sys/vnode.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/resourcevar.h>
51 #include <sys/vmmeter.h>
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_page.h>
55 #include <sys/sysctl.h>
56 #include <sys/buf2.h>
57 #include <vm/vm_page2.h>
58
59 #include <machine/limits.h>
60
61 #if defined(CLUSTERDEBUG)
62 #include <sys/sysctl.h>
63 static int      rcluster = 0;
64 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
65 #endif
66
67 static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");
68
69 static struct cluster_save *
70         cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
71                             int blksize);
72 static struct buf *
73         cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
74                             off_t doffset, int blksize, int run, 
75                             struct buf *fbp);
76 static void cluster_callback (struct bio *);
77 static void cluster_setram (struct buf *);
78
79 static int write_behind = 1;
80 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
81
82 extern vm_page_t        bogus_page;
83
84 extern int cluster_pbuf_freecnt;
85
86 /*
87  * Maximum number of blocks for read-ahead.
88  */
89 #define MAXRA 32
90
91 /*
92  * Clustered replacement for bread(); performs read-ahead when the access
93  * pattern appears sequential.
93  */
94 int
95 cluster_read(struct vnode *vp, off_t filesize, off_t loffset, 
96              int blksize, size_t resid, int seqcount, struct buf **bpp)
97 {
98         struct buf *bp, *rbp, *reqbp;
99         off_t origoffset;
100         off_t doffset;
101         int error;
102         int i;
103         int maxra, racluster;
104         int totread;
105
106         error = 0;
107         totread = (resid > INT_MAX) ? INT_MAX : (int)resid;
108
109         /*
110          * racluster - calculate maximum cluster IO size (limited by
111          *             backing block device).
112          *
113          * Try to limit the amount of read-ahead by a few ad-hoc parameters.
114          * This needs work!!!
115          *
116          * NOTE!  The BMAP operations may involve synchronous I/O so we
117          *        really want several cluster IOs in progress to absorb
118          *        the time lag.
119          */
120         racluster = vmaxiosize(vp) / blksize;
121         maxra = 2 * racluster + (totread / blksize);
122         if (maxra > MAXRA)
123                 maxra = MAXRA;
124         if (maxra > nbuf / 8)
125                 maxra = nbuf / 8;
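        /*
         * Example: with a 64KB device I/O limit and 8KB blocks, racluster
         * is 8, so maxra starts at 16 plus the number of blocks in the
         * request and is then clamped to MAXRA and to nbuf/8.
         */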
126
127         /*
128          * Get the requested block.
129          */
130         *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
131         origoffset = loffset;
132
133         /*
134          * if it is in the cache, then check to see if the reads have been
135          * sequential.  If they have, then try some read-ahead, otherwise
136          * back-off on prospective read-aheads.
137          */
138         if (bp->b_flags & B_CACHE) {
139                 /*
140                  * Not sequential, do not do any read-ahead
141                  */
142                 seqcount -= (bp->b_bufsize + BKVASIZE - 1) / BKVASIZE;
143                 if (seqcount <= 0 || maxra == 0)
144                         return 0;
145
146                 /*
147                  * No read-ahead mark, do not do any read-ahead
148                  * yet.
149                  */
150                 if ((bp->b_flags & B_RAM) == 0)
151                         return 0;
152
153                 /*
154                  * We hit a read-ahead-mark, figure out how much read-ahead
155                  * to do (maxra) and where to start (loffset).
156                  *
157                  * Shortcut the scan.  Typically the way this works is that
158                  * we've built up all the blocks in between except for the
159                  * last in previous iterations, so if the second-to-last
160                  * block is present we just skip ahead to it.
161                  *
162                  * This algorithm has O(1) cpu in the steady state no
163                  * matter how large maxra is.
164                  */
165                 bp->b_flags &= ~B_RAM;
166
167                 if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
168                         i = maxra - 1;
169                 else
170                         i = 1;
171                 while (i < maxra) {
172                         if (findblk(vp, loffset + i * blksize,
173                                     FINDBLK_TEST) == NULL) {
174                                 break;
175                         }
176                         ++i;
177                 }
178                 if (i >= maxra)
179                         return 0;
180                 maxra -= i;
181                 loffset += i * blksize;
182                 reqbp = bp = NULL;
183         } else {
184                 off_t firstread = bp->b_loffset;
185                 int nblks;
186
187                 /*
188                  * Set-up synchronous read for bp.
189                  */
190                 bp->b_cmd = BUF_CMD_READ;
191                 bp->b_bio1.bio_done = biodone_sync;
192                 bp->b_bio1.bio_flags |= BIO_SYNC;
193
194                 KASSERT(firstread != NOOFFSET, 
195                         ("cluster_read: no buffer offset"));
196                 if (firstread + totread > filesize)
197                         totread = (int)(filesize - firstread);
198                 nblks = totread / blksize;
199                 if (nblks) {
200                         int burstbytes;
201
202                         if (nblks > racluster)
203                                 nblks = racluster;
204
205                         error = VOP_BMAP(vp, loffset, &doffset,
206                                          &burstbytes, NULL, BUF_CMD_READ);
207                         if (error)
208                                 goto single_block_read;
209                         if (doffset == NOOFFSET)
210                                 goto single_block_read;
211                         if (burstbytes < blksize * 2)
212                                 goto single_block_read;
213                         if (nblks > burstbytes / blksize)
214                                 nblks = burstbytes / blksize;
215
216                         bp = cluster_rbuild(vp, filesize, loffset,
217                                             doffset, blksize, nblks, bp);
218                         loffset += bp->b_bufsize;
219                         maxra -= (bp->b_bufsize - blksize) / blksize;
220                 } else {
221 single_block_read:
222                         /*
223                          * Not clustering; just read the single block and
224                          * mark it so a later sequential read can read ahead.
225                          */
226                         cluster_setram(bp);
227                         loffset += blksize;
228                 }
229         }
230
231         /*
232          * If B_CACHE was not set issue bp.  bp will either be an
233          * asynchronous cluster buf or a synchronous single-buf.
234          * If it is a single buf it will be the same as reqbp.
235          *
236          * NOTE: Once an async cluster buf is issued bp becomes invalid.
237          */
238         if (bp) {
239 #if defined(CLUSTERDEBUG)
240                 if (rcluster)
241                         kprintf("S(%lld,%d,%d)\n",
242                             bp->b_loffset, bp->b_bcount, seqcount);
243 #endif
244                 if ((bp->b_flags & B_CLUSTER) == 0)
245                         vfs_busy_pages(vp, bp);
246                 bp->b_flags &= ~(B_ERROR|B_INVAL);
247                 seqcount -= (bp->b_bufsize + BKVASIZE - 1) / BKVASIZE;
248                 vn_strategy(vp, &bp->b_bio1);
249                 error = 0;
250                 /* bp invalid now */
251         }
252
253         /*
254          * If we have been doing sequential I/O, then do some read-ahead.
255          * The code above us should have positioned us at the next likely
256          * offset.
257          *
258          * Only mess with buffers which we can immediately lock.  HAMMER
259          * will do device-readahead irrespective of what the blocks
260          * represent.
261          */
262         while (!error && seqcount > 0 && maxra > 0 &&
263                loffset + blksize <= filesize) {
264                 int nblksread;
265                 int ntoread;
266                 int burstbytes;
267                 int tmp_error;
268
269                 rbp = getblk(vp, loffset, blksize,
270                              GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
271                 if (rbp == NULL)
272                         goto no_read_ahead;
273                 if ((rbp->b_flags & B_CACHE)) {
274                         bqrelse(rbp);
275                         goto no_read_ahead;
276                 }
277
278                 /*
279                  * An error from the read-ahead bmap has nothing to do
280                  * with the caller's original request.
281                  */
282                 tmp_error = VOP_BMAP(vp, loffset, &doffset,
283                                      &burstbytes, NULL, BUF_CMD_READ);
284                 if (tmp_error || doffset == NOOFFSET) {
285                         rbp->b_flags |= B_INVAL;
286                         brelse(rbp);
287                         rbp = NULL;
288                         goto no_read_ahead;
289                 }
290                 ntoread = burstbytes / blksize;
291                 nblksread = (totread + blksize - 1) / blksize;
292                 if (seqcount < nblksread)
293                         seqcount = nblksread;
294                 if (ntoread > seqcount)
295                         ntoread = seqcount;
296
297                 /*
298                  * rbp: async read
299                  */
300                 rbp->b_cmd = BUF_CMD_READ;
301                 /*rbp->b_flags |= B_AGE*/;
302                 cluster_setram(rbp);
303
304                 if (burstbytes) {
305                         rbp = cluster_rbuild(vp, filesize, loffset,
306                                              doffset, blksize, 
307                                              ntoread, rbp);
308                 } else {
309                         rbp->b_bio2.bio_offset = doffset;
310                 }
311                 seqcount -= (rbp->b_bufsize + BKVASIZE - 1) / BKVASIZE;
312 #if defined(CLUSTERDEBUG)
313                 if (rcluster) {
314                         if (bp)
315                                 kprintf("A+(%lld,%d,%lld,%d) ra=%d\n",
316                                     rbp->b_loffset, rbp->b_bcount,
317                                     rbp->b_loffset - origoffset,
318                                     seqcount, maxra);
319                         else
320                                 kprintf("A-(%lld,%d,%lld,%d) ra=%d\n",
321                                     rbp->b_loffset, rbp->b_bcount,
322                                     rbp->b_loffset - origoffset,
323                                     seqcount, maxra);
324                 }
325 #endif
326                 rbp->b_flags &= ~(B_ERROR|B_INVAL);
327
328                 if ((rbp->b_flags & B_CLUSTER) == 0)
329                         vfs_busy_pages(vp, rbp);
330                 BUF_KERNPROC(rbp);
331                 loffset += rbp->b_bufsize;
332                 maxra -= rbp->b_bufsize / blksize;
333                 vn_strategy(vp, &rbp->b_bio1);
334                 /* rbp invalid now */
335         }
336
337         /*
338          * Wait for our original buffer to complete its I/O.  reqbp will
339          * be NULL if the original buffer was B_CACHE.  We are returning
340          * (*bpp) which is the same as reqbp when reqbp != NULL.
341          */
342 no_read_ahead:
343         if (reqbp) {
344                 KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
345                 error = biowait(&reqbp->b_bio1, "clurd");
346         }
347         return (error);
348 }
349
350 /*
351  * If blocks are contiguous on disk, use this to provide clustered
352  * read ahead.  We will read as many blocks as possible sequentially
353  * and then parcel them up into logical blocks in the buffer hash table.
354  *
355  * This function either returns a cluster buf or it returns fbp.  fbp is
356  * already expected to be set up as a synchronous or asynchronous request.
357  *
358  * If a cluster buf is returned it will always be async.
359  */
360 static struct buf *
361 cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
362                int blksize, int run, struct buf *fbp)
363 {
364         struct buf *bp, *tbp;
365         off_t boffset;
366         int i, j;
367         int maxiosize = vmaxiosize(vp);
368
369         /*
370          * Clip the run so it does not extend past EOF (loop avoids a division)
371          */
372         while (loffset + run * blksize > filesize) {
373                 --run;
374         }
375
376         tbp = fbp;
377         tbp->b_bio2.bio_offset = doffset;
378         if ((tbp->b_flags & B_MALLOC) ||
379             ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
380                 return tbp;
381         }
382
383         bp = trypbuf(&cluster_pbuf_freecnt);
384         if (bp == NULL) {
385                 return tbp;
386         }
387
388         /*
389          * We are synthesizing a buffer out of vm_page_t's, but
390          * if the block size is not page aligned then the starting
391          * address may not be either.  Inherit the b_data offset
392          * from the original buffer.
393          */
394         bp->b_data = (char *)((vm_offset_t)bp->b_data |
395             ((vm_offset_t)tbp->b_data & PAGE_MASK));
396         bp->b_flags |= B_CLUSTER | B_VMIO;
397         bp->b_cmd = BUF_CMD_READ;
398         bp->b_bio1.bio_done = cluster_callback;         /* default to async */
399         bp->b_bio1.bio_caller_info1.cluster_head = NULL;
400         bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
401         bp->b_loffset = loffset;
402         bp->b_bio2.bio_offset = doffset;
403         KASSERT(bp->b_loffset != NOOFFSET,
404                 ("cluster_rbuild: no buffer offset"));
405
406         bp->b_bcount = 0;
407         bp->b_bufsize = 0;
408         bp->b_xio.xio_npages = 0;
409
410         for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
411                 if (i) {
412                         if ((bp->b_xio.xio_npages * PAGE_SIZE) +
413                             round_page(blksize) > maxiosize) {
414                                 break;
415                         }
416
417                         /*
418                          * Shortcut some checks and try to avoid buffers that
419                          * would block in the lock.  The same checks have to
420                          * be made again after we officially get the buffer.
421                          */
422                         tbp = getblk(vp, loffset + i * blksize, blksize,
423                                      GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
424                         if (tbp == NULL)
425                                 break;
426                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
427                                 if (tbp->b_xio.xio_pages[j]->valid)
428                                         break;
429                         }
430                         if (j != tbp->b_xio.xio_npages) {
431                                 bqrelse(tbp);
432                                 break;
433                         }
434
435                         /*
436                          * Stop scanning if the buffer is fully valid
437                          * (marked B_CACHE), or locked (may be doing a
438                          * background write), or if the buffer is not
439                          * VMIO backed.  The clustering code can only deal
440                          * with VMIO-backed buffers.
441                          */
442                         if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
443                             (tbp->b_flags & B_VMIO) == 0 ||
444                             (LIST_FIRST(&tbp->b_dep) != NULL &&
445                              buf_checkread(tbp))
446                         ) {
447                                 bqrelse(tbp);
448                                 break;
449                         }
450
451                         /*
452                          * The buffer must be completely invalid in order to
453                          * take part in the cluster.  If it is partially valid
454                          * then we stop.
455                          */
456                         for (j = 0; j < tbp->b_xio.xio_npages; j++) {
457                                 if (tbp->b_xio.xio_pages[j]->valid)
458                                         break;
459                         }
460                         if (j != tbp->b_xio.xio_npages) {
461                                 bqrelse(tbp);
462                                 break;
463                         }
464
465                         /*
466                          * Set a read-ahead mark as appropriate
467                          */
468                         if (i == 1 || i == (run - 1))
469                                 cluster_setram(tbp);
470
471                         /*
472                          * Depress the priority of buffers not explicitly
473                          * requested.
474                          */
475                         /* tbp->b_flags |= B_AGE; */
476
477                         /*
478                          * Set the block number if it isn't set, otherwise
479                          * if it is make sure it matches the block number we
480                          * expect.
481                          */
482                         if (tbp->b_bio2.bio_offset == NOOFFSET) {
483                                 tbp->b_bio2.bio_offset = boffset;
484                         } else if (tbp->b_bio2.bio_offset != boffset) {
485                                 brelse(tbp);
486                                 break;
487                         }
488                 }
489
490                 /*
491                  * The passed-in tbp (i == 0) will already be set up for
492                  * async or sync operation.  All other tbp's acquired in
493                  * our loop are set up for async operation.
494                  */
495                 tbp->b_cmd = BUF_CMD_READ;
496                 BUF_KERNPROC(tbp);
497                 cluster_append(&bp->b_bio1, tbp);
498                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
499                         vm_page_t m;
500                         m = tbp->b_xio.xio_pages[j];
501                         vm_page_io_start(m);
502                         vm_object_pip_add(m->object, 1);
503                         if ((bp->b_xio.xio_npages == 0) ||
504                                 (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
505                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
506                                 bp->b_xio.xio_npages++;
507                         }
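                        /*
                         * A fully valid page does not need to be re-read from
                         * disk; substitute bogus_page for it (the cluster buf
                         * gets the same treatment in the pass below).
                         */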
508                         if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
509                                 tbp->b_xio.xio_pages[j] = bogus_page;
510                 }
511                 /*
512                  * XXX shouldn't this be += size for both, like in 
513                  * cluster_wbuild()?
514                  *
515                  * Don't inherit tbp->b_bufsize as it may be larger due to
516                  * a non-page-aligned size.  Instead just aggregate using
517                  * 'blksize'.
518                  */
519                 if (tbp->b_bcount != blksize)
520                     kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
521                 if (tbp->b_bufsize != blksize)
522                     kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
523                 bp->b_bcount += blksize;
524                 bp->b_bufsize += blksize;
525         }
526
527         /*
528          * Fully valid pages in the cluster are already good and do not need
529          * to be re-read from disk.  Replace the page with bogus_page
530          */
531         for (j = 0; j < bp->b_xio.xio_npages; j++) {
532                 if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
533                     VM_PAGE_BITS_ALL) {
534                         bp->b_xio.xio_pages[j] = bogus_page;
535                 }
536         }
537         if (bp->b_bufsize > bp->b_kvasize) {
538                 panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
539                     bp->b_bufsize, bp->b_kvasize);
540         }
541         pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
542                 (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
543         BUF_KERNPROC(bp);
544         return (bp);
545 }
546
547 /*
548  * Cleanup after a clustered read or write.
549  * This is complicated by the fact that any of the buffers might have
550  * extra memory (if there were no empty buffer headers at allocbuf time)
551  * that we will need to shift around.
552  *
553  * The returned bio is &bp->b_bio1
554  */
555 void
556 cluster_callback(struct bio *bio)
557 {
558         struct buf *bp = bio->bio_buf;
559         struct buf *tbp;
560         int error = 0;
561
562         /*
563          * Must propagate errors to all the components.  A short read (EOF)
564          * is a critical error.
565          */
566         if (bp->b_flags & B_ERROR) {
567                 error = bp->b_error;
568         } else if (bp->b_bcount != bp->b_bufsize) {
569                 panic("cluster_callback: unexpected EOF on cluster %p!", bio);
570         }
571
572         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
573         /*
574          * Move memory from the large cluster buffer into the component
575          * buffers and mark IO as done on these.  Since the memory map
576          * is the same, no actual copying is required.
577          */
578         while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
579                 bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
580                 if (error) {
581                         tbp->b_flags |= B_ERROR;
582                         tbp->b_error = error;
583                 } else {
584                         tbp->b_dirtyoff = tbp->b_dirtyend = 0;
585                         tbp->b_flags &= ~(B_ERROR|B_INVAL);
586                         /*
587                          * XXX the bdwrite()/bqrelse() issued during
588                          * cluster building clears B_RELBUF (see bqrelse()
589                          * comment).  If direct I/O was specified, we have
590                          * to restore it here to allow the buffer and VM
591                          * to be freed.
592                          */
593                         if (tbp->b_flags & B_DIRECT)
594                                 tbp->b_flags |= B_RELBUF;
595                 }
596                 biodone(&tbp->b_bio1);
597         }
598         relpbuf(bp, &cluster_pbuf_freecnt);
599 }
600
601 /*
602  *      cluster_wbuild_wb:
603  *
604  *      Wrapper around cluster_wbuild() implementing the write-behind policy.
605  *
606  *              write_behind = 0        write behind disabled
607  *              write_behind = 1        write behind normal (default)
608  *              write_behind = 2        write behind backed-off
609  */
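/*
 * Note: with write_behind = 2 the flush covers the window preceding
 * start_loffset (start_loffset is backed off by len before the build).
 */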
610
611 static __inline int
612 cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
613 {
614         int r = 0;
615
616         switch(write_behind) {
617         case 2:
618                 if (start_loffset < len)
619                         break;
620                 start_loffset -= len;
621                 /* fall through */
622         case 1:
623                 r = cluster_wbuild(vp, blksize, start_loffset, len);
624                 /* fall through */
625         default:
626                 /* fall through */
627                 break;
628         }
629         return(r);
630 }
631
632 /*
633  * Do clustered write for FFS.
634  *
635  * Four cases:
636  *      1. Write is not sequential (write asynchronously)
637  *      Write is sequential:
638  *      2.      beginning of cluster - begin cluster
639  *      3.      middle of a cluster - add to cluster
640  *      4.      end of a cluster - asynchronously write cluster
641  */
642 void
643 cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
644 {
645         struct vnode *vp;
646         off_t loffset;
647         int maxclen, cursize;
648         int async;
649
650         vp = bp->b_vp;
651         if (vp->v_type == VREG)
652                 async = vp->v_mount->mnt_flag & MNT_ASYNC;
653         else
654                 async = 0;
655         loffset = bp->b_loffset;
656         KASSERT(bp->b_loffset != NOOFFSET, 
657                 ("cluster_write: no buffer offset"));
658
659         /* Initialize vnode to beginning of file. */
660         if (loffset == 0)
661                 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
662
663         if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
664             bp->b_bio2.bio_offset == NOOFFSET ||
665             (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
666                 maxclen = vmaxiosize(vp);
667                 if (vp->v_clen != 0) {
668                         /*
669                          * Next block is not sequential.
670                          *
671                          * If we are not writing at end of file, the process
672                          * seeked to another point in the file since its last
673                          * write, or we have reached our maximum cluster size,
674                          * then push the previous cluster. Otherwise try
675                          * reallocating to make it sequential.
676                          *
677                          * Change to algorithm: only push previous cluster if
678                          * it was sequential from the point of view of the
679                          * seqcount heuristic, otherwise leave the buffer 
680                          * intact so we can potentially optimize the I/O
681                          * later on in the buf_daemon or update daemon
682                          * flush.
683                          */
684                         cursize = vp->v_lastw - vp->v_cstart + blksize;
685                         if (bp->b_loffset + blksize != filesize ||
686                             loffset != vp->v_lastw + blksize || vp->v_clen <= cursize) {
687                                 if (!async && seqcount > 0) {
688                                         cluster_wbuild_wb(vp, blksize,
689                                                 vp->v_cstart, cursize);
690                                 }
691                         } else {
692                                 struct buf **bpp, **endbp;
693                                 struct cluster_save *buflist;
694
695                                 buflist = cluster_collectbufs(vp, bp, blksize);
696                                 endbp = &buflist->bs_children
697                                     [buflist->bs_nchildren - 1];
698                                 if (VOP_REALLOCBLKS(vp, buflist)) {
699                                         /*
700                                          * Failed, push the previous cluster
701                                          * if *really* writing sequentially
702                                          * in the logical file (seqcount > 1),
703                                          * otherwise delay it in the hopes that
704                                          * the low level disk driver can
705                                          * optimize the write ordering.
706                                          */
707                                         for (bpp = buflist->bs_children;
708                                              bpp < endbp; bpp++)
709                                                 brelse(*bpp);
710                                         kfree(buflist, M_SEGMENT);
711                                         if (seqcount > 1) {
712                                                 cluster_wbuild_wb(vp, 
713                                                     blksize, vp->v_cstart, 
714                                                     cursize);
715                                         }
716                                 } else {
717                                         /*
718                                          * Succeeded, keep building cluster.
719                                          */
720                                         for (bpp = buflist->bs_children;
721                                              bpp <= endbp; bpp++)
722                                                 bdwrite(*bpp);
723                                         kfree(buflist, M_SEGMENT);
724                                         vp->v_lastw = loffset;
725                                         vp->v_lasta = bp->b_bio2.bio_offset;
726                                         return;
727                                 }
728                         }
729                 }
730                 /*
731                  * Consider beginning a cluster. If at end of file, make
732                  * cluster as large as possible, otherwise find size of
733                  * existing cluster.
734                  */
735                 if ((vp->v_type == VREG) &&
736                     bp->b_loffset + blksize != filesize &&
737                     (bp->b_bio2.bio_offset == NOOFFSET) &&
738                     (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) ||
739                      bp->b_bio2.bio_offset == NOOFFSET)) {
740                         bawrite(bp);
741                         vp->v_clen = 0;
742                         vp->v_lasta = bp->b_bio2.bio_offset;
743                         vp->v_cstart = loffset + blksize;
744                         vp->v_lastw = loffset;
745                         return;
746                 }
747                 if (maxclen > blksize)
748                         vp->v_clen = maxclen - blksize;
749                 else
750                         vp->v_clen = 0;
751                 if (!async && vp->v_clen == 0) { /* I/O not contiguous */
752                         vp->v_cstart = loffset + blksize;
753                         bawrite(bp);
754                 } else {        /* Wait for rest of cluster */
755                         vp->v_cstart = loffset;
756                         bdwrite(bp);
757                 }
758         } else if (loffset == vp->v_cstart + vp->v_clen) {
759                 /*
760                  * At end of cluster, write it out if seqcount tells us we
761                  * are operating sequentially, otherwise let the buf or
762                  * update daemon handle it.
763                  */
764                 bdwrite(bp);
765                 if (seqcount > 1)
766                         cluster_wbuild_wb(vp, blksize, vp->v_cstart,
767                                           vp->v_clen + blksize);
768                 vp->v_clen = 0;
769                 vp->v_cstart = loffset + blksize;
770         } else if (vm_page_count_severe()) {
771                 /*
772                  * We are low on memory, get it going NOW
773                  */
774                 bawrite(bp);
775         } else {
776                 /*
777                  * In the middle of a cluster, so just delay the I/O for now.
778                  */
779                 bdwrite(bp);
780         }
781         vp->v_lastw = loffset;
782         vp->v_lasta = bp->b_bio2.bio_offset;
783 }
784
785
786 /*
787  * This is an awful lot like cluster_rbuild...wish they could be combined.
788  * Scan forward from start_loffset for up to 'bytes' bytes, gathering
789  * contiguous delayed-write buffers into clustered writes.  Returns the
790  * total number of bytes written.
791  */
792 int
793 cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
794 {
795         struct buf *bp, *tbp;
796         int i, j;
797         int totalwritten = 0;
798         int maxiosize = vmaxiosize(vp);
799
800         while (bytes > 0) {
801                 /*
802                  * If the buffer is not delayed-write (i.e. dirty), or it 
803                  * is delayed-write but either locked or inval, it cannot 
804                  * partake in the clustered write.
805                  */
806                 tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
807                 if (tbp == NULL ||
808                     (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI ||
809                     (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
810                         if (tbp)
811                                 BUF_UNLOCK(tbp);
812                         start_loffset += blksize;
813                         bytes -= blksize;
814                         continue;
815                 }
816                 bremfree(tbp);
817                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
818
819                 /*
820                  * Extra memory in the buffer, punt on this buffer.
821                  * XXX we could handle this in most cases, but we would
822                  * have to push the extra memory down to after our max
823                  * possible cluster size and then potentially pull it back
824                  * up if the cluster was terminated prematurely--too much
825                  * hassle.
826                  */
827                 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
828                     (tbp->b_bcount != tbp->b_bufsize) ||
829                     (tbp->b_bcount != blksize) ||
830                     (bytes == blksize) ||
831                     ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
832                         totalwritten += tbp->b_bufsize;
833                         bawrite(tbp);
834                         start_loffset += blksize;
835                         bytes -= blksize;
836                         continue;
837                 }
838
839                 /*
840                  * Set up the pbuf.  Track our append point with b_bcount
841                  * and b_bufsize.  b_bufsize is not used by the device but
842                  * our caller uses it to loop clusters and we use it to
843                  * detect a premature EOF on the block device.
844                  */
845                 bp->b_bcount = 0;
846                 bp->b_bufsize = 0;
847                 bp->b_xio.xio_npages = 0;
848                 bp->b_loffset = tbp->b_loffset;
849                 bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
850
851                 /*
852                  * We are synthesizing a buffer out of vm_page_t's, but
853                  * if the block size is not page aligned then the starting
854                  * address may not be either.  Inherit the b_data offset
855                  * from the original buffer.
856                  */
857                 bp->b_data = (char *)((vm_offset_t)bp->b_data |
858                     ((vm_offset_t)tbp->b_data & PAGE_MASK));
859                 bp->b_flags &= ~B_ERROR;
860                 bp->b_flags |= B_CLUSTER | B_BNOCLIP |
861                         (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
862                 bp->b_bio1.bio_caller_info1.cluster_head = NULL;
863                 bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
864
865                 /*
866                  * From this location in the file, scan forward to see
867                  * if there are buffers with adjacent data that need to
868                  * be written as well.
869                  */
870                 for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
871                         if (i != 0) { /* If not the first buffer */
872                                 tbp = findblk(vp, start_loffset,
873                                               FINDBLK_NBLOCK);
874                                 /*
875                                  * Buffer not found or could not be locked
876                                  * non-blocking.
877                                  */
878                                 if (tbp == NULL)
879                                         break;
880
881                                 /*
882                                  * If it IS in core, but has different
883                                  * characteristics, then don't cluster
884                                  * with it.
885                                  */
886                                 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
887                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
888                                     != (B_DELWRI | B_CLUSTEROK |
889                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
890                                     (tbp->b_flags & B_LOCKED) ||
891                                     (LIST_FIRST(&tbp->b_dep) &&
892                                      buf_checkwrite(tbp))
893                                 ) {
894                                         BUF_UNLOCK(tbp);
895                                         break;
896                                 }
897
898                                 /*
899                                  * Check that the combined cluster
900                                  * would make sense with regard to pages
901                                  * and would not be too large
902                                  */
903                                 if ((tbp->b_bcount != blksize) ||
904                                   ((bp->b_bio2.bio_offset + i) !=
905                                     tbp->b_bio2.bio_offset) ||
906                                   ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
907                                     (maxiosize / PAGE_SIZE))) {
908                                         BUF_UNLOCK(tbp);
909                                         break;
910                                 }
911                                 /*
912                                  * Ok, it's passed all the tests,
913                                  * so remove it from the free list
914                                  * and mark it busy. We will use it.
915                                  */
916                                 bremfree(tbp);
917                                 KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
918                         } /* end of code for non-first buffers only */
919
920                         /*
921                          * If the IO is via the VM then we do some
922                          * special VM hackery (yuck).  Since the buffer's
923                          * block size may not be page-aligned it is possible
924                          * for a page to be shared between two buffers.  We
925                          * have to get rid of the duplication when building
926                          * the cluster.
927                          */
928                         if (tbp->b_flags & B_VMIO) {
929                                 vm_page_t m;
930
931                                 if (i != 0) { /* if not first buffer */
932                                         for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
933                                                 m = tbp->b_xio.xio_pages[j];
934                                                 if (m->flags & PG_BUSY) {
935                                                         bqrelse(tbp);
936                                                         goto finishcluster;
937                                                 }
938                                         }
939                                 }
940                                         
941                                 for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
942                                         m = tbp->b_xio.xio_pages[j];
943                                         vm_page_io_start(m);
944                                         vm_object_pip_add(m->object, 1);
945                                         if ((bp->b_xio.xio_npages == 0) ||
946                                           (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
947                                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
948                                                 bp->b_xio.xio_npages++;
949                                         }
950                                 }
951                         }
952                         bp->b_bcount += blksize;
953                         bp->b_bufsize += blksize;
954
955                         bundirty(tbp);
956                         tbp->b_flags &= ~B_ERROR;
957                         tbp->b_cmd = BUF_CMD_WRITE;
958                         BUF_KERNPROC(tbp);
959                         cluster_append(&bp->b_bio1, tbp);
960
961                         /*
962                          * check for latent dependencies to be handled 
963                          */
964                         if (LIST_FIRST(&tbp->b_dep) != NULL)
965                                 buf_start(tbp);
966                 }
967         finishcluster:
968                 pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
969                         (vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages);
970                 if (bp->b_bufsize > bp->b_kvasize) {
971                         panic(
972                             "cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
973                             bp->b_bufsize, bp->b_kvasize);
974                 }
975                 totalwritten += bp->b_bufsize;
976                 bp->b_dirtyoff = 0;
977                 bp->b_dirtyend = bp->b_bufsize;
978                 bp->b_bio1.bio_done = cluster_callback;
979                 bp->b_cmd = BUF_CMD_WRITE;
980
981                 vfs_busy_pages(vp, bp);
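                /*
                 * Account for the in-flight I/O in the global running
                 * buffer space counters before handing it to the device.
                 */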
982                 bp->b_runningbufspace = bp->b_bufsize;
983                 if (bp->b_runningbufspace) {
984                         runningbufspace += bp->b_runningbufspace;
985                         ++runningbufcount;
986                 }
987                 BUF_KERNPROC(bp);
988                 vn_strategy(vp, &bp->b_bio1);
989
990                 bytes -= i;
991         }
992         return totalwritten;
993 }
994
995 /*
996  * Collect together all the buffers in a cluster, plus add one
997  * additional buffer (last_bp).
998  */
999 static struct cluster_save *
1000 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
1001 {
1002         struct cluster_save *buflist;
1003         struct buf *bp;
1004         off_t loffset;
1005         int i, len;
1006
1007         len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
1008         buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1009                          M_SEGMENT, M_WAITOK);
1010         buflist->bs_nchildren = 0;
1011         buflist->bs_children = (struct buf **) (buflist + 1);
1012         for (loffset = vp->v_cstart, i = 0; i < len; (loffset += blksize), i++) {
1013                 (void) bread(vp, loffset, last_bp->b_bcount, &bp);
1014                 buflist->bs_children[i] = bp;
1015                 if (bp->b_bio2.bio_offset == NOOFFSET) {
1016                         VOP_BMAP(bp->b_vp, bp->b_loffset,
1017                                  &bp->b_bio2.bio_offset,
1018                                  NULL, NULL, BUF_CMD_WRITE);
1019                 }
1020         }
1021         buflist->bs_children[i] = bp = last_bp;
1022         if (bp->b_bio2.bio_offset == NOOFFSET) {
1023                 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1024                          NULL, NULL, BUF_CMD_WRITE);
1025         }
1026         buflist->bs_nchildren = i + 1;
1027         return (buflist);
1028 }
1029
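/*
 * Append tbp to the cluster's list of component buffers, linked through
 * b_cluster_next with the head/tail kept in the bio's caller_info fields.
 * cluster_callback() walks this list to finish each component buffer when
 * the cluster I/O completes.
 */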
1030 void
1031 cluster_append(struct bio *bio, struct buf *tbp)
1032 {
1033         tbp->b_cluster_next = NULL;
1034         if (bio->bio_caller_info1.cluster_head == NULL) {
1035                 bio->bio_caller_info1.cluster_head = tbp;
1036                 bio->bio_caller_info2.cluster_tail = tbp;
1037         } else {
1038                 bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
1039                 bio->bio_caller_info2.cluster_tail = tbp;
1040         }
1041 }
1042
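/*
 * Mark bp as a read-ahead trigger: set B_RAM on the buffer and PG_RAM on
 * its first page.  cluster_read() uses the B_RAM mark on a cached buffer
 * to decide when to issue further read-ahead.
 */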
1043 static
1044 void
1045 cluster_setram (struct buf *bp)
1046 {
1047         bp->b_flags |= B_RAM;
1048         if (bp->b_xio.xio_npages)
1049                 vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
1050 }