Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /*- |
2 | * Copyright (c) 1993 | |
3 | * The Regents of the University of California. All rights reserved. | |
4 | * Modifications/enhancements: | |
5 | * Copyright (c) 1995 John S. Dyson. All rights reserved. | |
38a4b308 | 6 | * Copyright (c) 2012-2013 Matthew Dillon. All rights reserved. |
984263bc MD |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without | |
9 | * modification, are permitted provided that the following conditions | |
10 | * are met: | |
11 | * 1. Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in the | |
15 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 16 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
17 | * may be used to endorse or promote products derived from this software |
18 | * without specific prior written permission. | |
19 | * | |
20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
30 | * SUCH DAMAGE. | |
984263bc MD |
31 | */ |
32 | ||
33 | #include "opt_debug_cluster.h" | |
34 | ||
35 | #include <sys/param.h> | |
36 | #include <sys/systm.h> | |
37 | #include <sys/kernel.h> | |
38 | #include <sys/proc.h> | |
39 | #include <sys/buf.h> | |
40 | #include <sys/vnode.h> | |
41 | #include <sys/malloc.h> | |
42 | #include <sys/mount.h> | |
43 | #include <sys/resourcevar.h> | |
44 | #include <sys/vmmeter.h> | |
45 | #include <vm/vm.h> | |
46 | #include <vm/vm_object.h> | |
47 | #include <vm/vm_page.h> | |
48 | #include <sys/sysctl.h> | |
54341a3b | 49 | |
3020e3be | 50 | #include <sys/buf2.h> |
12e4aaff | 51 | #include <vm/vm_page2.h> |
984263bc | 52 | |
e54488bb MD |
53 | #include <machine/limits.h> |
54 | ||
38a4b308 MD |
55 | /* |
56 | * Cluster tracking cache - replaces the original vnode v_* fields which had | |
57 | * limited utility and were not MP safe. | |
58 | * | |
59 | * The cluster tracking cache is a simple 4-way set-associative non-chained | |
60 | * cache. It is capable of tracking up to four zones separated by 1MB or | |
61 | * more per vnode. | |
62 | * | |
63 | * NOTE: We want this structure to be cache-line friendly so the iterator | |
64 | * is embedded rather than in a separate array. | |
65 | * | |
66 | * NOTE: A cluster cache entry can become stale when a vnode is recycled. | |
67 | * For now we treat the values as heuristic but also self-consistent,
68 | * i.e. the values cannot be completely random and cannot be SMP unsafe
69 | * or the cluster code might end up clustering non-contiguous buffers
70 | * at the wrong offsets. | |
71 | */ | |
72 | struct cluster_cache { | |
73 | struct vnode *vp; | |
74 | u_int locked; | |
cf297f2c MD |
75 | off_t v_lastw; /* last write (end) (write cluster) */ |
76 | off_t v_cstart; /* start block (beg) of cluster */ | |
77 | off_t v_lasta; /* last allocation (end) */ | |
38a4b308 MD |
78 | u_int v_clen; /* length of current cluster */ |
79 | u_int iterator; | |
80 | } __cachealign; | |
81 | ||
82 | typedef struct cluster_cache cluster_cache_t; | |
83 | ||
84 | #define CLUSTER_CACHE_SIZE 512 | |
85 | #define CLUSTER_CACHE_MASK (CLUSTER_CACHE_SIZE - 1) | |
86 | ||
87 | #define CLUSTER_ZONE ((off_t)(1024 * 1024)) | |
88 | ||
89 | cluster_cache_t cluster_array[CLUSTER_CACHE_SIZE]; | |
90 | ||
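As an aside, the zone-membership test used by the lookup code below reads: two offsets share a tracked zone exactly when their bits above the 1MB boundary agree. A minimal sketch (hypothetical helper, mirroring the rounddown2() expression in cluster_getcache()):

```c
/*
 * Editorial sketch, not part of the original file.  E.g. offsets
 * 0x10000 and 0xf0000 share a zone (their XOR is below CLUSTER_ZONE),
 * while 0x10000 and 0x110000 do not.
 */
static __inline int
same_cluster_zone(off_t a, off_t b)
{
	return (rounddown2(a ^ b, CLUSTER_ZONE) == 0);
}
```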
984263bc | 91 | #if defined(CLUSTERDEBUG) |
984263bc MD |
92 | static int rcluster = 0;
93 | SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); | |
94 | #endif | |
95 | ||
d1cd9d97 | 96 | static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer"); |
984263bc MD |
97 | |
98 | static struct cluster_save * | |
38a4b308 MD |
99 | cluster_collectbufs (cluster_cache_t *cc, struct vnode *vp, |
100 | struct buf *last_bp, int blksize); | |
984263bc | 101 | static struct buf * |
54078292 | 102 | cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset, |
e92ca23a | 103 | off_t doffset, int blksize, int run, |
cb1fa82f | 104 | struct buf *fbp, int *srp); |
81b5c339 | 105 | static void cluster_callback (struct bio *); |
cf1bb2a8 | 106 | static void cluster_setram (struct buf *); |
cb1fa82f | 107 | static void cluster_clrram (struct buf *); |
9de13b88 MD |
108 | static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize, |
109 | off_t start_loffset, int bytes); | |
984263bc MD |
110 | |
111 | static int write_behind = 1; | |
093e85dc SG |
112 | SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, |
113 | "Cluster write-behind setting"); | |
504ea70e MD |
114 | static quad_t write_behind_minfilesize = 10 * 1024 * 1024; |
115 | SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW, | |
116 | &write_behind_minfilesize, 0, "Cluster write-behind setting"); | |
364c022c | 117 | static int max_readahead = 2 * 1024 * 1024; |
093e85dc SG |
118 | SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0, |
119 | "Limit in bytes for desired cluster read-ahead"); | |
984263bc MD |
120 | |
121 | extern vm_page_t bogus_page; | |
122 | ||
cb1fa82f MD |
123 | /* |
124 | * nblks is our cluster_rbuild request size. The approximate number of | |
125 | * physical read-ahead requests is maxra / nblks. The physical request | |
126 | * size is limited by the device (maxrbuild). We also do not want to make | |
127 | * the request size too big or it will mess up the B_RAM streaming. | |
128 | */ | |
129 | static __inline | |
130 | int | |
131 | calc_rbuild_reqsize(int maxra, int maxrbuild) | |
132 | { | |
133 | int nblks; | |
134 | ||
135 | if ((nblks = maxra / 4) > maxrbuild) | |
136 | nblks = maxrbuild; | |
137 | if (nblks < 1) | |
138 | nblks = maxra; | |
139 | return nblks; | |
140 | } | |
141 | ||
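As a worked example of the sizing above (editorial aside): with maxra = 64 blocks of desired read-ahead and a device limit of maxrbuild = 32 blocks, calc_rbuild_reqsize() returns min(64 / 4, 32) = 16, so the stream is serviced by roughly maxra / nblks = 4 overlapping physical requests.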
38a4b308 MD |
142 | /* |
143 | * Acquire/release cluster cache (can return dummy entry) | |
144 | */ | |
145 | static | |
146 | cluster_cache_t * | |
147 | cluster_getcache(cluster_cache_t *dummy, struct vnode *vp, off_t loffset) | |
148 | { | |
149 | cluster_cache_t *cc; | |
150 | size_t hv; | |
151 | int i; | |
152 | int xact; | |
153 | ||
154 | hv = (size_t)(intptr_t)vp ^ (size_t)(intptr_t)vp / sizeof(*vp); | |
155 | hv &= CLUSTER_CACHE_MASK & ~3; | |
156 | cc = &cluster_array[hv]; | |
157 | ||
158 | xact = -1; | |
159 | for (i = 0; i < 4; ++i) { | |
160 | if (cc[i].vp != vp) | |
161 | continue; | |
3f7b7260 | 162 | if (rounddown2(cc[i].v_cstart ^ loffset, CLUSTER_ZONE) == 0) { |
38a4b308 MD |
163 | xact = i; |
164 | break; | |
165 | } | |
166 | } | |
167 | if (xact >= 0 && atomic_swap_int(&cc[xact].locked, 1) == 0) { | |
168 | if (cc[xact].vp == vp && | |
3f7b7260 | 169 | rounddown2(cc[xact].v_cstart ^ loffset, CLUSTER_ZONE) == 0) {
38a4b308 MD |
170 | return(&cc[xact]); |
171 | } | |
172 | atomic_swap_int(&cc[xact].locked, 0); | |
173 | } | |
174 | ||
175 | /* | |
176 | * New entry. If we can't acquire the cache line then use the | |
177 | * passed-in dummy element and reset all fields. | |
178 | * | |
179 | * When we are able to acquire the cache line we only clear the | |
180 | * fields if the vp does not match. This allows us to multi-zone | |
181 | * a vp and for excessive zones / partial clusters to be retired. | |
182 | */ | |
183 | i = cc->iterator++ & 3; | |
184 | cc += i; | |
185 | if (atomic_swap_int(&cc->locked, 1) != 0) { | |
186 | cc = dummy; | |
187 | cc->locked = 1; | |
188 | cc->vp = NULL; | |
189 | } | |
190 | if (cc->vp != vp) { | |
191 | cc->vp = vp; | |
192 | cc->v_lasta = 0; | |
193 | cc->v_clen = 0; | |
194 | cc->v_cstart = 0; | |
195 | cc->v_lastw = 0; | |
196 | } | |
197 | return(cc); | |
198 | } | |
199 | ||
200 | static | |
201 | void | |
202 | cluster_putcache(cluster_cache_t *cc) | |
203 | { | |
204 | atomic_swap_int(&cc->locked, 0); | |
205 | } | |
206 | ||
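The pair is used bracket-style. A minimal caller sketch (hypothetical, modeled on the update sequence cluster_write() performs further down), showing the on-stack dummy that absorbs contention:

```c
/*
 * Editorial sketch, not part of the original file.  cluster_getcache()
 * may return the caller-supplied dummy when the cache line is busy, so
 * the update below is always safe but only sometimes remembered.
 */
static void
example_track_write(struct vnode *vp, off_t loffset, int blksize)
{
	cluster_cache_t dummy;
	cluster_cache_t *cc;

	cc = cluster_getcache(&dummy, vp, loffset);
	cc->v_lastw = loffset + blksize;	/* heuristic only */
	cluster_putcache(cc);
}
```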
984263bc | 207 | /* |
dbb11a6e MD |
208 | * This replaces bread(), providing a synchronous read of the requested |
209 | * buffer plus asynchronous read-ahead within the specified bounds. | |
210 | * | |
211 | * The caller may pre-populate *bpp if it already has the requested buffer | |
212 | * in-hand, else must set *bpp to NULL. Note that the cluster_read() inline | |
213 | * sets *bpp to NULL and then calls cluster_readx() for compatibility. | |
364c022c MD |
214 | * |
215 | * filesize - read-ahead @ blksize will not cross this boundary | |
216 | * loffset - loffset for returned *bpp | |
217 | * blksize - blocksize for returned *bpp and read-ahead bps | |
218 | * minreq - minimum (not a hard minimum) in bytes, typically reflects | |
219 | * a higher level uio resid. | |
220 | * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
221 | * bpp - return buffer (*bpp) for (loffset,blksize) | |
984263bc MD |
222 | */ |
223 | int | |
9c93755a MD |
224 | cluster_readx(struct vnode *vp, off_t filesize, off_t loffset, int blksize, |
225 | int bflags, size_t minreq, size_t maxreq, | |
226 | struct buf **bpp) | |
984263bc MD |
227 | { |
228 | struct buf *bp, *rbp, *reqbp; | |
54078292 MD |
229 | off_t origoffset; |
230 | off_t doffset; | |
231 | int error; | |
984263bc | 232 | int i; |
364c022c MD |
233 | int maxra; |
234 | int maxrbuild; | |
cb1fa82f | 235 | int sr; |
d32579c3 | 236 | int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0; |
984263bc | 237 | |
cb1fa82f | 238 | sr = 0; |
984263bc MD |
239 | |
240 | /* | |
364c022c MD |
241 | * Calculate the desired read-ahead in blksize'd blocks (maxra). |
242 | * To do this we calculate maxreq. | |
6b84c93e | 243 | * |
364c022c MD |
244 | * maxreq typically starts out as a sequential heuristic. If the |
245 | * high level uio/resid is bigger (minreq), we pop maxreq up to | |
246 | * minreq. This represents the case where random I/O is being | |
247 | * performed by userland issuing big read()'s.
6b84c93e | 248 | * |
364c022c MD |
249 | * Then we limit maxreq to max_readahead to ensure it is a reasonable |
250 | * value. | |
251 | * | |
b28ad496 | 252 | * Finally we must ensure that (loffset + maxreq) does not cross the |
364c022c MD |
253 | * boundary (filesize) for the current blocksize. If we allowed it |
254 | * to cross we could end up with buffers past the boundary with the | |
255 | * wrong block size (HAMMER large-data areas use mixed block sizes). | |
b28ad496 | 256 | * minreq is also absolutely limited to filesize. |
984263bc | 257 | */ |
364c022c MD |
258 | if (maxreq < minreq) |
259 | maxreq = minreq; | |
b28ad496 MD |
260 | /* minreq not used beyond this point */ |
261 | ||
364c022c MD |
262 | if (maxreq > max_readahead) { |
263 | maxreq = max_readahead; | |
264 | if (maxreq > 16 * 1024 * 1024) | |
265 | maxreq = 16 * 1024 * 1024; | |
266 | } | |
267 | if (maxreq < blksize) | |
268 | maxreq = blksize; | |
269 | if (loffset + maxreq > filesize) { | |
270 | if (loffset > filesize) | |
271 | maxreq = 0; | |
272 | else | |
273 | maxreq = filesize - loffset; | |
274 | } | |
275 | ||
276 | maxra = (int)(maxreq / blksize); | |
984263bc MD |
277 | |
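	/*
	 * (Editorial aside, illustrative numbers: with blksize = 16KB, a
	 * heuristic maxreq of 512KB, a caller resid (minreq) of 1MB and
	 * the default 2MB max_readahead, far from EOF, maxreq is popped
	 * up to 1MB and maxra becomes 64 blocks.)
	 */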
278 | /* | |
ae8e83e6 | 279 | * Get the requested block. |
984263bc | 280 | */ |
54341a3b MD |
281 | if (*bpp) |
282 | reqbp = bp = *bpp; | |
283 | else | |
d32579c3 | 284 | *bpp = reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0); |
54078292 | 285 | origoffset = loffset; |
984263bc | 286 | |
364c022c MD |
287 | /* |
288 | * Calculate the maximum cluster size for a single I/O, used | |
289 | * by cluster_rbuild(). | |
290 | */ | |
291 | maxrbuild = vmaxiosize(vp) / blksize; | |
292 | ||
984263bc | 293 | /* |
d9a07a60 | 294 | * If it is in the cache, then check to see if the reads have been |
984263bc MD |
295 | * sequential. If they have, then try some read-ahead, otherwise |
296 | * back off on prospective read-aheads.
297 | */ | |
298 | if (bp->b_flags & B_CACHE) { | |
6b84c93e MD |
299 | /* |
300 | * Not sequential, do not do any read-ahead | |
301 | */ | |
364c022c | 302 | if (maxra <= 1) |
984263bc | 303 | return 0; |
6b84c93e MD |
304 | |
305 | /* | |
306 | * No read-ahead mark, do not do any read-ahead | |
307 | * yet. | |
308 | */ | |
309 | if ((bp->b_flags & B_RAM) == 0) | |
984263bc | 310 | return 0; |
b1c20cfa | 311 | |
6b84c93e MD |
312 | /* |
313 | * We hit a read-ahead-mark, figure out how much read-ahead | |
314 | * to do (maxra) and where to start (loffset). | |
315 | * | |
cb1fa82f MD |
316 | * Typically the way this works is that B_RAM is set in the |
317 | * middle of the cluster and triggers an overlapping | |
318 | * read-ahead of 1/2 a cluster more blocks. This ensures | |
319 | * that the cluster read-ahead scales with the read-ahead | |
320 | * count and is thus better able to absorb the caller's
321 | * latency. | |
6b84c93e | 322 | * |
cb1fa82f MD |
323 | * Estimate where the next unread block will be by assuming |
324 | * that the B_RAM's are placed at the half-way point. | |
6b84c93e MD |
325 | */ |
326 | bp->b_flags &= ~B_RAM; | |
327 | ||
cb1fa82f MD |
328 | i = maxra / 2; |
329 | rbp = findblk(vp, loffset + i * blksize, FINDBLK_TEST); | |
330 | if (rbp == NULL || (rbp->b_flags & B_CACHE) == 0) { | |
331 | while (i) { | |
332 | --i; | |
333 | rbp = findblk(vp, loffset + i * blksize, | |
334 | FINDBLK_TEST); | |
335 | if (rbp) { | |
336 | ++i; | |
337 | break; | |
338 | } | |
339 | } | |
340 | } else { | |
341 | while (i < maxra) { | |
342 | rbp = findblk(vp, loffset + i * blksize, | |
343 | FINDBLK_TEST); | |
344 | if (rbp == NULL) | |
345 | break; | |
346 | ++i; | |
984263bc | 347 | } |
984263bc | 348 | } |
364c022c MD |
349 | |
350 | /* | |
351 | * We got everything or everything is in the cache, no | |
352 | * point continuing. | |
353 | */ | |
6b84c93e MD |
354 | if (i >= maxra) |
355 | return 0; | |
616dd1e9 MD |
356 | |
357 | /* | |
358 | * Calculate where to start the read-ahead and how much | |
359 | * to do. Generally speaking we want to read-ahead by | |
360 | * (maxra) when we've found a read-ahead mark. We do | |
361 | * not want to reduce maxra here as it will cause | |
362 | * successive read-ahead I/O's to be smaller and smaller. | |
cf83ee2c MD |
363 | * |
364 | * However, we have to make sure we don't break the | |
365 | * filesize limitation for the clustered operation. | |
616dd1e9 | 366 | */ |
6b84c93e | 367 | loffset += i * blksize; |
984263bc | 368 | reqbp = bp = NULL; |
cf83ee2c MD |
369 | |
370 | if (loffset >= filesize) | |
371 | return 0; | |
372 | if (loffset + maxra * blksize > filesize) { | |
373 | maxreq = filesize - loffset; | |
374 | maxra = (int)(maxreq / blksize); | |
375 | } | |
cb1fa82f MD |
376 | |
377 | /* | |
378 | * Set RAM on first read-ahead block since we still have | |
379 | * approximately maxra/2 blocks ahead of us that are already
380 | * cached or in-progress. | |
381 | */ | |
382 | sr = 1; | |
984263bc | 383 | } else { |
cb1fa82f MD |
384 | /* |
385 | * Start block is not valid, we will want to do a | |
386 | * full read-ahead. | |
387 | */ | |
4d8329e1 | 388 | __debugvar off_t firstread = bp->b_loffset; |
54078292 | 389 | int nblks; |
984263bc | 390 | |
ae8e83e6 MD |
391 | /* |
392 | * Set-up synchronous read for bp. | |
393 | */ | |
394 | bp->b_cmd = BUF_CMD_READ; | |
395 | bp->b_bio1.bio_done = biodone_sync; | |
396 | bp->b_bio1.bio_flags |= BIO_SYNC; | |
397 | ||
81b5c339 MD |
398 | KASSERT(firstread != NOOFFSET, |
399 | ("cluster_read: no buffer offset")); | |
54078292 | 400 | |
cb1fa82f MD |
401 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); |
402 | ||
364c022c | 403 | /* |
cb1fa82f | 404 | * Set RAM half-way through the full-cluster. |
364c022c | 405 | */ |
cb1fa82f | 406 | sr = (maxra + 1) / 2; |
364c022c MD |
407 | |
408 | if (nblks > 1) { | |
409 | int burstbytes; | |
984263bc | 410 | |
e92ca23a MD |
411 | error = VOP_BMAP(vp, loffset, &doffset, |
412 | &burstbytes, NULL, BUF_CMD_READ); | |
984263bc MD |
413 | if (error) |
414 | goto single_block_read; | |
364c022c MD |
415 | if (nblks > burstbytes / blksize) |
416 | nblks = burstbytes / blksize; | |
54078292 | 417 | if (doffset == NOOFFSET) |
984263bc | 418 | goto single_block_read; |
364c022c | 419 | if (nblks <= 1) |
984263bc | 420 | goto single_block_read; |
984263bc | 421 | |
54078292 | 422 | bp = cluster_rbuild(vp, filesize, loffset, |
cb1fa82f | 423 | doffset, blksize, nblks, bp, &sr); |
54078292 | 424 | loffset += bp->b_bufsize; |
364c022c | 425 | maxra -= bp->b_bufsize / blksize; |
984263bc MD |
426 | } else { |
427 | single_block_read: | |
428 | /* | |
364c022c | 429 | * If it isn't in the cache, then get a chunk from |
984263bc MD |
430 | * disk if sequential, otherwise just get the block. |
431 | */ | |
e92ca23a | 432 | loffset += blksize; |
364c022c | 433 | --maxra; |
984263bc MD |
434 | } |
435 | } | |
436 | ||
984263bc | 437 | /* |
ae8e83e6 MD |
438 | * If B_CACHE was not set issue bp. bp will either be an |
439 | * asynchronous cluster buf or a synchronous single-buf. | |
440 | * If it is a single buf it will be the same as reqbp. | |
441 | * | |
442 | * NOTE: Once an async cluster buf is issued bp becomes invalid. | |
984263bc MD |
443 | */ |
444 | if (bp) { | |
445 | #if defined(CLUSTERDEBUG) | |
446 | if (rcluster) | |
364c022c MD |
447 | kprintf("S(%012jx,%d,%d)\n", |
448 | (intmax_t)bp->b_loffset, bp->b_bcount, maxra); | |
984263bc | 449 | #endif |
10f3fee5 MD |
450 | if ((bp->b_flags & B_CLUSTER) == 0) |
451 | vfs_busy_pages(vp, bp); | |
9c93755a MD |
452 | bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
453 | bp->b_flags |= bflags; | |
81b5c339 | 454 | vn_strategy(vp, &bp->b_bio1); |
ae8e83e6 | 455 | /* bp invalid now */ |
dbb11a6e | 456 | bp = NULL; |
984263bc MD |
457 | } |
458 | ||
cb1fa82f MD |
459 | #if defined(CLUSTERDEBUG) |
460 | if (rcluster) | |
461 | kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n", | |
462 | loffset, blksize, maxra, sr); | |
463 | #endif | |
464 | ||
984263bc | 465 | /* |
bfda7080 | 466 | * If we have been doing sequential I/O, then do some read-ahead. |
6b84c93e MD |
467 | * The code above us should have positioned us at the next likely |
468 | * offset. | |
0728eafc MD |
469 | * |
470 | * Only mess with buffers which we can immediately lock. HAMMER | |
471 | * will do device-readahead irrespective of what the blocks | |
472 | * represent. | |
cb1fa82f MD |
473 | * |
474 | * Set B_RAM on the first buffer (the next likely offset needing | |
475 | * read-ahead), under the assumption that there are still | |
476 | * approximately maxra/2 good blocks ahead of us.
984263bc | 477 | */ |
cb1fa82f | 478 | while (maxra > 0) { |
bfda7080 | 479 | int burstbytes; |
364c022c | 480 | int nblks; |
bfda7080 | 481 | |
b77cfc40 | 482 | rbp = getblk(vp, loffset, blksize, |
d32579c3 MD |
483 | GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO, |
484 | 0); | |
cb1fa82f MD |
485 | #if defined(CLUSTERDEBUG) |
486 | if (rcluster) { | |
487 | kprintf("read-ahead %016jx rbp=%p ", | |
488 | loffset, rbp); | |
489 | } | |
490 | #endif | |
b77cfc40 MD |
491 | if (rbp == NULL) |
492 | goto no_read_ahead; | |
bfda7080 | 493 | if ((rbp->b_flags & B_CACHE)) { |
984263bc | 494 | bqrelse(rbp); |
bfda7080 SS |
495 | goto no_read_ahead; |
496 | } | |
497 | ||
ac7ffc8a | 498 | /* |
cb1fa82f MD |
499 | * If BMAP is not supported or has an issue, we still do |
500 | * (maxra) read-ahead, but we do not try to use rbuild. | |
ac7ffc8a | 501 | */ |
cb1fa82f MD |
502 | error = VOP_BMAP(vp, loffset, &doffset, |
503 | &burstbytes, NULL, BUF_CMD_READ); | |
504 | if (error || doffset == NOOFFSET) { | |
505 | nblks = 1; | |
506 | doffset = NOOFFSET; | |
507 | } else { | |
508 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); | |
509 | if (nblks > burstbytes / blksize) | |
510 | nblks = burstbytes / blksize; | |
bfda7080 | 511 | } |
ae8e83e6 | 512 | rbp->b_cmd = BUF_CMD_READ; |
ae8e83e6 | 513 | |
364c022c | 514 | if (nblks > 1) { |
bfda7080 | 515 | rbp = cluster_rbuild(vp, filesize, loffset, |
e92ca23a | 516 | doffset, blksize, |
cb1fa82f | 517 | nblks, rbp, &sr); |
984263bc | 518 | } else { |
bfda7080 | 519 | rbp->b_bio2.bio_offset = doffset; |
cb1fa82f MD |
520 | if (--sr == 0) |
521 | cluster_setram(rbp); | |
bfda7080 | 522 | } |
364c022c | 523 | |
9c93755a MD |
524 | rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
525 | rbp->b_flags |= bflags; | |
10f3fee5 | 526 | |
bfda7080 SS |
527 | if ((rbp->b_flags & B_CLUSTER) == 0) |
528 | vfs_busy_pages(vp, rbp); | |
ae8e83e6 | 529 | BUF_KERNPROC(rbp); |
6b84c93e MD |
530 | loffset += rbp->b_bufsize; |
531 | maxra -= rbp->b_bufsize / blksize; | |
bfda7080 | 532 | vn_strategy(vp, &rbp->b_bio1); |
ae8e83e6 | 533 | /* rbp invalid now */ |
984263bc | 534 | } |
bfda7080 | 535 | |
ae8e83e6 MD |
536 | /* |
537 | * Wait for our original buffer to complete its I/O. reqbp will | |
538 | * be NULL if the original buffer was B_CACHE. We are returning | |
539 | * (*bpp) which is the same as reqbp when reqbp != NULL. | |
540 | */ | |
541 | no_read_ahead: | |
542 | if (reqbp) { | |
543 | KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC); | |
544 | error = biowait(&reqbp->b_bio1, "clurd"); | |
cb1fa82f MD |
545 | } else { |
546 | error = 0; | |
ae8e83e6 MD |
547 | } |
548 | return (error); | |
984263bc MD |
549 | } |
550 | ||
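The cluster_read() compatibility inline referenced in the header comment above lives in a header rather than in this file; an assumed shape (sketch, not a quote of the real declaration):

```c
/*
 * Editorial sketch: cluster_read() zeroes *bpp and forwards to
 * cluster_readx(), per the header comment above.  The exact parameter
 * list is an assumption.
 */
static __inline int
cluster_read(struct vnode *vp, off_t filesize, off_t loffset, int blksize,
	     int bflags, size_t minreq, size_t maxreq, struct buf **bpp)
{
	*bpp = NULL;
	return (cluster_readx(vp, filesize, loffset, blksize, bflags,
			      minreq, maxreq, bpp));
}
```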
dbb11a6e MD |
551 | /* |
552 | * This replaces breadcb(), providing an asynchronous read of the requested | |
553 | * buffer with a callback, plus an asynchronous read-ahead within the | |
554 | * specified bounds. | |
555 | * | |
556 | * The callback must check whether BIO_DONE is set in the bio and issue | |
557 | * bpdone(bp, 0) if it isn't. The callback is responsible for clearing
558 | * BIO_DONE and disposing of the I/O (bqrelse()ing it). | |
559 | * | |
560 | * filesize - read-ahead @ blksize will not cross this boundary | |
561 | * loffset - loffset for returned *bpp | |
562 | * blksize - blocksize for returned *bpp and read-ahead bps | |
563 | * minreq - minimum (not a hard minimum) in bytes, typically reflects | |
564 | * a higher level uio resid. | |
565 | * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB)
566 | * bpp - return buffer (*bpp) for (loffset,blksize) | |
567 | */ | |
568 | void | |
9c93755a MD |
569 | cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset, int blksize, |
570 | int bflags, size_t minreq, size_t maxreq, | |
571 | void (*func)(struct bio *), void *arg) | |
dbb11a6e MD |
572 | { |
573 | struct buf *bp, *rbp, *reqbp; | |
574 | off_t origoffset; | |
575 | off_t doffset; | |
576 | int i; | |
577 | int maxra; | |
578 | int maxrbuild; | |
cb1fa82f | 579 | int sr; |
d32579c3 | 580 | int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0; |
cb1fa82f MD |
581 | |
582 | sr = 0; | |
dbb11a6e MD |
583 | |
584 | /* | |
585 | * Calculate the desired read-ahead in blksize'd blocks (maxra). | |
586 | * To do this we calculate maxreq. | |
587 | * | |
588 | * maxreq typically starts out as a sequential heuristic. If the | |
589 | * high level uio/resid is bigger (minreq), we pop maxreq up to | |
590 | * minreq. This represents the case where random I/O is being | |
591 | * performed by userland issuing big read()'s.
592 | * | |
593 | * Then we limit maxreq to max_readahead to ensure it is a reasonable | |
594 | * value. | |
595 | * | |
596 | * Finally we must ensure that (loffset + maxreq) does not cross the | |
597 | * boundary (filesize) for the current blocksize. If we allowed it | |
598 | * to cross we could end up with buffers past the boundary with the | |
599 | * wrong block size (HAMMER large-data areas use mixed block sizes). | |
600 | * minreq is also absolutely limited to filesize. | |
601 | */ | |
602 | if (maxreq < minreq) | |
603 | maxreq = minreq; | |
604 | /* minreq not used beyond this point */ | |
605 | ||
606 | if (maxreq > max_readahead) { | |
607 | maxreq = max_readahead; | |
608 | if (maxreq > 16 * 1024 * 1024) | |
609 | maxreq = 16 * 1024 * 1024; | |
610 | } | |
611 | if (maxreq < blksize) | |
612 | maxreq = blksize; | |
613 | if (loffset + maxreq > filesize) { | |
614 | if (loffset > filesize) | |
615 | maxreq = 0; | |
616 | else | |
617 | maxreq = filesize - loffset; | |
618 | } | |
619 | ||
620 | maxra = (int)(maxreq / blksize); | |
621 | ||
622 | /* | |
623 | * Get the requested block. | |
624 | */ | |
d32579c3 | 625 | reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0); |
dbb11a6e MD |
626 | origoffset = loffset; |
627 | ||
628 | /* | |
629 | * Calculate the maximum cluster size for a single I/O, used | |
630 | * by cluster_rbuild(). | |
631 | */ | |
632 | maxrbuild = vmaxiosize(vp) / blksize; | |
633 | ||
634 | /* | |
635 | * If it is in the cache, then check to see if the reads have been
636 | * sequential. If they have, then try some read-ahead, otherwise | |
637 | * back off on prospective read-aheads.
638 | */ | |
639 | if (bp->b_flags & B_CACHE) { | |
640 | /* | |
641 | * Setup for func() call whether we do read-ahead or not. | |
642 | */ | |
643 | bp->b_bio1.bio_caller_info1.ptr = arg; | |
644 | bp->b_bio1.bio_flags |= BIO_DONE; | |
645 | ||
646 | /* | |
647 | * Not sequential, do not do any read-ahead | |
648 | */ | |
649 | if (maxra <= 1) | |
650 | goto no_read_ahead; | |
651 | ||
652 | /* | |
653 | * No read-ahead mark, do not do any read-ahead | |
654 | * yet. | |
655 | */ | |
656 | if ((bp->b_flags & B_RAM) == 0) | |
657 | goto no_read_ahead; | |
658 | bp->b_flags &= ~B_RAM; | |
659 | ||
660 | /* | |
661 | * We hit a read-ahead-mark, figure out how much read-ahead | |
662 | * to do (maxra) and where to start (loffset). | |
663 | * | |
664 | * Shortcut the scan. Typically the way this works is that | |
665 | * we've built up all the blocks in between except for the
666 | * last in previous iterations, so if the second-to-last | |
667 | * block is present we just skip ahead to it. | |
668 | * | |
669 | * This algorithm has O(1) cpu in the steady state no | |
670 | * matter how large maxra is. | |
671 | */ | |
672 | if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST)) | |
673 | i = maxra - 1; | |
674 | else | |
675 | i = 1; | |
676 | while (i < maxra) { | |
677 | if (findblk(vp, loffset + i * blksize, | |
678 | FINDBLK_TEST) == NULL) { | |
679 | break; | |
680 | } | |
681 | ++i; | |
682 | } | |
683 | ||
684 | /* | |
685 | * We got everything or everything is in the cache, no | |
686 | * point continuing. | |
687 | */ | |
688 | if (i >= maxra) | |
689 | goto no_read_ahead; | |
690 | ||
691 | /* | |
692 | * Calculate where to start the read-ahead and how much | |
693 | * to do. Generally speaking we want to read-ahead by | |
694 | * (maxra) when we've found a read-ahead mark. We do | |
695 | * not want to reduce maxra here as it will cause | |
696 | * successive read-ahead I/O's to be smaller and smaller. | |
697 | * | |
698 | * However, we have to make sure we don't break the | |
699 | * filesize limitation for the clustered operation. | |
700 | */ | |
701 | loffset += i * blksize; | |
702 | bp = NULL; | |
703 | /* leave reqbp intact to force function callback */ | |
704 | ||
705 | if (loffset >= filesize) | |
706 | goto no_read_ahead; | |
707 | if (loffset + maxra * blksize > filesize) { | |
708 | maxreq = filesize - loffset; | |
709 | maxra = (int)(maxreq / blksize); | |
710 | } | |
cb1fa82f | 711 | sr = 1; |
dbb11a6e | 712 | } else { |
cb1fa82f MD |
713 | /* |
714 | * bp is not valid, no prior cluster in progress so get a | |
715 | * full cluster read-ahead going. | |
716 | */ | |
dbb11a6e MD |
717 | __debugvar off_t firstread = bp->b_loffset; |
718 | int nblks; | |
cb1fa82f | 719 | int error; |
dbb11a6e MD |
720 | |
721 | /* | |
722 | * Set-up synchronous read for bp. | |
723 | */ | |
9c93755a MD |
724 | bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA); |
725 | bp->b_flags |= bflags; | |
dbb11a6e MD |
726 | bp->b_cmd = BUF_CMD_READ; |
727 | bp->b_bio1.bio_done = func; | |
728 | bp->b_bio1.bio_caller_info1.ptr = arg; | |
729 | BUF_KERNPROC(bp); | |
730 | reqbp = NULL; /* don't func() reqbp, it's running async */ | |
731 | ||
732 | KASSERT(firstread != NOOFFSET, | |
733 | ("cluster_read: no buffer offset")); | |
734 | ||
735 | /* | |
736 | * nblks is our cluster_rbuild request size, limited | |
737 | * primarily by the device. | |
738 | */ | |
cb1fa82f MD |
739 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); |
740 | ||
741 | /* | |
742 | * Set RAM half-way through the full-cluster. | |
743 | */ | |
744 | sr = (maxra + 1) / 2; | |
dbb11a6e MD |
745 | |
746 | if (nblks > 1) { | |
747 | int burstbytes; | |
748 | ||
cb1fa82f MD |
749 | error = VOP_BMAP(vp, loffset, &doffset, |
750 | &burstbytes, NULL, BUF_CMD_READ); | |
751 | if (error) | |
dbb11a6e MD |
752 | goto single_block_read; |
753 | if (nblks > burstbytes / blksize) | |
754 | nblks = burstbytes / blksize; | |
755 | if (doffset == NOOFFSET) | |
756 | goto single_block_read; | |
757 | if (nblks <= 1) | |
758 | goto single_block_read; | |
759 | ||
760 | bp = cluster_rbuild(vp, filesize, loffset, | |
cb1fa82f | 761 | doffset, blksize, nblks, bp, &sr); |
dbb11a6e MD |
762 | loffset += bp->b_bufsize; |
763 | maxra -= bp->b_bufsize / blksize; | |
764 | } else { | |
765 | single_block_read: | |
766 | /* | |
767 | * If it isn't in the cache, then get a chunk from | |
768 | * disk if sequential, otherwise just get the block. | |
769 | */ | |
dbb11a6e MD |
770 | loffset += blksize; |
771 | --maxra; | |
772 | } | |
773 | } | |
774 | ||
775 | /* | |
776 | * If bp != NULL then B_CACHE was *NOT* set and bp must be issued. | |
777 | * bp will either be an asynchronous cluster buf or an asynchronous | |
778 | * single-buf. | |
779 | * | |
780 | * NOTE: Once an async cluster buf is issued bp becomes invalid. | |
781 | */ | |
782 | if (bp) { | |
783 | #if defined(CLUSTERDEBUG) | |
784 | if (rcluster) | |
785 | kprintf("S(%012jx,%d,%d)\n", | |
786 | (intmax_t)bp->b_loffset, bp->b_bcount, maxra); | |
787 | #endif | |
788 | if ((bp->b_flags & B_CLUSTER) == 0) | |
789 | vfs_busy_pages(vp, bp); | |
9c93755a MD |
790 | bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
791 | bp->b_flags |= bflags; | |
dbb11a6e MD |
792 | vn_strategy(vp, &bp->b_bio1); |
793 | /* bp invalid now */ | |
794 | bp = NULL; | |
795 | } | |
796 | ||
cb1fa82f MD |
797 | #if defined(CLUSTERDEBUG) |
798 | if (rcluster) | |
799 | kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n", | |
800 | loffset, blksize, maxra, sr); | |
801 | #endif | |
802 | ||
dbb11a6e MD |
803 | /* |
804 | * If we have been doing sequential I/O, then do some read-ahead. | |
805 | * The code above us should have positioned us at the next likely | |
806 | * offset. | |
807 | * | |
808 | * Only mess with buffers which we can immediately lock. HAMMER | |
809 | * will do device-readahead irrespective of what the blocks | |
810 | * represent. | |
811 | */ | |
812 | while (maxra > 0) { | |
813 | int burstbytes; | |
cb1fa82f | 814 | int error; |
dbb11a6e MD |
815 | int nblks; |
816 | ||
817 | rbp = getblk(vp, loffset, blksize, | |
d32579c3 MD |
818 | GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO, |
819 | 0); | |
dbb11a6e MD |
820 | if (rbp == NULL) |
821 | goto no_read_ahead; | |
822 | if ((rbp->b_flags & B_CACHE)) { | |
823 | bqrelse(rbp); | |
824 | goto no_read_ahead; | |
825 | } | |
826 | ||
827 | /* | |
cb1fa82f MD |
828 | * If BMAP is not supported or has an issue, we still do |
829 | * (maxra) read-ahead, but we do not try to use rbuild. | |
dbb11a6e | 830 | */ |
cb1fa82f MD |
831 | error = VOP_BMAP(vp, loffset, &doffset, |
832 | &burstbytes, NULL, BUF_CMD_READ); | |
833 | if (error || doffset == NOOFFSET) { | |
834 | nblks = 1; | |
835 | doffset = NOOFFSET; | |
836 | } else { | |
837 | nblks = calc_rbuild_reqsize(maxra, maxrbuild); | |
838 | if (nblks > burstbytes / blksize) | |
839 | nblks = burstbytes / blksize; | |
dbb11a6e | 840 | } |
dbb11a6e | 841 | rbp->b_cmd = BUF_CMD_READ; |
dbb11a6e MD |
842 | |
843 | if (nblks > 1) { | |
844 | rbp = cluster_rbuild(vp, filesize, loffset, | |
845 | doffset, blksize, | |
cb1fa82f | 846 | nblks, rbp, &sr); |
dbb11a6e MD |
847 | } else { |
848 | rbp->b_bio2.bio_offset = doffset; | |
cb1fa82f MD |
849 | if (--sr == 0) |
850 | cluster_setram(rbp); | |
dbb11a6e MD |
851 | } |
852 | ||
9c93755a MD |
853 | rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA); |
854 | rbp->b_flags |= bflags; | |
dbb11a6e MD |
855 | |
856 | if ((rbp->b_flags & B_CLUSTER) == 0) | |
857 | vfs_busy_pages(vp, rbp); | |
858 | BUF_KERNPROC(rbp); | |
859 | loffset += rbp->b_bufsize; | |
860 | maxra -= rbp->b_bufsize / blksize; | |
861 | vn_strategy(vp, &rbp->b_bio1); | |
862 | /* rbp invalid now */ | |
863 | } | |
864 | ||
865 | /* | |
866 | * If reqbp is non-NULL it had B_CACHE set and we issue the | |
867 | * function callback synchronously. | |
868 | * | |
869 | * Note that we may start additional asynchronous I/O before doing | |
870 | * the func() callback for the B_CACHE case.
871 | */ | |
872 | no_read_ahead: | |
873 | if (reqbp) | |
874 | func(&reqbp->b_bio1); | |
875 | } | |
876 | ||
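A minimal completion callback satisfying the contract in the cluster_readcb() header comment could look like the hypothetical consumer below (example_read_done and the consumption step are assumptions, not code from this file):

```c
/*
 * Editorial sketch.  Finish the I/O if BIO_DONE is not already set,
 * clear BIO_DONE, then dispose of the buffer with bqrelse(), exactly
 * as the header comment requires.
 */
static void
example_read_done(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;

	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(bp, 0);
	bio->bio_flags &= ~BIO_DONE;
	/* ... consume bp->b_data, check B_ERROR ... */
	bqrelse(bp);
}
```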
984263bc MD |
877 | /* |
878 | * If blocks are contiguous on disk, use this to provide clustered | |
879 | * read ahead. We will read as many blocks as possible sequentially | |
880 | * and then parcel them up into logical blocks in the buffer hash table. | |
ae8e83e6 MD |
881 | * |
882 | * This function either returns a cluster buf or it returns fbp. fbp is | |
883 | * already expected to be set up as a synchronous or asynchronous request. | |
884 | * | |
885 | * If a cluster buf is returned it will always be async. | |
cb1fa82f MD |
886 | * |
887 | * (*srp) counts down original blocks to determine where B_RAM should be set. | |
888 | * Set B_RAM when *srp drops to 0. If (*srp) starts at 0, B_RAM will not be | |
889 | * set on any buffer. Make sure B_RAM is cleared on any other buffers to | |
890 | * prevent degenerate read-aheads from being generated. | |
984263bc MD |
891 | */ |
892 | static struct buf * | |
ae8e83e6 | 893 | cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset, |
cb1fa82f | 894 | int blksize, int run, struct buf *fbp, int *srp) |
984263bc MD |
895 | { |
896 | struct buf *bp, *tbp; | |
54078292 MD |
897 | off_t boffset; |
898 | int i, j; | |
2ec4b00d | 899 | int maxiosize = vmaxiosize(vp); |
984263bc | 900 | |
984263bc MD |
901 | /* |
902 | * avoid a division | |
903 | */ | |
e92ca23a | 904 | while (loffset + run * blksize > filesize) { |
984263bc MD |
905 | --run; |
906 | } | |
907 | ||
6260e485 | 908 | tbp = fbp; |
54078292 | 909 | tbp->b_bio2.bio_offset = doffset; |
8158299a | 910 | if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) { |
cb1fa82f MD |
911 | if (--*srp == 0) |
912 | cluster_setram(tbp); | |
913 | else | |
914 | cluster_clrram(tbp); | |
984263bc | 915 | return tbp; |
10f3fee5 | 916 | } |
984263bc | 917 | |
d84f6fa1 MD |
918 | /* |
919 | * Get a pbuf, limit cluster I/O on a per-device basis. If | |
920 | * doing cluster I/O for a file, limit cluster I/O on a | |
921 | * per-mount basis. | |
922 | */ | |
923 | if (vp->v_type == VCHR || vp->v_type == VBLK) | |
924 | bp = trypbuf_kva(&vp->v_pbuf_count); | |
925 | else | |
926 | bp = trypbuf_kva(&vp->v_mount->mnt_pbuf_count); | |
927 | ||
928 | if (bp == NULL) | |
984263bc MD |
929 | return tbp; |
930 | ||
931 | /* | |
932 | * We are synthesizing a buffer out of vm_page_t's, but | |
933 | * if the block size is not page aligned then the starting | |
934 | * address may not be either. Inherit the b_data offset | |
935 | * from the original buffer. | |
936 | */ | |
d84f6fa1 | 937 | bp->b_vp = vp; |
984263bc | 938 | bp->b_data = (char *)((vm_offset_t)bp->b_data | |
d32579c3 MD |
939 | ((vm_offset_t)tbp->b_data & PAGE_MASK)); |
940 | bp->b_flags |= B_CLUSTER | B_VMIO | B_KVABIO; | |
10f3fee5 | 941 | bp->b_cmd = BUF_CMD_READ; |
ae8e83e6 | 942 | bp->b_bio1.bio_done = cluster_callback; /* default to async */ |
81b5c339 MD |
943 | bp->b_bio1.bio_caller_info1.cluster_head = NULL; |
944 | bp->b_bio1.bio_caller_info2.cluster_tail = NULL; | |
54078292 | 945 | bp->b_loffset = loffset; |
e92ca23a | 946 | bp->b_bio2.bio_offset = doffset; |
81b5c339 MD |
947 | KASSERT(bp->b_loffset != NOOFFSET, |
948 | ("cluster_rbuild: no buffer offset")); | |
984263bc | 949 | |
984263bc MD |
950 | bp->b_bcount = 0; |
951 | bp->b_bufsize = 0; | |
54f51aeb | 952 | bp->b_xio.xio_npages = 0; |
984263bc | 953 | |
e92ca23a | 954 | for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) { |
10f3fee5 | 955 | if (i) { |
54f51aeb | 956 | if ((bp->b_xio.xio_npages * PAGE_SIZE) + |
e92ca23a | 957 | round_page(blksize) > maxiosize) { |
984263bc MD |
958 | break; |
959 | } | |
960 | ||
961 | /* | |
962 | * Shortcut some checks and try to avoid buffers that | |
963 | * would block in the lock. The same checks have to | |
964 | * be made again after we officially get the buffer. | |
965 | */ | |
b77cfc40 | 966 | tbp = getblk(vp, loffset + i * blksize, blksize, |
d32579c3 MD |
967 | GETBLK_SZMATCH | |
968 | GETBLK_NOWAIT | | |
969 | GETBLK_KVABIO, | |
970 | 0); | |
b77cfc40 MD |
971 | if (tbp == NULL) |
972 | break; | |
973 | for (j = 0; j < tbp->b_xio.xio_npages; j++) { | |
974 | if (tbp->b_xio.xio_pages[j]->valid) | |
984263bc MD |
975 | break; |
976 | } | |
b77cfc40 MD |
977 | if (j != tbp->b_xio.xio_npages) { |
978 | bqrelse(tbp); | |
979 | break; | |
980 | } | |
984263bc MD |
981 | |
982 | /* | |
983 | * Stop scanning if the buffer is fully valid
984 | * (marked B_CACHE), or locked (may be doing a | |
985 | * background write), or if the buffer is not | |
986 | * VMIO backed. The clustering code can only deal | |
987 | * with VMIO-backed buffers. | |
988 | */ | |
989 | if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || | |
27bc0cb1 MD |
990 | (tbp->b_flags & B_VMIO) == 0 || |
991 | (LIST_FIRST(&tbp->b_dep) != NULL && | |
992 | buf_checkread(tbp)) | |
993 | ) { | |
984263bc MD |
994 | bqrelse(tbp); |
995 | break; | |
996 | } | |
997 | ||
998 | /* | |
999 | * The buffer must be completely invalid in order to | |
1000 | * take part in the cluster. If it is partially valid | |
1001 | * then we stop. | |
1002 | */ | |
54f51aeb HP |
1003 | for (j = 0; j < tbp->b_xio.xio_npages; j++) {
1004 | if (tbp->b_xio.xio_pages[j]->valid) | |
984263bc MD |
1005 | break; |
1006 | } | |
54f51aeb | 1007 | if (j != tbp->b_xio.xio_npages) { |
984263bc MD |
1008 | bqrelse(tbp); |
1009 | break; | |
1010 | } | |
1011 | ||
b86460bf MD |
1012 | /* |
1013 | * Depress the priority of buffers not explicitly | |
1014 | * requested. | |
1015 | */ | |
e92ca23a | 1016 | /* tbp->b_flags |= B_AGE; */ |
b86460bf | 1017 | |
984263bc | 1018 | /* |
984263bc MD |
1019 | * Set the block number if it isn't set, otherwise |
1020 | * if it is make sure it matches the block number we | |
1021 | * expect. | |
1022 | */ | |
54078292 MD |
1023 | if (tbp->b_bio2.bio_offset == NOOFFSET) { |
1024 | tbp->b_bio2.bio_offset = boffset; | |
1025 | } else if (tbp->b_bio2.bio_offset != boffset) { | |
984263bc MD |
1026 | brelse(tbp); |
1027 | break; | |
1028 | } | |
1029 | } | |
ae8e83e6 | 1030 | |
cb1fa82f MD |
1031 | /* |
1032 | * Set B_RAM if (*srp) is 1. B_RAM is only set on one buffer | |
1033 | * in the cluster, including potentially the first buffer | |
1034 | * once we start streaming the read-aheads. | |
1035 | */ | |
1036 | if (--*srp == 0) | |
1037 | cluster_setram(tbp); | |
1038 | else | |
1039 | cluster_clrram(tbp); | |
1040 | ||
984263bc | 1041 | /* |
ae8e83e6 MD |
1042 | * The passed-in tbp (i == 0) will already be set up for |
1043 | * async or sync operation. All other tbp's acquired in
1044 | * our loop are set up for async operation. | |
984263bc | 1045 | */ |
10f3fee5 | 1046 | tbp->b_cmd = BUF_CMD_READ; |
984263bc | 1047 | BUF_KERNPROC(tbp); |
81b5c339 | 1048 | cluster_append(&bp->b_bio1, tbp); |
54078292 | 1049 | for (j = 0; j < tbp->b_xio.xio_npages; ++j) { |
984263bc | 1050 | vm_page_t m; |
b12defdc | 1051 | |
54f51aeb | 1052 | m = tbp->b_xio.xio_pages[j]; |
b12defdc | 1053 | vm_page_busy_wait(m, FALSE, "clurpg"); |
984263bc | 1054 | vm_page_io_start(m); |
b12defdc | 1055 | vm_page_wakeup(m); |
984263bc | 1056 | vm_object_pip_add(m->object, 1); |
54f51aeb | 1057 | if ((bp->b_xio.xio_npages == 0) || |
c1f5cf51 | 1058 | (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) { |
54f51aeb HP |
1059 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
1060 | bp->b_xio.xio_npages++; | |
984263bc | 1061 | } |
ca88a24a | 1062 | if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) { |
54f51aeb | 1063 | tbp->b_xio.xio_pages[j] = bogus_page; |
ca88a24a MD |
1064 | tbp->b_flags |= B_HASBOGUS; |
1065 | } | |
984263bc MD |
1066 | } |
1067 | /* | |
1068 | * XXX shouldn't this be += size for both, like in | |
1069 | * cluster_wbuild()? | |
1070 | * | |
1071 | * Don't inherit tbp->b_bufsize as it may be larger due to | |
1072 | * a non-page-aligned size. Instead just aggregate using | |
1073 | * 'size'. | |
1074 | */ | |
e92ca23a MD |
1075 | if (tbp->b_bcount != blksize) |
1076 | kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize); | |
1077 | if (tbp->b_bufsize != blksize) | |
1078 | kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize); | |
1079 | bp->b_bcount += blksize; | |
1080 | bp->b_bufsize += blksize; | |
984263bc MD |
1081 | } |
1082 | ||
1083 | /* | |
1084 | * Fully valid pages in the cluster are already good and do not need | |
1085 | * to be re-read from disk. Replace the page with bogus_page.
1086 | */ | |
54f51aeb HP |
1087 | for (j = 0; j < bp->b_xio.xio_npages; j++) { |
1088 | if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) == | |
984263bc | 1089 | VM_PAGE_BITS_ALL) { |
54f51aeb | 1090 | bp->b_xio.xio_pages[j] = bogus_page; |
ca88a24a | 1091 | bp->b_flags |= B_HASBOGUS; |
984263bc MD |
1092 | } |
1093 | } | |
312dcd01 | 1094 | if (bp->b_bufsize > bp->b_kvasize) { |
54078292 | 1095 | panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)", |
984263bc | 1096 | bp->b_bufsize, bp->b_kvasize); |
312dcd01 | 1097 | } |
d32579c3 MD |
1098 | pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data), |
1099 | (vm_page_t *)bp->b_xio.xio_pages, | |
1100 | bp->b_xio.xio_npages); | |
ae8e83e6 | 1101 | BUF_KERNPROC(bp); |
984263bc MD |
1102 | return (bp); |
1103 | } | |
1104 | ||
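Worked example of the (*srp) countdown described in the header comment (editorial aside): entering cluster_rbuild() with *srp == 3 for a five-block cluster, the decrement reaches zero on the third component buffer, so B_RAM is set there and explicitly cleared on the other four; entering with *srp == 0, the counter only goes negative and B_RAM is never placed.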
1105 | /* | |
1106 | * Cleanup after a clustered read or write. | |
1107 | * This is complicated by the fact that any of the buffers might have | |
1108 | * extra memory (if there were no empty buffer headers at allocbuf time) | |
1109 | * that we will need to shift around. | |
81b5c339 MD |
1110 | * |
1111 | * The returned bio is &bp->b_bio1 | |
984263bc | 1112 | */ |
59b728a7 | 1113 | static void |
81b5c339 | 1114 | cluster_callback(struct bio *bio) |
984263bc | 1115 | { |
81b5c339 MD |
1116 | struct buf *bp = bio->bio_buf; |
1117 | struct buf *tbp; | |
c3c895a6 | 1118 | struct buf *next; |
d84f6fa1 | 1119 | struct vnode *vp; |
984263bc | 1120 | int error = 0; |
c3c895a6 | 1121 | int bpflags; |
984263bc MD |
1122 | |
1123 | /* | |
9a71d53f MD |
1124 | * Must propagate errors to all the components. A short read (EOF)
1125 | * is a critical error. | |
984263bc | 1126 | */ |
9a71d53f | 1127 | if (bp->b_flags & B_ERROR) { |
984263bc | 1128 | error = bp->b_error; |
9a71d53f MD |
1129 | } else if (bp->b_bcount != bp->b_bufsize) { |
1130 | panic("cluster_callback: unexpected EOF on cluster %p!", bio); | |
1131 | } | |
984263bc | 1132 | |
d32579c3 MD |
1133 | pmap_qremove_noinval(trunc_page((vm_offset_t) bp->b_data), |
1134 | bp->b_xio.xio_npages); | |
c3c895a6 MD |
1135 | |
1136 | /* | |
1137 | * Retrieve the cluster head and dispose of the cluster buffer. | |
1138 | * The vp is only valid while we hold one or more cluster elements,
1139 | * so we have to do this before disposing of them. | |
1140 | */ | |
1141 | tbp = bio->bio_caller_info1.cluster_head; | |
1142 | bio->bio_caller_info1.cluster_head = NULL; | |
1143 | bpflags = bp->b_flags; | |
1144 | vp = bp->b_vp; | |
1145 | bp->b_vp = NULL; | |
1146 | ||
1147 | if (vp->v_type == VCHR || vp->v_type == VBLK) | |
1148 | relpbuf(bp, &vp->v_pbuf_count); | |
1149 | else | |
1150 | relpbuf(bp, &vp->v_mount->mnt_pbuf_count); | |
1151 | bp = NULL; /* SAFETY */ | |
1152 | ||
984263bc MD |
1153 | /* |
1154 | * Move memory from the large cluster buffer into the component | |
81b5c339 MD |
1155 | * buffers and mark IO as done on these. Since the memory map |
1156 | * is the same, no actual copying is required. | |
c3c895a6 MD |
1157 | * |
1158 | * (And we already disposed of the larger cluster buffer) | |
984263bc | 1159 | */ |
c3c895a6 MD |
1160 | while (tbp) { |
1161 | next = tbp->b_cluster_next; | |
984263bc | 1162 | if (error) { |
3b2afb67 | 1163 | tbp->b_flags |= B_ERROR | B_IOISSUED; |
984263bc MD |
1164 | tbp->b_error = error; |
1165 | } else { | |
1166 | tbp->b_dirtyoff = tbp->b_dirtyend = 0; | |
9c93755a MD |
1167 | tbp->b_flags &= ~(B_ERROR | B_INVAL); |
1168 | if (tbp->b_cmd == BUF_CMD_READ) { | |
1169 | tbp->b_flags = (tbp->b_flags & ~B_NOTMETA) | | |
c3c895a6 | 1170 | (bpflags & B_NOTMETA); |
9c93755a | 1171 | } |
3b2afb67 | 1172 | tbp->b_flags |= B_IOISSUED; |
c3c895a6 | 1173 | |
984263bc MD |
1174 | /* |
1175 | * XXX the bdwrite()/bqrelse() issued during | |
1176 | * cluster building clears B_RELBUF (see bqrelse() | |
1177 | * comment). If direct I/O was specified, we have | |
1178 | * to restore it here to allow the buffer and VM | |
1179 | * to be freed. | |
1180 | */ | |
1181 | if (tbp->b_flags & B_DIRECT) | |
1182 | tbp->b_flags |= B_RELBUF; | |
ffd3e597 MD |
1183 | |
1184 | /* | |
1185 | * XXX I think biodone() below will do this, but do | |
1186 | * it here anyway for consistency. | |
1187 | */ | |
1188 | if (tbp->b_cmd == BUF_CMD_WRITE) | |
1189 | bundirty(tbp); | |
984263bc | 1190 | } |
81b5c339 | 1191 | biodone(&tbp->b_bio1); |
c3c895a6 | 1192 | tbp = next; |
984263bc | 1193 | } |
984263bc MD |
1194 | } |
1195 | ||
1196 | /* | |
504ea70e | 1197 | * Implement modified write build for cluster. |
984263bc | 1198 | * |
504ea70e MD |
1199 | * write_behind = 0 write behind disabled |
1200 | * write_behind = 1 write behind normal (default) | |
1201 | * write_behind = 2 write behind backed-off | |
984263bc | 1202 | * |
504ea70e MD |
1203 | * In addition, write_behind is only activated for files that have |
1204 | * grown past a certain size (default 10MB). Otherwise temporary files | |
1205 | * wind up generating a lot of unnecessary disk I/O. | |
984263bc | 1206 | */ |
984263bc | 1207 | static __inline int |
e92ca23a | 1208 | cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len) |
984263bc MD |
1209 | { |
1210 | int r = 0; | |
1211 | ||
1212 | switch(write_behind) { | |
1213 | case 2: | |
54078292 | 1214 | if (start_loffset < len) |
984263bc | 1215 | break; |
54078292 | 1216 | start_loffset -= len; |
984263bc MD |
1217 | /* fall through */ |
1218 | case 1: | |
504ea70e MD |
1219 | if (vp->v_filesize >= write_behind_minfilesize) { |
1220 | r = cluster_wbuild(vp, NULL, blksize, | |
1221 | start_loffset, len); | |
1222 | } | |
984263bc MD |
1223 | /* fall through */ |
1224 | default: | |
1225 | /* fall through */ | |
1226 | break; | |
1227 | } | |
1228 | return(r); | |
1229 | } | |
1230 | ||
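Worked example of the modes above (editorial aside): with vfs.write_behind=1 a call covering [start_loffset, start_loffset + len) flushes that window directly; with vfs.write_behind=2 it first backs off by len and flushes [start_loffset - len, start_loffset) instead. In either mode nothing is issued until the file has grown past vfs.write_behind_minfilesize (10MB by default).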
1231 | /* | |
1232 | * Do clustered write for FFS. | |
1233 | * | |
1234 | * Four cases:
1235 | * 1. Write is not sequential (write asynchronously) | |
1236 | * Write is sequential: | |
1237 | * 2. beginning of cluster - begin cluster | |
1238 | * 3. middle of a cluster - add to cluster | |
1239 | * 4. end of a cluster - asynchronously write cluster | |
38a4b308 MD |
1240 | * |
1241 | * WARNING! vnode fields are not locked and must ONLY be used heuristically. | |
984263bc MD |
1242 | */ |
1243 | void | |
e92ca23a | 1244 | cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount) |
984263bc MD |
1245 | { |
1246 | struct vnode *vp; | |
54078292 | 1247 | off_t loffset; |
984263bc | 1248 | int maxclen, cursize; |
984263bc | 1249 | int async; |
38a4b308 MD |
1250 | cluster_cache_t dummy; |
1251 | cluster_cache_t *cc; | |
984263bc MD |
1252 | |
1253 | vp = bp->b_vp; | |
e92ca23a | 1254 | if (vp->v_type == VREG) |
984263bc | 1255 | async = vp->v_mount->mnt_flag & MNT_ASYNC; |
e92ca23a | 1256 | else |
984263bc | 1257 | async = 0; |
54078292 | 1258 | loffset = bp->b_loffset; |
81b5c339 MD |
1259 | KASSERT(bp->b_loffset != NOOFFSET, |
1260 | ("cluster_write: no buffer offset")); | |
984263bc | 1261 | |
38a4b308 MD |
1262 | cc = cluster_getcache(&dummy, vp, loffset); |
1263 | ||
1264 | /* | |
1265 | * Initialize vnode to beginning of file. | |
1266 | */ | |
54078292 | 1267 | if (loffset == 0) |
38a4b308 | 1268 | cc->v_lasta = cc->v_clen = cc->v_cstart = cc->v_lastw = 0; |
984263bc | 1269 | |
cf297f2c | 1270 | if (cc->v_clen == 0 || loffset != cc->v_lastw || |
d9a07a60 | 1271 | (bp->b_bio2.bio_offset != NOOFFSET && |
cf297f2c | 1272 | (bp->b_bio2.bio_offset != cc->v_lasta))) { |
d9a07a60 MD |
1273 | /* |
1274 | * Next block is not logically sequential, or, if physical | |
1275 | * block offsets are available, not physically sequential. | |
1276 | * | |
1277 | * If physical block offsets are not available we only | |
1278 | * get here if we weren't logically sequential. | |
1279 | */ | |
2ec4b00d | 1280 | maxclen = vmaxiosize(vp); |
38a4b308 | 1281 | if (cc->v_clen != 0) { |
984263bc MD |
1282 | /* |
1283 | * Next block is not sequential. | |
1284 | * | |
1285 | * If we are not writing at end of file, the process | |
1286 | * seeked to another point in the file since its last | |
1287 | * write, or we have reached our maximum cluster size, | |
1288 | * then push the previous cluster. Otherwise try | |
1289 | * reallocating to make it sequential. | |
1290 | * | |
1291 | * Change to algorithm: only push previous cluster if | |
1292 | * it was sequential from the point of view of the | |
1293 | * seqcount heuristic, otherwise leave the buffer | |
1294 | * intact so we can potentially optimize the I/O | |
1295 | * later on in the buf_daemon or update daemon | |
1296 | * flush. | |
1297 | */ | |
cf297f2c | 1298 | cursize = cc->v_lastw - cc->v_cstart; |
9de13b88 | 1299 | if (bp->b_loffset + blksize < filesize || |
cf297f2c | 1300 | loffset != cc->v_lastw || |
38a4b308 | 1301 | cc->v_clen <= cursize) { |
984263bc | 1302 | if (!async && seqcount > 0) { |
e92ca23a | 1303 | cluster_wbuild_wb(vp, blksize, |
38a4b308 | 1304 | cc->v_cstart, cursize); |
984263bc MD |
1305 | } |
1306 | } else { | |
1307 | struct buf **bpp, **endbp; | |
1308 | struct cluster_save *buflist; | |
1309 | ||
38a4b308 MD |
1310 | buflist = cluster_collectbufs(cc, vp, |
1311 | bp, blksize); | |
984263bc | 1312 | endbp = &buflist->bs_children |
cf297f2c | 1313 | [buflist->bs_nchildren - 1]; |
984263bc MD |
1314 | if (VOP_REALLOCBLKS(vp, buflist)) { |
1315 | /* | |
1316 | * Failed, push the previous cluster | |
1317 | * if *really* writing sequentially | |
1318 | * in the logical file (seqcount > 1), | |
1319 | * otherwise delay it in the hopes that | |
1320 | * the low level disk driver can | |
1321 | * optimize the write ordering. | |
38a4b308 MD |
1322 | * |
1323 | * NOTE: We do not brelse the last | |
1324 | * element which is bp, and we | |
1325 | * do not return here. | |
984263bc MD |
1326 | */ |
1327 | for (bpp = buflist->bs_children; | |
1328 | bpp < endbp; bpp++) | |
1329 | brelse(*bpp); | |
efda3bd0 | 1330 | kfree(buflist, M_SEGMENT); |
984263bc MD |
1331 | if (seqcount > 1) { |
1332 | cluster_wbuild_wb(vp, | |
38a4b308 | 1333 | blksize, cc->v_cstart, |
984263bc MD |
1334 | cursize); |
1335 | } | |
1336 | } else { | |
1337 | /* | |
1338 | * Succeeded, keep building cluster. | |
1339 | */ | |
1340 | for (bpp = buflist->bs_children; | |
1341 | bpp <= endbp; bpp++) | |
1342 | bdwrite(*bpp); | |
efda3bd0 | 1343 | kfree(buflist, M_SEGMENT); |
cf297f2c MD |
1344 | cc->v_lastw = loffset + blksize; |
1345 | cc->v_lasta = bp->b_bio2.bio_offset + | |
1346 | blksize; | |
38a4b308 | 1347 | cluster_putcache(cc); |
984263bc MD |
1348 | return; |
1349 | } | |
1350 | } | |
1351 | } | |
d9a07a60 | 1352 | |
984263bc MD |
1353 | /* |
1354 | * Consider beginning a cluster. If at end of file, make | |
1355 | * cluster as large as possible, otherwise find size of | |
1356 | * existing cluster. | |
1357 | */ | |
1358 | if ((vp->v_type == VREG) && | |
9de13b88 | 1359 | bp->b_loffset + blksize < filesize && |
54078292 | 1360 | (bp->b_bio2.bio_offset == NOOFFSET) && |
e92ca23a | 1361 | (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset, &maxclen, NULL, BUF_CMD_WRITE) || |
54078292 | 1362 | bp->b_bio2.bio_offset == NOOFFSET)) { |
b642a6c1 | 1363 | bdwrite(bp); |
38a4b308 | 1364 | cc->v_clen = 0; |
cf297f2c MD |
1365 | cc->v_lasta = bp->b_bio2.bio_offset + blksize; |
1366 | cc->v_cstart = loffset; | |
1367 | cc->v_lastw = loffset + blksize; | |
38a4b308 | 1368 | cluster_putcache(cc); |
984263bc MD |
1369 | return; |
1370 | } | |
e92ca23a | 1371 | if (maxclen > blksize) |
cf297f2c | 1372 | cc->v_clen = maxclen; |
54078292 | 1373 | else |
cf297f2c | 1374 | cc->v_clen = blksize; |
38a4b308 | 1375 | if (!async && cc->v_clen == 0) { /* I/O not contiguous */ |
cf297f2c | 1376 | cc->v_cstart = loffset; |
b642a6c1 | 1377 | bdwrite(bp); |
984263bc | 1378 | } else { /* Wait for rest of cluster */ |
38a4b308 | 1379 | cc->v_cstart = loffset; |
984263bc MD |
1380 | bdwrite(bp); |
1381 | } | |
38a4b308 | 1382 | } else if (loffset == cc->v_cstart + cc->v_clen) { |
984263bc MD |
1383 | /* |
1384 | * At end of cluster, write it out if seqcount tells us we | |
1385 | * are operating sequentially, otherwise let the buf or | |
1386 | * update daemon handle it. | |
1387 | */ | |
1388 | bdwrite(bp); | |
1389 | if (seqcount > 1) | |
38a4b308 MD |
1390 | cluster_wbuild_wb(vp, blksize, cc->v_cstart, |
1391 | cc->v_clen + blksize); | |
1392 | cc->v_clen = 0; | |
cf297f2c | 1393 | cc->v_cstart = loffset; |
e91e64c7 | 1394 | } else if (vm_paging_severe() && |
b642a6c1 | 1395 | bp->b_loffset + blksize < filesize) { |
984263bc | 1396 | /* |
b642a6c1 MD |
1397 | * We are low on memory, get it going NOW. However, do not |
1398 | * try to push out a partial block at the end of the file | |
1399 | * as this could lead to extremely non-optimal write activity. | |
984263bc MD |
1400 | */ |
1401 | bawrite(bp); | |
1402 | } else { | |
1403 | /* | |
1404 | * In the middle of a cluster, so just delay the I/O for now. | |
1405 | */ | |
1406 | bdwrite(bp); | |
1407 | } | |
cf297f2c MD |
1408 | cc->v_lastw = loffset + blksize; |
1409 | cc->v_lasta = bp->b_bio2.bio_offset + blksize; | |
38a4b308 | 1410 | cluster_putcache(cc); |
984263bc MD |
1411 | } |
1412 | ||
9de13b88 MD |
1413 | /* |
1414 | * This is the clustered version of bawrite(). It works similarly to | |
1415 | * cluster_write() except I/O on the buffer is guaranteed to occur. | |
1416 | */ | |
1417 | int | |
1418 | cluster_awrite(struct buf *bp) | |
1419 | { | |
1420 | int total; | |
1421 | ||
1422 | /* | |
1423 | * Don't bother if it isn't clusterable. | |
1424 | */ | |
1425 | if ((bp->b_flags & B_CLUSTEROK) == 0 || | |
1426 | bp->b_vp == NULL || | |
1427 | (bp->b_vp->v_flag & VOBJBUF) == 0) { | |
1428 | total = bp->b_bufsize; | |
1429 | bawrite(bp); | |
1430 | return (total); | |
1431 | } | |
1432 | ||
1433 | total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize, | |
1434 | bp->b_loffset, vmaxiosize(bp->b_vp)); | |
d9a07a60 MD |
1435 | |
1436 | /* | |
1437 | * If bp is still non-NULL then cluster_wbuild() did not initiate | |
1438 | * I/O on it and we must do so here to provide the API guarantee. | |
1439 | */ | |
9de13b88 MD |
1440 | if (bp) |
1441 | bawrite(bp); | |
1442 | ||
1443 | return total; | |
1444 | } | |
984263bc MD |
1445 | |
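Since cluster_awrite() guarantees that I/O is initiated, a caller must treat the buffer as consumed; a hypothetical sketch (example_flush is not from this file):

```c
/*
 * Editorial sketch.  Hand the delayed-write buffer to cluster_awrite()
 * and forget it; the return value is the number of bytes for which
 * write I/O was initiated.
 */
static int
example_flush(struct buf *bp)
{
	return (cluster_awrite(bp));	/* consumes bp */
}
```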
1446 | /* | |
1447 | * This is an awful lot like cluster_rbuild...wish they could be combined. | |
1448 | * It scans the range [start_loffset, start_loffset + bytes) for
1449 | * dirty, clusterable delayed-write buffers and writes them out in
1450 | * maximal contiguous runs.
9de13b88 MD |
1451 | * |
1452 | * cluster_wbuild() normally does not guarantee anything. If bpp is | |
1453 | * non-NULL and cluster_wbuild() is able to incorporate it into the | |
1454 | * I/O it will set *bpp to NULL, otherwise it will leave it alone and | |
1455 | * the caller must dispose of *bpp. | |
984263bc | 1456 | */ |
9de13b88 MD |
1457 | static int |
1458 | cluster_wbuild(struct vnode *vp, struct buf **bpp, | |
1459 | int blksize, off_t start_loffset, int bytes) | |
984263bc MD |
1460 | { |
1461 | struct buf *bp, *tbp; | |
e43a034f | 1462 | int i, j; |
984263bc | 1463 | int totalwritten = 0; |
9de13b88 | 1464 | int must_initiate; |
2ec4b00d | 1465 | int maxiosize = vmaxiosize(vp); |
984263bc | 1466 | |
54078292 | 1467 | while (bytes > 0) { |
984263bc | 1468 | /* |
9de13b88 MD |
1469 | * If the buffer matches the passed locked & removed buffer |
1470 | * we used the passed buffer (which might not be B_DELWRI). | |
1471 | * | |
1472 | * Otherwise locate the buffer and determine if it is | |
1473 | * compatible. | |
984263bc | 1474 | */ |
9de13b88 MD |
1475 | if (bpp && (*bpp)->b_loffset == start_loffset) { |
1476 | tbp = *bpp; | |
1477 | *bpp = NULL; | |
1478 | bpp = NULL; | |
1479 | } else { | |
d32579c3 MD |
1480 | tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK | |
1481 | FINDBLK_KVABIO); | |
9de13b88 MD |
1482 | if (tbp == NULL || |
1483 | (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != | |
1484 | B_DELWRI || | |
1485 | (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) { | |
1486 | if (tbp) | |
1487 | BUF_UNLOCK(tbp); | |
1488 | start_loffset += blksize; | |
1489 | bytes -= blksize; | |
1490 | continue; | |
1491 | } | |
1492 | bremfree(tbp); | |
984263bc | 1493 | } |
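                /*
                 * Either way we now hold a locked buffer with no I/O
                 * currently in progress on it.
                 */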
                KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

                /*
                 * Extra memory in the buffer, punt on this buffer.
                 * XXX we could handle this in most cases, but we would
                 * have to push the extra memory down to after our max
                 * possible cluster size and then potentially pull it back
                 * up if the cluster was terminated prematurely--too much
                 * hassle.
                 */
                if ((tbp->b_flags & B_CLUSTEROK) == 0 ||
                    tbp->b_bcount != tbp->b_bufsize ||
                    tbp->b_bcount != blksize ||
                    bytes == blksize) {
                        totalwritten += tbp->b_bufsize;
                        bawrite(tbp);
                        start_loffset += blksize;
                        bytes -= blksize;
                        continue;
                }

                /*
                 * Get a pbuf, limit cluster I/O on a per-device basis.  If
                 * doing cluster I/O for a file, limit cluster I/O on a
                 * per-mount basis.
                 *
                 * HAMMER and other filesystems may attempt to queue a massive
                 * amount of write I/O, and using trypbuf() here easily results
                 * in a situation where the I/O stream becomes non-clustered.
                 */
                if (vp->v_type == VCHR || vp->v_type == VBLK)
                        bp = getpbuf_kva(&vp->v_pbuf_count);
                else
                        bp = getpbuf_kva(&vp->v_mount->mnt_pbuf_count);

                /*
                 * Set up the pbuf.  Track our append point with b_bcount
                 * and b_bufsize.  b_bufsize is not used by the device but
                 * our caller uses it to loop clusters and we use it to
                 * detect a premature EOF on the block device.
                 */
                bp->b_bcount = 0;
                bp->b_bufsize = 0;
                bp->b_xio.xio_npages = 0;
                bp->b_loffset = tbp->b_loffset;
                bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
                bp->b_vp = vp;

                /*
                 * We are synthesizing a buffer out of vm_page_t's, but
                 * if the block size is not page aligned then the starting
                 * address may not be either.  Inherit the b_data offset
                 * from the original buffer.
                 */
                bp->b_data = (char *)((vm_offset_t)bp->b_data |
                                      ((vm_offset_t)tbp->b_data & PAGE_MASK));
                bp->b_flags &= ~(B_ERROR | B_NOTMETA);
                bp->b_flags |= B_CLUSTER | B_BNOCLIP | B_KVABIO |
                               (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT |
                                                B_NOTMETA));
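                /*
                 * Start with an empty cluster list.  Child buffers are
                 * chained onto bio1 via cluster_append() and completed
                 * by cluster_callback() when the cluster I/O finishes.
                 */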
                bp->b_bio1.bio_caller_info1.cluster_head = NULL;
                bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

                /*
                 * From this location in the file, scan forward to see
                 * if there are buffers with adjacent data that need to
                 * be written as well.
                 *
                 * IO *must* be initiated on index 0 at this point
                 * (particularly when called from cluster_awrite()).
                 */
                for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
                        if (i == 0) {
                                must_initiate = 1;
                        } else {
                                /*
                                 * Not first buffer.
                                 */
                                must_initiate = 0;
                                tbp = findblk(vp, start_loffset,
                                              FINDBLK_NBLOCK | FINDBLK_KVABIO);
                                /*
                                 * Buffer not found or could not be locked
                                 * non-blocking.
                                 */
                                if (tbp == NULL)
                                        break;

                                /*
                                 * If it IS in core, but has different
                                 * characteristics, then don't cluster
                                 * with it.
                                 */
                                if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
                                      B_INVAL | B_DELWRI | B_NEEDCOMMIT))
                                    != (B_DELWRI | B_CLUSTEROK |
                                      (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
                                    (tbp->b_flags & B_LOCKED)
                                ) {
                                        BUF_UNLOCK(tbp);
                                        break;
                                }

                                /*
                                 * Check that the combined cluster
                                 * would make sense with regard to pages
                                 * and would not be too large.
                                 *
                                 * WARNING! buf_checkwrite() must be the last
                                 *          check made.  If it returns 0 then
                                 *          we must initiate the I/O.
                                 */
                                if ((tbp->b_bcount != blksize) ||
                                    ((bp->b_bio2.bio_offset + i) !=
                                      tbp->b_bio2.bio_offset) ||
                                    ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
                                      (maxiosize / PAGE_SIZE)) ||
                                    (LIST_FIRST(&tbp->b_dep) &&
                                     buf_checkwrite(tbp))
                                ) {
                                        BUF_UNLOCK(tbp);
                                        break;
                                }
                                if (LIST_FIRST(&tbp->b_dep))
                                        must_initiate = 1;
                                /*
                                 * Ok, it's passed all the tests,
                                 * so remove it from the free list
                                 * and mark it busy. We will use it.
                                 */
                                bremfree(tbp);
                                KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
                        }

                        /*
                         * If the IO is via the VM then we do some
                         * special VM hackery (yuck).  Since the buffer's
                         * block size may not be page-aligned it is possible
                         * for a page to be shared between two buffers.  We
                         * have to get rid of the duplication when building
                         * the cluster.
                         */
                        if (tbp->b_flags & B_VMIO) {
                                vm_page_t m;

                                /*
                                 * Try to avoid deadlocks with the VM system.
                                 * However, we cannot abort the I/O if
                                 * must_initiate is non-zero.
                                 */
                                if (must_initiate == 0) {
                                        for (j = 0;
                                             j < tbp->b_xio.xio_npages;
                                             ++j) {
                                                m = tbp->b_xio.xio_pages[j];
                                                if (m->busy_count &
                                                    PBUSY_LOCKED) {
                                                        bqrelse(tbp);
                                                        goto finishcluster;
                                                }
                                        }
                                }

                                for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
                                        m = tbp->b_xio.xio_pages[j];
                                        vm_page_busy_wait(m, FALSE, "clurpg");
                                        vm_page_io_start(m);
                                        vm_page_wakeup(m);
                                        vm_object_pip_add(m->object, 1);
                                        if ((bp->b_xio.xio_npages == 0) ||
                                            (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
                                                bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
                                                bp->b_xio.xio_npages++;
                                        }
                                }
                        }
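                        /*
                         * Advance the cluster's append point past the
                         * buffer just absorbed.
                         */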
                        bp->b_bcount += blksize;
                        bp->b_bufsize += blksize;

                        /*
                         * NOTE: see bwrite/bawrite code for why we no longer
                         *       undirty tbp here.
                         *
                         *       bundirty(tbp); REMOVED
                         */
                        tbp->b_flags &= ~B_ERROR;
                        tbp->b_cmd = BUF_CMD_WRITE;
                        BUF_KERNPROC(tbp);
                        cluster_append(&bp->b_bio1, tbp);

                        /*
                         * check for latent dependencies to be handled
                         */
                        if (LIST_FIRST(&tbp->b_dep) != NULL)
                                buf_start(tbp);
                }
        finishcluster:
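                /*
                 * Map the collected page list into the pbuf's KVA.  The
                 * _noinval variant avoids an immediate TLB invalidation;
                 * B_KVABIO is set, so consumers synchronize the mapping
                 * themselves before touching the data.
                 */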
                pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
                                    (vm_page_t *)bp->b_xio.xio_pages,
                                    bp->b_xio.xio_npages);
                if (bp->b_bufsize > bp->b_kvasize) {
                        panic("cluster_wbuild: b_bufsize(%d) "
                              "> b_kvasize(%d)\n",
                              bp->b_bufsize, bp->b_kvasize);
                }
                totalwritten += bp->b_bufsize;
                bp->b_dirtyoff = 0;
                bp->b_dirtyend = bp->b_bufsize;
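                /*
                 * Dispatch the cluster as a single write; cluster_callback()
                 * finishes off the chained child buffers on completion.
                 */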
                bp->b_bio1.bio_done = cluster_callback;
                bp->b_cmd = BUF_CMD_WRITE;

                vfs_busy_pages(vp, bp);
                bsetrunningbufspace(bp, bp->b_bufsize);
                BUF_KERNPROC(bp);
                vn_strategy(vp, &bp->b_bio1);

                bytes -= i;
        }
        return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed-in.
 *
 * Only pre-existing buffers whose block size matches blksize are collected.
 * (This is primarily because HAMMER1 uses varying block sizes and we don't
 * want to override its choices.)
 *
 * This code will not try to collect buffers that it cannot lock, otherwise
 * it might deadlock against SMP-friendly filesystems.
 */
static struct cluster_save *
cluster_collectbufs(cluster_cache_t *cc, struct vnode *vp,
                    struct buf *last_bp, int blksize)
{
        struct cluster_save *buflist;
        struct buf *bp;
        off_t loffset;
        int i, len;
        int j;
        int k;

        len = (int)(cc->v_lastw - cc->v_cstart) / blksize;
        KKASSERT(len > 0);
        buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
                          M_SEGMENT, M_WAITOK);
        buflist->bs_nchildren = 0;
        buflist->bs_children = (struct buf **)(buflist + 1);
        for (loffset = cc->v_cstart, i = 0, j = 0;
             i < len;
             (loffset += blksize), i++) {
                bp = getcacheblk(vp, loffset, last_bp->b_bcount,
                                 GETBLK_SZMATCH | GETBLK_NOWAIT);
                buflist->bs_children[i] = bp;
                if (bp == NULL) {
                        j = i + 1;
                } else if (bp->b_bio2.bio_offset == NOOFFSET) {
                        VOP_BMAP(bp->b_vp, bp->b_loffset,
                                 &bp->b_bio2.bio_offset,
                                 NULL, NULL, BUF_CMD_WRITE);
                }
        }

        /*
         * Get rid of gaps.  Only the contiguous run of buffers trailing
         * the last hole can be clustered, so release everything collected
         * before that run and compact the survivors down to index 0.
         */
        for (k = 0; k < j; ++k) {
                if (buflist->bs_children[k]) {
                        bqrelse(buflist->bs_children[k]);
                        buflist->bs_children[k] = NULL;
                }
        }
        if (j != 0) {
                if (j != i) {
                        bcopy(buflist->bs_children + j,
                              buflist->bs_children + 0,
                              sizeof(buflist->bs_children[0]) * (i - j));
                }
                i -= j;
        }
        buflist->bs_children[i] = bp = last_bp;
        if (bp->b_bio2.bio_offset == NOOFFSET) {
                VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
                         NULL, NULL, BUF_CMD_WRITE);
        }
        buflist->bs_nchildren = i + 1;
        return (buflist);
}

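/*
 * Append tbp to the cluster list hanging off the given bio.
 * bio_caller_info1 holds the list head and bio_caller_info2 the tail;
 * cluster_callback() walks this list to complete the child buffers.
 */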
void
cluster_append(struct bio *bio, struct buf *tbp)
{
        tbp->b_cluster_next = NULL;
        if (bio->bio_caller_info1.cluster_head == NULL) {
                bio->bio_caller_info1.cluster_head = tbp;
                bio->bio_caller_info2.cluster_tail = tbp;
        } else {
                bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
                bio->bio_caller_info2.cluster_tail = tbp;
        }
}

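/*
 * Set the read-ahead marker on the buffer and its first backing page.
 * Hitting a B_RAM/PG_RAM marked buffer tells the cluster read code that
 * the next read-ahead should be issued.
 */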
static
void
cluster_setram(struct buf *bp)
{
        bp->b_flags |= B_RAM;
        if (bp->b_xio.xio_npages)
                vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}

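/*
 * Clear the read-ahead marker from the buffer and its first backing page.
 */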
static
void
cluster_clrram(struct buf *bp)
{
        bp->b_flags &= ~B_RAM;
        if (bp->b_xio.xio_npages)
                vm_page_flag_clear(bp->b_xio.xio_pages[0], PG_RAM);
}