Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /* |
2 | * Copyright (c) 1994,1997 John S. Dyson | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice immediately at the beginning of the file, without modification, | |
10 | * this list of conditions, and the following disclaimer. | |
11 | * 2. Absolutely no warranty of function or purpose is made by the author | |
12 | * John S. Dyson. | |
13 | * | |
14 | * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ | |
3c37c940 | 15 | * $DragonFly: src/sys/kern/vfs_bio.c,v 1.90 2007/05/06 19:23:31 dillon Exp $ |
984263bc MD |
16 | */ |
17 | ||
18 | /* | |
19 | * this file contains a new buffer I/O scheme implementing a coherent | |
20 | * VM object and buffer cache scheme. Pains have been taken to make | |
21 | * sure that the performance degradation associated with schemes such | |
22 | * as this is not realized. | |
23 | * | |
24 | * Author: John S. Dyson | |
25 | * Significant help during the development and debugging phases | |
26 | * had been provided by David Greenman, also of the FreeBSD core team. | |
27 | * | |
28 | * see man buf(9) for more info. | |
29 | */ | |
30 | ||
31 | #include <sys/param.h> | |
32 | #include <sys/systm.h> | |
33 | #include <sys/buf.h> | |
34 | #include <sys/conf.h> | |
35 | #include <sys/eventhandler.h> | |
36 | #include <sys/lock.h> | |
37 | #include <sys/malloc.h> | |
38 | #include <sys/mount.h> | |
39 | #include <sys/kernel.h> | |
40 | #include <sys/kthread.h> | |
41 | #include <sys/proc.h> | |
42 | #include <sys/reboot.h> | |
43 | #include <sys/resourcevar.h> | |
44 | #include <sys/sysctl.h> | |
45 | #include <sys/vmmeter.h> | |
46 | #include <sys/vnode.h> | |
984263bc MD |
48 | #include <vm/vm.h> |
49 | #include <vm/vm_param.h> | |
50 | #include <vm/vm_kern.h> | |
51 | #include <vm/vm_pageout.h> | |
52 | #include <vm/vm_page.h> | |
53 | #include <vm/vm_object.h> | |
54 | #include <vm/vm_extern.h> | |
55 | #include <vm/vm_map.h> | |
654a39f0 | 56 | |
3020e3be | 57 | #include <sys/buf2.h> |
654a39f0 | 58 | #include <sys/thread2.h> |
f832287e | 59 | #include <sys/spinlock2.h> |
12e4aaff | 60 | #include <vm/vm_page2.h> |
984263bc | 61 | |
135bd6a8 MD |
62 | #include "opt_ddb.h" |
63 | #ifdef DDB | |
64 | #include <ddb/ddb.h> | |
65 | #endif | |
66 | ||
b3098c79 HP |
67 | /* |
68 | * Buffer queues. | |
69 | */ | |
70 | #define BUFFER_QUEUES 6 | |
71 | enum bufq_type { | |
72 | BQUEUE_NONE, /* not on any queue */ | |
73 | BQUEUE_LOCKED, /* locked buffers */ | |
74 | BQUEUE_CLEAN, /* non-B_DELWRI buffers */ | |
75 | BQUEUE_DIRTY, /* B_DELWRI buffers */ | |
76 | BQUEUE_EMPTYKVA, /* empty buffer headers with KVA assignment */ | |
77 | BQUEUE_EMPTY /* empty buffer headers */ | |
78 | }; | |
79 | TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; | |
80 | ||
984263bc MD |
81 | static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); |
82 | ||
83 | struct bio_ops bioops; /* I/O operation notification */ | |
84 | ||
85 | struct buf *buf; /* buffer header pool */ | |
984263bc | 86 | |
c8e4131d | 87 | static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, |
984263bc | 88 | vm_offset_t to); |
c8e4131d | 89 | static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, |
984263bc MD |
90 | vm_offset_t to); |
91 | static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, | |
92 | int pageno, vm_page_t m); | |
c8e4131d | 93 | static void vfs_clean_pages(struct buf *bp); |
984263bc MD |
94 | static void vfs_setdirty(struct buf *bp); |
95 | static void vfs_vmio_release(struct buf *bp); | |
984263bc MD |
96 | static int flushbufqueues(void); |
97 | ||
402ed7e1 | 98 | static void buf_daemon (void); |
984263bc MD |
99 | /* |
100 | * bogus page -- for I/O to/from partially complete buffers | |
101 | * this is a temporary solution to the problem, but it is not | |
102 | * really that bad. it would be better to split the buffer | |
103 | * for input in the case of buffers partially already in memory, | |
104 | * but the code is intricate enough already. | |
105 | */ | |
106 | vm_page_t bogus_page; | |
984263bc | 107 | int runningbufspace; |
a0c36a34 | 108 | |
460426e6 MD |
109 | /* |
110 | * Most of these are static, but we make the ones we export global |
111 | * so we do not need to use compiler magic. |
112 | */ | |
113 | int bufspace, maxbufspace, | |
984263bc MD |
114 | bufmallocspace, maxbufmallocspace, lobufspace, hibufspace; |
115 | static int bufreusecnt, bufdefragcnt, buffreekvacnt; | |
984263bc | 116 | static int lorunningspace, hirunningspace, runningbufreq; |
460426e6 | 117 | int numdirtybuffers, lodirtybuffers, hidirtybuffers; |
984263bc MD |
118 | static int numfreebuffers, lofreebuffers, hifreebuffers; |
119 | static int getnewbufcalls; | |
120 | static int getnewbufrestarts; | |
121 | ||
f832287e MD |
122 | static int needsbuffer; /* locked by needsbuffer_spin */ |
123 | static int bd_request; /* locked by needsbuffer_spin */ | |
124 | static struct spinlock needsbuffer_spin; | |
125 | ||
3f779080 HP |
126 | /* |
127 | * Sysctls for operational control of the buffer cache. | |
128 | */ | |
129 | SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, | |
130 | "Number of dirty buffers to flush before bufdaemon becomes inactive"); | |
131 | SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, | |
bb606263 | 132 | "High watermark used to trigger explicit flushing of dirty buffers"); |
3f779080 | 133 | SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, |
bb606263 | 134 | "Low watermark for special reserve in low-memory situations"); |
3f779080 | 135 | SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, |
bb606263 | 136 | "High watermark for special reserve in low-memory situations"); |
3f779080 HP |
137 | SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, |
138 | "Minimum amount of buffer space required for active I/O"); | |
139 | SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, | |
140 | "Maximum amount of buffer space to usable for active I/O"); | |
3f779080 HP |
141 | /* |
142 | * Sysctls determining current state of the buffer cache. | |
143 | */ | |
144 | SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, | |
145 | "Pending number of dirty buffers"); | |
146 | SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, | |
147 | "Number of free buffers on the buffer cache free list"); | |
148 | SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, | |
bb606263 | 149 | "I/O bytes currently in progress due to asynchronous writes"); |
3f779080 HP |
150 | SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, |
151 | "Hard limit on maximum amount of memory usable for buffer space"); | |
152 | SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, | |
153 | "Soft limit on maximum amount of memory usable for buffer space"); | |
154 | SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, | |
155 | "Minimum amount of memory to reserve for system buffer space"); | |
156 | SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, | |
157 | "Amount of memory available for buffers"); | |
158 | SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace, | |
bb606263 | 159 | 0, "Maximum amount of memory reserved for buffers using malloc"); |
3f779080 HP |
160 | SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, |
161 | "Amount of memory left for buffers using malloc-scheme"); | |
162 | SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0, | |
163 | "New buffer header acquisition requests"); | |
164 | SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, | |
165 | 0, "New buffer header acquisition restarts"); | |
166 | SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0, | |
bb606263 | 167 | "Buffer acquisition restarts due to fragmented buffer map"); |
3f779080 HP |
168 | SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0, |
169 | "Amount of time KVA space was deallocated in an arbitrary buffer"); | |
170 | SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, | |
171 | "Amount of time buffer re-use operations were successful"); | |
306ab3cb HP |
172 | SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), |
173 | "sizeof(struct buf)"); | |
984263bc | 174 | |
984263bc MD |
175 | char *buf_wmesg = BUF_WMESG; |
176 | ||
177 | extern int vm_swap_size; | |
178 | ||
179 | #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ | |
180 | #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ | |
181 | #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ | |
182 | #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ | |
183 | ||
984263bc | 184 | /* |
3f779080 | 185 | * numdirtywakeup: |
984263bc MD |
186 | * |
187 | * If someone is blocked due to there being too many dirty buffers, | |
188 | * and numdirtybuffers is now reasonable, wake them up. | |
189 | */ | |
190 | ||
191 | static __inline void | |
192 | numdirtywakeup(int level) | |
193 | { | |
194 | if (numdirtybuffers <= level) { | |
195 | if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { | |
f832287e | 196 | spin_lock_wr(&needsbuffer_spin); |
984263bc | 197 | needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; |
f832287e | 198 | spin_unlock_wr(&needsbuffer_spin); |
984263bc MD |
199 | wakeup(&needsbuffer); |
200 | } | |
201 | } | |
202 | } | |
203 | ||
204 | /* | |
3f779080 | 205 | * bufspacewakeup: |
984263bc MD |
206 | * |
207 | * Called when buffer space is potentially available for recovery. | |
208 | * getnewbuf() will block on this flag when it is unable to free | |
209 | * sufficient buffer space. Buffer space becomes recoverable when | |
210 | * bp's get placed back in the queues. | |
211 | */ | |
212 | ||
213 | static __inline void | |
214 | bufspacewakeup(void) | |
215 | { | |
216 | /* | |
217 | * If someone is waiting for BUF space, wake them up. Even | |
218 | * though we haven't freed the kva space yet, the waiting | |
219 | * process will be able to now. | |
220 | */ | |
221 | if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { | |
f832287e | 222 | spin_lock_wr(&needsbuffer_spin); |
984263bc | 223 | needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; |
f832287e | 224 | spin_unlock_wr(&needsbuffer_spin); |
984263bc MD |
225 | wakeup(&needsbuffer); |
226 | } | |
227 | } | |
228 | ||
229 | /* | |
3f779080 HP |
230 | * runningbufwakeup: |
231 | * | |
232 | * Accounting for I/O in progress. | |
984263bc MD |
233 | * |
234 | */ | |
235 | static __inline void | |
236 | runningbufwakeup(struct buf *bp) | |
237 | { | |
238 | if (bp->b_runningbufspace) { | |
239 | runningbufspace -= bp->b_runningbufspace; | |
240 | bp->b_runningbufspace = 0; | |
241 | if (runningbufreq && runningbufspace <= lorunningspace) { | |
242 | runningbufreq = 0; | |
243 | wakeup(&runningbufreq); | |
244 | } | |
245 | } | |
246 | } | |
247 | ||
248 | /* | |
3f779080 | 249 | * bufcountwakeup: |
984263bc MD |
250 | * |
251 | * Called when a buffer has been added to one of the free queues to | |
252 | * account for the buffer and to wakeup anyone waiting for free buffers. | |
253 | * This typically occurs when large amounts of metadata are being handled | |
254 | * by the buffer cache ( else buffer space runs out first, usually ). | |
255 | */ | |
256 | ||
257 | static __inline void | |
258 | bufcountwakeup(void) | |
259 | { | |
260 | ++numfreebuffers; | |
261 | if (needsbuffer) { | |
f832287e | 262 | spin_lock_wr(&needsbuffer_spin); |
984263bc MD |
263 | needsbuffer &= ~VFS_BIO_NEED_ANY; |
264 | if (numfreebuffers >= hifreebuffers) | |
265 | needsbuffer &= ~VFS_BIO_NEED_FREE; | |
f832287e | 266 | spin_unlock_wr(&needsbuffer_spin); |
984263bc MD |
267 | wakeup(&needsbuffer); |
268 | } | |
269 | } | |
270 | ||
271 | /* | |
3f779080 | 272 | * waitrunningbufspace() |
984263bc MD |
273 | * |
274 | * runningbufspace is a measure of the amount of I/O currently | |
275 | * running. This routine is used in async-write situations to | |
276 | * prevent creating huge backups of pending writes to a device. | |
277 | * Only asynchronous writes are governed by this function. | |
278 | * | |
279 | * Reads will adjust runningbufspace, but will not block based on it. | |
280 | * The read load has a side effect of reducing the allowed write load. | |
281 | * | |
282 | * This does NOT turn an async write into a sync write. It waits | |
283 | * for earlier writes to complete and generally returns before the | |
284 | * caller's write has reached the device. | |
285 | */ | |
286 | static __inline void | |
287 | waitrunningbufspace(void) | |
288 | { | |
e43a034f MD |
289 | if (runningbufspace > hirunningspace) { |
290 | crit_enter(); | |
291 | while (runningbufspace > hirunningspace) { | |
292 | ++runningbufreq; | |
293 | tsleep(&runningbufreq, 0, "wdrain", 0); | |
294 | } | |
295 | crit_exit(); | |
984263bc MD |
296 | } |
297 | } | |
298 | ||
299 | /* | |
3f779080 | 300 | * vfs_buf_test_cache: |
984263bc MD |
301 | * |
302 | * Called when a buffer is extended. This function clears the B_CACHE | |
303 | * bit if the newly extended portion of the buffer does not contain | |
304 | * valid data. | |
305 | */ | |
306 | static __inline__ | |
307 | void | |
308 | vfs_buf_test_cache(struct buf *bp, | |
309 | vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, | |
310 | vm_page_t m) | |
311 | { | |
312 | if (bp->b_flags & B_CACHE) { | |
313 | int base = (foff + off) & PAGE_MASK; | |
314 | if (vm_page_is_valid(m, base, size) == 0) | |
315 | bp->b_flags &= ~B_CACHE; | |
316 | } | |
317 | } | |
318 | ||
3f779080 HP |
319 | /* |
320 | * bd_wakeup: | |
321 | * | |
322 | * Wake up the buffer daemon if the number of outstanding dirty buffers | |
323 | * is above specified threshold 'dirtybuflevel'. | |
324 | * | |
325 | * The buffer daemon is explicitly woken up when (a) the pending number | |
326 | * of dirty buffers exceeds the recovery and stall mid-point value, | |
327 | * (b) during bwillwrite() or (c) buf freelist was exhausted. | |
328 | */ | |
984263bc MD |
329 | static __inline__ |
330 | void | |
331 | bd_wakeup(int dirtybuflevel) | |
332 | { | |
333 | if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { | |
f832287e | 334 | spin_lock_wr(&needsbuffer_spin); |
984263bc | 335 | bd_request = 1; |
f832287e | 336 | spin_unlock_wr(&needsbuffer_spin); |
984263bc MD |
337 | wakeup(&bd_request); |
338 | } | |
339 | } | |
340 | ||
341 | /* | |
3f779080 HP |
342 | * bd_speedup: |
343 | * | |
344 | * Speed up the buffer cache flushing process. | |
984263bc MD |
345 | */ |
346 | ||
347 | static __inline__ | |
348 | void | |
349 | bd_speedup(void) | |
350 | { | |
351 | bd_wakeup(1); | |
352 | } | |
353 | ||
3f779080 HP |
354 | /* |
355 | * bufinit: | |
356 | * | |
357 | * Load time initialization of the buffer cache, called from machine |
358 | * dependent initialization code. |
359 | */ | |
984263bc MD |
360 | void |
361 | bufinit(void) | |
362 | { | |
363 | struct buf *bp; | |
b8bb0773 | 364 | vm_offset_t bogus_offset; |
984263bc MD |
365 | int i; |
366 | ||
f832287e MD |
367 | spin_init(&needsbuffer_spin); |
368 | ||
984263bc MD |
369 | /* next, make a null set of free lists */ |
370 | for (i = 0; i < BUFFER_QUEUES; i++) | |
371 | TAILQ_INIT(&bufqueues[i]); | |
372 | ||
373 | /* finally, initialize each buffer header and stick on empty q */ | |
374 | for (i = 0; i < nbuf; i++) { | |
375 | bp = &buf[i]; | |
376 | bzero(bp, sizeof *bp); | |
377 | bp->b_flags = B_INVAL; /* we're just an empty header */ | |
10f3fee5 | 378 | bp->b_cmd = BUF_CMD_DONE; |
b3098c79 | 379 | bp->b_qindex = BQUEUE_EMPTY; |
81b5c339 | 380 | initbufbio(bp); |
54f51aeb | 381 | xio_init(&bp->b_xio); |
984263bc MD |
382 | LIST_INIT(&bp->b_dep); |
383 | BUF_LOCKINIT(bp); | |
b3098c79 | 384 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist); |
984263bc MD |
385 | } |
386 | ||
387 | /* | |
388 | * maxbufspace is the absolute maximum amount of buffer space we are | |
389 | * allowed to reserve in KVM and in real terms. The absolute maximum | |
390 | * is nominally used by buf_daemon. hibufspace is the nominal maximum | |
391 | * used by most other processes. The differential is required to | |
392 | * ensure that buf_daemon is able to run when other processes might | |
393 | * be blocked waiting for buffer space. | |
394 | * | |
395 | * maxbufspace is based on BKVASIZE. Allocating buffers larger than |
396 | * this may result in KVM fragmentation which is not handled optimally | |
397 | * by the system. | |
398 | */ | |
399 | maxbufspace = nbuf * BKVASIZE; | |
400 | hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); | |
401 | lobufspace = hibufspace - MAXBSIZE; | |
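	/*
	 * Worked example (illustrative numbers only, not from the source):
	 * assuming nbuf = 1024, BKVASIZE = 8K and MAXBSIZE = 64K,
	 *
	 *	maxbufspace = 1024 * 8K             = 8MB
	 *	hibufspace  = imax(6MB, 8MB - 640K) = ~7.4MB
	 *	lobufspace  = hibufspace - 64K      = ~7.3MB
	 *
	 * i.e. hibufspace sits a little below maxbufspace, reserving
	 * headroom for buf_daemon, with lobufspace one maximal buffer
	 * below that.
	 */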
402 | ||
403 | lorunningspace = 512 * 1024; | |
404 | hirunningspace = 1024 * 1024; | |
405 | ||
406 | /* | |
407 | * Limit the amount of malloc memory since it is wired permanently into | |
408 | * the kernel space. Even though this is accounted for in the buffer | |
409 | * allocation, we don't want the malloced region to grow uncontrolled. | |
410 | * The malloc scheme improves memory utilization significantly for |
411 | * average (small) directories. |
412 | */ | |
413 | maxbufmallocspace = hibufspace / 20; | |
414 | ||
415 | /* | |
416 | * Reduce the chance of a deadlock occurring by limiting the number |
417 | * of delayed-write dirty buffers we allow to stack up. | |
418 | */ | |
419 | hidirtybuffers = nbuf / 4 + 20; | |
420 | numdirtybuffers = 0; | |
421 | /* | |
422 | * To support extreme low-memory systems, make sure hidirtybuffers cannot | |
423 | * eat up all available buffer space. This occurs when our minimum cannot | |
424 | * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming | |
425 | * BKVASIZE'd (8K) buffers. | |
426 | */ | |
427 | while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { | |
428 | hidirtybuffers >>= 1; | |
429 | } | |
430 | lodirtybuffers = hidirtybuffers / 2; | |
431 | ||
432 | /* | |
433 | * Try to keep the number of free buffers in the specified range, | |
434 | * and give special processes (e.g. buf_daemon) access to an |
435 | * emergency reserve. | |
436 | */ | |
437 | lofreebuffers = nbuf / 18 + 5; | |
438 | hifreebuffers = 2 * lofreebuffers; | |
439 | numfreebuffers = nbuf; | |
440 | ||
441 | /* | |
442 | * Maximum number of async ops initiated per buf_daemon loop. This is | |
443 | * somewhat of a hack at the moment, we really need to limit ourselves | |
444 | * based on the number of bytes of I/O in-transit that were initiated | |
445 | * from buf_daemon. | |
446 | */ | |
447 | ||
e4846942 | 448 | bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE); |
c439ad8f | 449 | bogus_page = vm_page_alloc(&kernel_object, |
e4846942 MD |
450 | (bogus_offset >> PAGE_SHIFT), |
451 | VM_ALLOC_NORMAL); | |
12e4aaff | 452 | vmstats.v_wire_count++; |
984263bc MD |
453 | |
454 | } | |
455 | ||
81b5c339 MD |
456 | /* |
457 | * Initialize the embedded bio structures | |
458 | */ | |
459 | void | |
460 | initbufbio(struct buf *bp) | |
461 | { | |
462 | bp->b_bio1.bio_buf = bp; | |
463 | bp->b_bio1.bio_prev = NULL; | |
81b5c339 MD |
464 | bp->b_bio1.bio_offset = NOOFFSET; |
465 | bp->b_bio1.bio_next = &bp->b_bio2; | |
466 | bp->b_bio1.bio_done = NULL; | |
467 | ||
468 | bp->b_bio2.bio_buf = bp; | |
469 | bp->b_bio2.bio_prev = &bp->b_bio1; | |
81b5c339 MD |
470 | bp->b_bio2.bio_offset = NOOFFSET; |
471 | bp->b_bio2.bio_next = NULL; | |
472 | bp->b_bio2.bio_done = NULL; | |
473 | } | |
474 | ||
475 | /* | |
476 | * Reinitialize the embedded bio structures as well as any additional | |
477 | * translation cache layers. | |
478 | */ | |
479 | void | |
480 | reinitbufbio(struct buf *bp) | |
481 | { | |
482 | struct bio *bio; | |
483 | ||
484 | for (bio = &bp->b_bio1; bio; bio = bio->bio_next) { | |
485 | bio->bio_done = NULL; | |
81b5c339 MD |
486 | bio->bio_offset = NOOFFSET; |
487 | } | |
488 | } | |
489 | ||
490 | /* | |
491 | * Push another BIO layer onto an existing BIO and return it. The new | |
492 | * BIO layer may already exist, holding cached translation data. | |
493 | */ | |
494 | struct bio * | |
495 | push_bio(struct bio *bio) | |
496 | { | |
497 | struct bio *nbio; | |
498 | ||
499 | if ((nbio = bio->bio_next) == NULL) { | |
500 | int index = bio - &bio->bio_buf->b_bio_array[0]; | |
bbd44c71 | 501 | if (index >= NBUF_BIO - 1) { |
81b5c339 MD |
502 | panic("push_bio: too many layers bp %p\n", |
503 | bio->bio_buf); | |
504 | } | |
505 | nbio = &bio->bio_buf->b_bio_array[index + 1]; | |
506 | bio->bio_next = nbio; | |
507 | nbio->bio_prev = bio; | |
508 | nbio->bio_buf = bio->bio_buf; | |
81b5c339 MD |
509 | nbio->bio_offset = NOOFFSET; |
510 | nbio->bio_done = NULL; | |
511 | nbio->bio_next = NULL; | |
512 | } | |
513 | KKASSERT(nbio->bio_done == NULL); | |
514 | return(nbio); | |
515 | } | |
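/*
 * Illustrative sketch (an assumption, not part of the original source):
 * a translation layer would typically push a BIO to hold its own offset
 * and completion callback before handing the request to the next layer.
 * 'translated_offset' and 'my_layer_done' are hypothetical names.
 *
 *	struct bio *nbio;
 *
 *	nbio = push_bio(bio);
 *	nbio->bio_offset = translated_offset;
 *	nbio->bio_done = my_layer_done;
 *	(forward nbio to the lower layer's strategy routine)
 */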
516 | ||
517 | void | |
518 | pop_bio(struct bio *bio) | |
519 | { | |
520 | /* NOP */ | |
521 | } | |
522 | ||
523 | void | |
524 | clearbiocache(struct bio *bio) | |
525 | { | |
526 | while (bio) { | |
81b5c339 MD |
527 | bio->bio_offset = NOOFFSET; |
528 | bio = bio->bio_next; | |
529 | } | |
530 | } | |
531 | ||
984263bc | 532 | /* |
3f779080 HP |
533 | * bfreekva: |
534 | * | |
535 | * Free the KVA allocation for buffer 'bp'. | |
984263bc | 536 | * |
e43a034f | 537 | * Must be called from a critical section as this is the only locking for |
984263bc MD |
538 | * buffer_map. |
539 | * | |
540 | * Since this call frees up buffer space, we call bufspacewakeup(). | |
541 | */ | |
542 | static void | |
312dcd01 | 543 | bfreekva(struct buf *bp) |
984263bc | 544 | { |
a108bf71 MD |
545 | int count; |
546 | ||
984263bc MD |
547 | if (bp->b_kvasize) { |
548 | ++buffreekvacnt; | |
a108bf71 | 549 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
e4846942 | 550 | vm_map_lock(&buffer_map); |
984263bc | 551 | bufspace -= bp->b_kvasize; |
e4846942 | 552 | vm_map_delete(&buffer_map, |
984263bc | 553 | (vm_offset_t) bp->b_kvabase, |
a108bf71 MD |
554 | (vm_offset_t) bp->b_kvabase + bp->b_kvasize, |
555 | &count | |
984263bc | 556 | ); |
e4846942 | 557 | vm_map_unlock(&buffer_map); |
a108bf71 | 558 | vm_map_entry_release(count); |
984263bc MD |
559 | bp->b_kvasize = 0; |
560 | bufspacewakeup(); | |
561 | } | |
562 | } | |
563 | ||
564 | /* | |
3f779080 | 565 | * bremfree: |
984263bc MD |
566 | * |
567 | * Remove the buffer from the appropriate free list. | |
568 | */ | |
569 | void | |
c8e4131d | 570 | bremfree(struct buf *bp) |
984263bc | 571 | { |
e43a034f MD |
572 | int old_qindex; |
573 | ||
574 | crit_enter(); | |
575 | old_qindex = bp->b_qindex; | |
984263bc | 576 | |
b3098c79 | 577 | if (bp->b_qindex != BQUEUE_NONE) { |
77bb9400 MD |
578 | KASSERT(BUF_REFCNTNB(bp) == 1, |
579 | ("bremfree: bp %p not locked",bp)); | |
984263bc | 580 | TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); |
b3098c79 | 581 | bp->b_qindex = BQUEUE_NONE; |
984263bc | 582 | } else { |
77bb9400 | 583 | if (BUF_REFCNTNB(bp) <= 1) |
984263bc MD |
584 | panic("bremfree: removing a buffer not on a queue"); |
585 | } | |
586 | ||
587 | /* | |
588 | * Fixup numfreebuffers count. If the buffer is invalid or not | |
589 | * delayed-write, and it was on the EMPTY, LRU, or AGE queues, | |
590 | * the buffer was free and we must decrement numfreebuffers. | |
591 | */ | |
592 | if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { | |
593 | switch(old_qindex) { | |
b3098c79 HP |
594 | case BQUEUE_DIRTY: |
595 | case BQUEUE_CLEAN: | |
596 | case BQUEUE_EMPTY: | |
597 | case BQUEUE_EMPTYKVA: | |
984263bc MD |
598 | --numfreebuffers; |
599 | break; | |
600 | default: | |
601 | break; | |
602 | } | |
603 | } | |
e43a034f | 604 | crit_exit(); |
984263bc MD |
605 | } |
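/*
 * Illustrative note (an assumption, not part of the original source):
 * bremfree() is normally paired with a successful non-blocking lock of a
 * queued buffer, after which the buffer is busy and off the free lists.
 *
 *	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *		bremfree(bp);
 *		(use or write the buffer, then brelse()/bqrelse() it)
 *	}
 */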
606 | ||
607 | ||
608 | /* | |
3f779080 HP |
609 | * bread: |
610 | * | |
611 | * Get a buffer with the specified data. Look in the cache first. We | |
612 | * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE | |
613 | * is set, the buffer is valid and we do not have to do anything ( see | |
614 | * getblk() ). | |
984263bc MD |
615 | */ |
616 | int | |
c8e4131d | 617 | bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp) |
984263bc MD |
618 | { |
619 | struct buf *bp; | |
620 | ||
54078292 | 621 | bp = getblk(vp, loffset, size, 0, 0); |
984263bc MD |
622 | *bpp = bp; |
623 | ||
624 | /* if not found in cache, do some I/O */ | |
625 | if ((bp->b_flags & B_CACHE) == 0) { | |
984263bc | 626 | KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); |
984263bc | 627 | bp->b_flags &= ~(B_ERROR | B_INVAL); |
10f3fee5 MD |
628 | bp->b_cmd = BUF_CMD_READ; |
629 | vfs_busy_pages(vp, bp); | |
81b5c339 | 630 | vn_strategy(vp, &bp->b_bio1); |
984263bc MD |
631 | return (biowait(bp)); |
632 | } | |
633 | return (0); | |
634 | } | |
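/*
 * Illustrative sketch (not part of the original source): a typical
 * filesystem read path; 'vp', 'loffset' and 'blksize' are assumed to be
 * supplied by the caller.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, loffset, blksize, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(consume bp->b_data, then bqrelse(bp) to keep it cached)
 */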
635 | ||
636 | /* | |
3f779080 HP |
637 | * breadn: |
638 | * | |
639 | * Operates like bread, but also starts asynchronous I/O on | |
640 | * read-ahead blocks. We must clear B_ERROR and B_INVAL prior | |
641 | * to initiating I/O . If B_CACHE is set, the buffer is valid | |
642 | * and we do not have to do anything. | |
984263bc MD |
643 | */ |
644 | int | |
a8f169e2 | 645 | breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, |
c8e4131d | 646 | int *rabsize, int cnt, struct buf **bpp) |
984263bc MD |
647 | { |
648 | struct buf *bp, *rabp; | |
649 | int i; | |
650 | int rv = 0, readwait = 0; | |
651 | ||
54078292 | 652 | *bpp = bp = getblk(vp, loffset, size, 0, 0); |
984263bc MD |
653 | |
654 | /* if not found in cache, do some I/O */ | |
655 | if ((bp->b_flags & B_CACHE) == 0) { | |
984263bc | 656 | bp->b_flags &= ~(B_ERROR | B_INVAL); |
10f3fee5 MD |
657 | bp->b_cmd = BUF_CMD_READ; |
658 | vfs_busy_pages(vp, bp); | |
81b5c339 | 659 | vn_strategy(vp, &bp->b_bio1); |
984263bc MD |
660 | ++readwait; |
661 | } | |
662 | ||
54078292 MD |
663 | for (i = 0; i < cnt; i++, raoffset++, rabsize++) { |
664 | if (inmem(vp, *raoffset)) | |
984263bc | 665 | continue; |
54078292 | 666 | rabp = getblk(vp, *raoffset, *rabsize, 0, 0); |
984263bc MD |
667 | |
668 | if ((rabp->b_flags & B_CACHE) == 0) { | |
10f3fee5 | 669 | rabp->b_flags |= B_ASYNC; |
984263bc | 670 | rabp->b_flags &= ~(B_ERROR | B_INVAL); |
10f3fee5 MD |
671 | rabp->b_cmd = BUF_CMD_READ; |
672 | vfs_busy_pages(vp, rabp); | |
984263bc | 673 | BUF_KERNPROC(rabp); |
81b5c339 | 674 | vn_strategy(vp, &rabp->b_bio1); |
984263bc MD |
675 | } else { |
676 | brelse(rabp); | |
677 | } | |
678 | } | |
679 | ||
680 | if (readwait) { | |
681 | rv = biowait(bp); | |
682 | } | |
683 | return (rv); | |
684 | } | |
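/*
 * Illustrative sketch (not part of the original source): reading one
 * block while hinting a single read-ahead block; the offsets and sizes
 * are assumptions for the example only.
 *
 *	off_t raoffset = loffset + blksize;
 *	int rabsize = blksize;
 *	struct buf *bp;
 *	int error;
 *
 *	error = breadn(vp, loffset, blksize, &raoffset, &rabsize, 1, &bp);
 */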
685 | ||
686 | /* | |
3f779080 HP |
687 | * bwrite: |
688 | * | |
689 | * Write, release buffer on completion. (Done by iodone | |
690 | * if async). Do not bother writing anything if the buffer | |
691 | * is invalid. | |
692 | * | |
693 | * Note that we set B_CACHE here, indicating that buffer is | |
694 | * fully valid and thus cacheable. This is true even of NFS | |
695 | * now so we set it generally. This could be set either here | |
696 | * or in biodone() since the I/O is synchronous. We put it | |
697 | * here. | |
984263bc MD |
698 | */ |
699 | int | |
c8e4131d | 700 | bwrite(struct buf *bp) |
984263bc | 701 | { |
e43a034f | 702 | int oldflags; |
984263bc MD |
703 | |
704 | if (bp->b_flags & B_INVAL) { | |
705 | brelse(bp); | |
706 | return (0); | |
707 | } | |
708 | ||
709 | oldflags = bp->b_flags; | |
710 | ||
77bb9400 | 711 | if (BUF_REFCNTNB(bp) == 0) |
984263bc | 712 | panic("bwrite: buffer is not busy???"); |
e43a034f | 713 | crit_enter(); |
984263bc MD |
714 | |
715 | /* Mark the buffer clean */ | |
716 | bundirty(bp); | |
717 | ||
10f3fee5 | 718 | bp->b_flags &= ~B_ERROR; |
6bae6177 | 719 | bp->b_flags |= B_CACHE; |
10f3fee5 MD |
720 | bp->b_cmd = BUF_CMD_WRITE; |
721 | vfs_busy_pages(bp->b_vp, bp); | |
984263bc MD |
722 | |
723 | /* | |
9a71d53f MD |
724 | * Normal bwrites pipeline writes. NOTE: b_bufsize is only |
725 | * valid for vnode-backed buffers. | |
984263bc MD |
726 | */ |
727 | bp->b_runningbufspace = bp->b_bufsize; | |
728 | runningbufspace += bp->b_runningbufspace; | |
729 | ||
e43a034f | 730 | crit_exit(); |
984263bc MD |
731 | if (oldflags & B_ASYNC) |
732 | BUF_KERNPROC(bp); | |
81b5c339 | 733 | vn_strategy(bp->b_vp, &bp->b_bio1); |
984263bc MD |
734 | |
735 | if ((oldflags & B_ASYNC) == 0) { | |
736 | int rtval = biowait(bp); | |
737 | brelse(bp); | |
738 | return (rtval); | |
739 | } else if ((oldflags & B_NOWDRAIN) == 0) { | |
740 | /* | |
741 | * don't allow the async write to saturate the I/O | |
742 | * system. Deadlocks can occur only if a device strategy | |
743 | * routine (like in VN) turns around and issues another | |
744 | * high-level write, in which case B_NOWDRAIN is expected | |
745 | * to be set. Otherwise we will not deadlock here because | |
746 | * we are blocking waiting for I/O that is already in-progress | |
747 | * to complete. | |
748 | */ | |
749 | waitrunningbufspace(); | |
750 | } | |
751 | ||
752 | return (0); | |
753 | } | |
754 | ||
984263bc | 755 | /* |
3f779080 HP |
756 | * bdwrite: |
757 | * | |
758 | * Delayed write. (Buffer is marked dirty). Do not bother writing | |
759 | * anything if the buffer is marked invalid. | |
984263bc | 760 | * |
3f779080 HP |
761 | * Note that since the buffer must be completely valid, we can safely |
762 | * set B_CACHE. In fact, we have to set B_CACHE here rather than in |
763 | * biodone() in order to prevent getblk from writing the buffer | |
764 | * out synchronously. | |
984263bc MD |
765 | */ |
766 | void | |
493c516a | 767 | bdwrite(struct buf *bp) |
984263bc | 768 | { |
77bb9400 | 769 | if (BUF_REFCNTNB(bp) == 0) |
984263bc MD |
770 | panic("bdwrite: buffer is not busy"); |
771 | ||
772 | if (bp->b_flags & B_INVAL) { | |
773 | brelse(bp); | |
774 | return; | |
775 | } | |
776 | bdirty(bp); | |
777 | ||
778 | /* | |
779 | * Set B_CACHE, indicating that the buffer is fully valid. This is | |
780 | * true even of NFS now. | |
781 | */ | |
782 | bp->b_flags |= B_CACHE; | |
783 | ||
784 | /* | |
785 | * This bmap keeps the system from needing to do the bmap later, | |
786 | * perhaps when the system is attempting to do a sync. Since it | |
787 | * is likely that the indirect block -- or whatever other data structure |
788 | * that the filesystem needs is still in memory now, it is a good | |
789 | * thing to do this. Note also, that if the pageout daemon is | |
790 | * requesting a sync -- there might not be enough memory to do | |
791 | * the bmap then... So, this is important to do. | |
792 | */ | |
54078292 MD |
793 | if (bp->b_bio2.bio_offset == NOOFFSET) { |
794 | VOP_BMAP(bp->b_vp, bp->b_loffset, NULL, &bp->b_bio2.bio_offset, | |
81b5c339 | 795 | NULL, NULL); |
984263bc MD |
796 | } |
797 | ||
798 | /* | |
799 | * Set the *dirty* buffer range based upon the VM system dirty pages. | |
800 | */ | |
801 | vfs_setdirty(bp); | |
802 | ||
803 | /* | |
804 | * We need to do this here to satisfy the vnode_pager and the | |
805 | * pageout daemon, so that it thinks that the pages have been | |
806 | * "cleaned". Note that since the pages are in a delayed write | |
807 | * buffer -- the VFS layer "will" see that the pages get written | |
808 | * out on the next sync, or perhaps the cluster will be completed. | |
809 | */ | |
810 | vfs_clean_pages(bp); | |
811 | bqrelse(bp); | |
812 | ||
813 | /* | |
814 | * Wakeup the buffer flushing daemon if we have a lot of dirty | |
815 | * buffers (midpoint between our recovery point and our stall | |
816 | * point). | |
817 | */ | |
818 | bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); | |
819 | ||
820 | /* | |
821 | * note: we cannot initiate I/O from a bdwrite even if we wanted to, | |
822 | * due to the softdep code. | |
823 | */ | |
824 | } | |
825 | ||
826 | /* | |
3f779080 | 827 | * bdirty: |
984263bc | 828 | * |
10f3fee5 MD |
829 | * Turn buffer into delayed write request by marking it B_DELWRI. |
830 | * B_RELBUF and B_NOCACHE must be cleared. | |
984263bc | 831 | * |
10f3fee5 MD |
832 | * We reassign the buffer to itself to properly update it in the |
833 | * dirty/clean lists. | |
984263bc | 834 | * |
10f3fee5 MD |
835 | * Since the buffer is not on a queue, we do not update the |
836 | * numfreebuffers count. | |
984263bc | 837 | * |
e43a034f | 838 | * Must be called from a critical section. |
b3098c79 | 839 | * The buffer must be on BQUEUE_NONE. |
984263bc MD |
840 | */ |
841 | void | |
493c516a | 842 | bdirty(struct buf *bp) |
984263bc | 843 | { |
b3098c79 | 844 | KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); |
69f8c926 | 845 | if (bp->b_flags & B_NOCACHE) { |
6ea70f76 | 846 | kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp); |
69f8c926 MD |
847 | bp->b_flags &= ~B_NOCACHE; |
848 | } | |
849 | if (bp->b_flags & B_INVAL) { | |
6ea70f76 | 850 | kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp); |
69f8c926 | 851 | } |
10f3fee5 | 852 | bp->b_flags &= ~B_RELBUF; |
984263bc MD |
853 | |
854 | if ((bp->b_flags & B_DELWRI) == 0) { | |
10f3fee5 | 855 | bp->b_flags |= B_DELWRI; |
1f1ea522 | 856 | reassignbuf(bp); |
984263bc MD |
857 | ++numdirtybuffers; |
858 | bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); | |
859 | } | |
860 | } | |
861 | ||
862 | /* | |
3f779080 | 863 | * bundirty: |
984263bc MD |
864 | * |
865 | * Clear B_DELWRI for buffer. | |
866 | * | |
867 | * Since the buffer is not on a queue, we do not update the numfreebuffers | |
868 | * count. | |
869 | * | |
e43a034f | 870 | * Must be called from a critical section. |
eaaadca0 | 871 | * |
b3098c79 | 872 | * The buffer is typically on BQUEUE_NONE but there is one case in |
eaaadca0 MD |
873 | * brelse() that calls this function after placing the buffer on |
874 | * a different queue. | |
984263bc MD |
875 | */ |
876 | ||
877 | void | |
493c516a | 878 | bundirty(struct buf *bp) |
984263bc | 879 | { |
984263bc MD |
880 | if (bp->b_flags & B_DELWRI) { |
881 | bp->b_flags &= ~B_DELWRI; | |
1f1ea522 | 882 | reassignbuf(bp); |
984263bc MD |
883 | --numdirtybuffers; |
884 | numdirtywakeup(lodirtybuffers); | |
885 | } | |
886 | /* | |
887 | * Since it is now being written, we can clear its deferred write flag. | |
888 | */ | |
889 | bp->b_flags &= ~B_DEFERRED; | |
890 | } | |
891 | ||
892 | /* | |
3f779080 | 893 | * bawrite: |
984263bc MD |
894 | * |
895 | * Asynchronous write. Start output on a buffer, but do not wait for | |
896 | * it to complete. The buffer is released when the output completes. | |
897 | * | |
898 | * bwrite() ( or the VOP routine anyway ) is responsible for handling | |
899 | * B_INVAL buffers. Not us. | |
900 | */ | |
901 | void | |
c8e4131d | 902 | bawrite(struct buf *bp) |
984263bc MD |
903 | { |
904 | bp->b_flags |= B_ASYNC; | |
62cfda27 | 905 | bwrite(bp); |
984263bc MD |
906 | } |
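/*
 * Illustrative note (not part of the original source): how the write
 * entry points above differ for a caller holding a busy, dirty buffer.
 *
 *	bwrite(bp);	synchronous - sleeps until the write completes
 *	bdwrite(bp);	delayed - marks B_DELWRI, flushed later (buf_daemon)
 *	bawrite(bp);	asynchronous - starts the write now, does not wait
 */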
907 | ||
908 | /* | |
3f779080 | 909 | * bowrite: |
984263bc MD |
910 | * |
911 | * Ordered write. Start output on a buffer, and flag it so that the | |
912 | * device will write it in the order it was queued. The buffer is | |
913 | * released when the output completes. bwrite() ( or the VOP routine | |
914 | * anyway ) is responsible for handling B_INVAL buffers. | |
915 | */ | |
916 | int | |
c8e4131d | 917 | bowrite(struct buf *bp) |
984263bc MD |
918 | { |
919 | bp->b_flags |= B_ORDERED | B_ASYNC; | |
62cfda27 | 920 | return (bwrite(bp)); |
984263bc MD |
921 | } |
922 | ||
923 | /* | |
3f779080 | 924 | * bwillwrite: |
984263bc MD |
925 | * |
926 | * Called prior to the locking of any vnodes when we are expecting to | |
927 | * write. We do not want to starve the buffer cache with too many | |
928 | * dirty buffers so we block here. By blocking prior to the locking | |
929 | * of any vnodes we attempt to avoid the situation where a locked vnode | |
930 | * prevents the various system daemons from flushing related buffers. | |
931 | */ | |
932 | ||
933 | void | |
934 | bwillwrite(void) | |
935 | { | |
936 | if (numdirtybuffers >= hidirtybuffers) { | |
984263bc MD |
937 | while (numdirtybuffers >= hidirtybuffers) { |
938 | bd_wakeup(1); | |
f832287e MD |
939 | spin_lock_wr(&needsbuffer_spin); |
940 | if (numdirtybuffers >= hidirtybuffers) { | |
941 | needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; | |
942 | msleep(&needsbuffer, &needsbuffer_spin, 0, | |
943 | "flswai", 0); | |
944 | } | |
945 | spin_unlock_wr(&needsbuffer_spin); | |
984263bc | 946 | } |
984263bc MD |
947 | } |
948 | } | |
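/*
 * Illustrative sketch (not part of the original source): a write path
 * would typically throttle here before acquiring its vnode lock, so the
 * flushing daemons never end up blocked behind that lock.
 *
 *	bwillwrite();
 *	(lock the vnode, dirty buffers via bdwrite(), unlock)
 */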
949 | ||
950 | /* | |
3f779080 HP |
951 | * buf_dirty_count_severe: |
952 | * | |
953 | * Return true if we have too many dirty buffers. | |
984263bc MD |
954 | */ |
955 | int | |
956 | buf_dirty_count_severe(void) | |
957 | { | |
958 | return(numdirtybuffers >= hidirtybuffers); | |
959 | } | |
960 | ||
961 | /* | |
3f779080 | 962 | * brelse: |
984263bc MD |
963 | * |
964 | * Release a busy buffer and, if requested, free its resources. The | |
965 | * buffer will be stashed in the appropriate bufqueue[] allowing it | |
966 | * to be accessed later as a cache entity or reused for other purposes. | |
967 | */ | |
968 | void | |
c8e4131d | 969 | brelse(struct buf *bp) |
984263bc | 970 | { |
9188c711 MD |
971 | #ifdef INVARIANTS |
972 | int saved_flags = bp->b_flags; | |
973 | #endif | |
974 | ||
984263bc MD |
975 | KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); |
976 | ||
e43a034f | 977 | crit_enter(); |
984263bc | 978 | |
135bd6a8 MD |
979 | /* |
980 | * If B_NOCACHE is set we are being asked to destroy the buffer and | |
981 | * its backing store. Clear B_DELWRI. | |
982 | * | |
983 | * B_NOCACHE is set in two cases: (1) when the caller really wants | |
984 | * to destroy the buffer and backing store and (2) when the caller | |
985 | * wants to destroy the buffer and backing store after a write | |
986 | * completes. | |
987 | */ | |
988 | if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) { | |
989 | bundirty(bp); | |
69f8c926 MD |
990 | } |
991 | ||
984263bc MD |
992 | if (bp->b_flags & B_LOCKED) |
993 | bp->b_flags &= ~B_ERROR; | |
994 | ||
135bd6a8 MD |
995 | /* |
996 | * If a write error occurs and the caller does not want to throw | |
997 | * away the buffer, redirty the buffer. This will also clear | |
998 | * B_NOCACHE. | |
999 | */ | |
10f3fee5 MD |
1000 | if (bp->b_cmd == BUF_CMD_WRITE && |
1001 | (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) { | |
984263bc MD |
1002 | /* |
1003 | * Failed write, redirty. Must clear B_ERROR to prevent | |
1004 | * pages from being scrapped. If B_INVAL is set then | |
1005 | * this case is not run and the next case is run to | |
1006 | * destroy the buffer. B_INVAL can occur if the buffer | |
1007 | * is outside the range supported by the underlying device. | |
1008 | */ | |
1009 | bp->b_flags &= ~B_ERROR; | |
1010 | bdirty(bp); | |
10f3fee5 MD |
1011 | } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || |
1012 | (bp->b_bufsize <= 0) || bp->b_cmd == BUF_CMD_FREEBLKS) { | |
984263bc MD |
1013 | /* |
1014 | * Either a failed I/O or we were asked to free or not | |
1015 | * cache the buffer. | |
1016 | */ | |
1017 | bp->b_flags |= B_INVAL; | |
1018 | if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) | |
1019 | (*bioops.io_deallocate)(bp); | |
1020 | if (bp->b_flags & B_DELWRI) { | |
1021 | --numdirtybuffers; | |
1022 | numdirtywakeup(lodirtybuffers); | |
1023 | } | |
10f3fee5 | 1024 | bp->b_flags &= ~(B_DELWRI | B_CACHE); |
984263bc MD |
1025 | } |
1026 | ||
1027 | /* | |
1028 | * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() | |
1029 | * is called with B_DELWRI set, the underlying pages may wind up | |
1030 | * getting freed causing a previous write (bdwrite()) to get 'lost' | |
1031 | * because pages associated with a B_DELWRI bp are marked clean. | |
1032 | * | |
1033 | * We still allow the B_INVAL case to call vfs_vmio_release(), even | |
1034 | * if B_DELWRI is set. | |
1035 | * | |
1036 | * If B_DELWRI is not set we may have to set B_RELBUF if we are low | |
1037 | * on pages to return pages to the VM page queues. | |
1038 | */ | |
1039 | if (bp->b_flags & B_DELWRI) | |
1040 | bp->b_flags &= ~B_RELBUF; | |
70371608 | 1041 | else if (vm_page_count_severe()) |
984263bc MD |
1042 | bp->b_flags |= B_RELBUF; |
1043 | ||
9188c711 MD |
1044 | /* |
1045 | * At this point destroying the buffer is governed by the B_INVAL | |
1046 | * or B_RELBUF flags. | |
1047 | */ | |
10f3fee5 | 1048 | bp->b_cmd = BUF_CMD_DONE; |
9188c711 | 1049 | |
984263bc | 1050 | /* |
135bd6a8 MD |
1051 | * VMIO buffer rundown. Make sure the VM page array is restored |
1052 | * after an I/O that may have replaced some of the pages with bogus pages |
1053 | * in order to not destroy dirty pages in a fill-in read. | |
1054 | * | |
1055 | * Note that due to the code above, if a buffer is marked B_DELWRI | |
1056 | * then the B_RELBUF and B_NOCACHE bits will always be clear. | |
1057 | * B_INVAL may still be set, however. | |
984263bc | 1058 | * |
135bd6a8 MD |
1059 | * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer |
1060 | * but not the backing store. B_NOCACHE will destroy the backing | |
1061 | * store. | |
984263bc | 1062 | * |
135bd6a8 MD |
1063 | * Note that dirty NFS buffers contain byte-granular write ranges |
1064 | * and should not be destroyed w/ B_INVAL even if the backing store | |
1065 | * is left intact. | |
984263bc | 1066 | */ |
135bd6a8 | 1067 | if (bp->b_flags & B_VMIO) { |
9188c711 MD |
1068 | /* |
1069 | * Rundown for VMIO buffers which are not dirty NFS buffers. | |
1070 | */ | |
984263bc MD |
1071 | int i, j, resid; |
1072 | vm_page_t m; | |
1073 | off_t foff; | |
1074 | vm_pindex_t poff; | |
1075 | vm_object_t obj; | |
1076 | struct vnode *vp; | |
1077 | ||
1078 | vp = bp->b_vp; | |
1079 | ||
1080 | /* | |
1081 | * Get the base offset and length of the buffer. Note that | |
1082 | * in the VMIO case if the buffer block size is not | |
1083 | * page-aligned then b_data pointer may not be page-aligned. | |
236b2b9f | 1084 | * But our b_xio.xio_pages array *IS* page aligned. |
984263bc MD |
1085 | * |
1086 | * block sizes less than DEV_BSIZE (usually 512) are not |
1087 | * supported due to the page granularity bits (m->valid, | |
1088 | * m->dirty, etc...). | |
1089 | * | |
1090 | * See man buf(9) for more information | |
1091 | */ | |
1092 | ||
1093 | resid = bp->b_bufsize; | |
81b5c339 | 1094 | foff = bp->b_loffset; |
984263bc | 1095 | |
54f51aeb HP |
1096 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
1097 | m = bp->b_xio.xio_pages[i]; | |
984263bc MD |
1098 | vm_page_flag_clear(m, PG_ZERO); |
1099 | /* | |
1100 | * If we hit a bogus page, fixup *all* of them | |
06ecca5a MD |
1101 | * now. Note that we left these pages wired |
1102 | * when we removed them so they had better exist, | |
1103 | * and they cannot be ripped out from under us so | |
e43a034f | 1104 | * no critical section protection is necessary. |
984263bc MD |
1105 | */ |
1106 | if (m == bogus_page) { | |
7540ab49 | 1107 | obj = vp->v_object; |
81b5c339 | 1108 | poff = OFF_TO_IDX(bp->b_loffset); |
984263bc | 1109 | |
54f51aeb | 1110 | for (j = i; j < bp->b_xio.xio_npages; j++) { |
984263bc MD |
1111 | vm_page_t mtmp; |
1112 | ||
54f51aeb | 1113 | mtmp = bp->b_xio.xio_pages[j]; |
984263bc MD |
1114 | if (mtmp == bogus_page) { |
1115 | mtmp = vm_page_lookup(obj, poff + j); | |
1116 | if (!mtmp) { | |
fc92d4aa | 1117 | panic("brelse: page missing"); |
984263bc | 1118 | } |
54f51aeb | 1119 | bp->b_xio.xio_pages[j] = mtmp; |
984263bc MD |
1120 | } |
1121 | } | |
1122 | ||
1123 | if ((bp->b_flags & B_INVAL) == 0) { | |
54f51aeb HP |
1124 | pmap_qenter(trunc_page((vm_offset_t)bp->b_data), |
1125 | bp->b_xio.xio_pages, bp->b_xio.xio_npages); | |
984263bc | 1126 | } |
54f51aeb | 1127 | m = bp->b_xio.xio_pages[i]; |
984263bc | 1128 | } |
8d429613 MD |
1129 | |
1130 | /* | |
1131 | * Invalidate the backing store if B_NOCACHE is set | |
1132 | * (e.g. used with vinvalbuf()). If this is NFS | |
1133 | * we impose a requirement that the block size be | |
1134 | * a multiple of PAGE_SIZE and create a temporary | |
1135 | * hack to basically invalidate the whole page. The | |
1136 | * problem is that NFS uses really odd buffer sizes | |
1137 | * especially when tracking piecemeal writes and | |
1138 | * it also vinvalbuf()'s a lot, which would result | |
1139 | * in only partial page validation and invalidation | |
1140 | * here. If the file page is mmap()'d, however, | |
1141 | * all the valid bits get set so after we invalidate | |
1142 | * here we would end up with weird m->valid values | |
1143 | * like 0xfc. nfs_getpages() can't handle this so | |
1144 | * we clear all the valid bits for the NFS case | |
1145 | * instead of just some of them. | |
1146 | * | |
1147 | * The real bug is the VM system having to set m->valid | |
1148 | * to VM_PAGE_BITS_ALL for faulted-in pages, which | |
1149 | * itself is an artifact of the whole 512-byte | |
1150 | * granular mess that exists to support odd block | |
1151 | * sizes and UFS meta-data block sizes (e.g. 6144). | |
1152 | * A complete rewrite is required. | |
1153 | */ | |
984263bc MD |
1154 | if (bp->b_flags & (B_NOCACHE|B_ERROR)) { |
1155 | int poffset = foff & PAGE_MASK; | |
8d429613 MD |
1156 | int presid; |
1157 | ||
1158 | presid = PAGE_SIZE - poffset; | |
1159 | if (bp->b_vp->v_tag == VT_NFS && | |
1160 | bp->b_vp->v_type == VREG) { | |
1161 | ; /* entire page */ | |
1162 | } else if (presid > resid) { | |
1163 | presid = resid; | |
1164 | } | |
984263bc MD |
1165 | KASSERT(presid >= 0, ("brelse: extra page")); |
1166 | vm_page_set_invalid(m, poffset, presid); | |
1167 | } | |
1168 | resid -= PAGE_SIZE - (foff & PAGE_MASK); | |
1169 | foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; | |
1170 | } | |
984263bc MD |
1171 | if (bp->b_flags & (B_INVAL | B_RELBUF)) |
1172 | vfs_vmio_release(bp); | |
9188c711 MD |
1173 | } else { |
1174 | /* | |
1175 | * Rundown for non-VMIO buffers. | |
1176 | */ | |
1177 | if (bp->b_flags & (B_INVAL | B_RELBUF)) { | |
1178 | #if 0 | |
1179 | if (bp->b_vp) | |
6ea70f76 | 1180 | kprintf("brelse bp %p %08x/%08x: Warning, caught and fixed brelvp bug\n", bp, saved_flags, bp->b_flags); |
9188c711 MD |
1181 | #endif |
1182 | if (bp->b_bufsize) | |
1183 | allocbuf(bp, 0); | |
1184 | if (bp->b_vp) | |
1185 | brelvp(bp); | |
1186 | } | |
984263bc MD |
1187 | } |
1188 | ||
b3098c79 | 1189 | if (bp->b_qindex != BQUEUE_NONE) |
984263bc | 1190 | panic("brelse: free buffer onto another queue???"); |
77bb9400 | 1191 | if (BUF_REFCNTNB(bp) > 1) { |
984263bc MD |
1192 | /* Temporary panic to verify exclusive locking */ |
1193 | /* This panic goes away when we allow shared refs */ | |
1194 | panic("brelse: multiple refs"); | |
1195 | /* do not release to free list */ | |
1196 | BUF_UNLOCK(bp); | |
e43a034f | 1197 | crit_exit(); |
984263bc MD |
1198 | return; |
1199 | } | |
1200 | ||
9188c711 MD |
1201 | /* |
1202 | * Figure out the correct queue to place the cleaned up buffer on. | |
1203 | * Buffers placed in the EMPTY or EMPTYKVA had better already be | |
1204 | * disassociated from their vnode. | |
1205 | */ | |
984263bc | 1206 | |
984263bc | 1207 | if (bp->b_bufsize == 0) { |
9188c711 MD |
1208 | /* |
1209 | * Buffers with no memory. Due to conditionals near the top | |
1210 | * of brelse() such buffers should probably already be | |
1211 | * marked B_INVAL and disassociated from their vnode. | |
1212 | */ | |
984263bc | 1213 | bp->b_flags |= B_INVAL; |
54078292 | 1214 | KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectedly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); |
1f1ea522 | 1215 | KKASSERT((bp->b_flags & B_HASHED) == 0); |
984263bc | 1216 | if (bp->b_kvasize) { |
b3098c79 | 1217 | bp->b_qindex = BQUEUE_EMPTYKVA; |
984263bc | 1218 | } else { |
b3098c79 | 1219 | bp->b_qindex = BQUEUE_EMPTY; |
984263bc MD |
1220 | } |
1221 | TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); | |
984263bc | 1222 | } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { |
9188c711 MD |
1223 | /* |
1224 | * Buffers with junk contents. Again these buffers had better | |
1225 | * already be disassociated from their vnode. | |
1226 | */ | |
54078292 | 1227 | KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectedly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); |
1f1ea522 | 1228 | KKASSERT((bp->b_flags & B_HASHED) == 0); |
984263bc | 1229 | bp->b_flags |= B_INVAL; |
b3098c79 HP |
1230 | bp->b_qindex = BQUEUE_CLEAN; |
1231 | TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); | |
984263bc | 1232 | } else if (bp->b_flags & B_LOCKED) { |
9188c711 MD |
1233 | /* |
1234 | * Buffers that are locked. | |
1235 | */ | |
b3098c79 HP |
1236 | bp->b_qindex = BQUEUE_LOCKED; |
1237 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); | |
984263bc | 1238 | } else { |
9188c711 MD |
1239 | /* |
1240 | * Remaining buffers. These buffers are still associated with | |
1241 | * their vnode. | |
1242 | */ | |
984263bc MD |
1243 | switch(bp->b_flags & (B_DELWRI|B_AGE)) { |
1244 | case B_DELWRI | B_AGE: | |
b3098c79 HP |
1245 | bp->b_qindex = BQUEUE_DIRTY; |
1246 | TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); | |
984263bc MD |
1247 | break; |
1248 | case B_DELWRI: | |
b3098c79 HP |
1249 | bp->b_qindex = BQUEUE_DIRTY; |
1250 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); | |
984263bc MD |
1251 | break; |
1252 | case B_AGE: | |
b3098c79 HP |
1253 | bp->b_qindex = BQUEUE_CLEAN; |
1254 | TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); | |
984263bc MD |
1255 | break; |
1256 | default: | |
b3098c79 HP |
1257 | bp->b_qindex = BQUEUE_CLEAN; |
1258 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); | |
984263bc MD |
1259 | break; |
1260 | } | |
1261 | } | |
1262 | ||
1263 | /* | |
1264 | * If B_INVAL, clear B_DELWRI. We've already placed the buffer | |
1265 | * on the correct queue. | |
1266 | */ | |
1267 | if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) | |
1268 | bundirty(bp); | |
1269 | ||
1270 | /* | |
1271 | * Fixup numfreebuffers count. The bp is on an appropriate queue | |
1272 | * unless locked. We then bump numfreebuffers if it is not B_DELWRI. | |
1273 | * We've already handled the B_INVAL case ( B_DELWRI will be clear | |
1274 | * if B_INVAL is set ). | |
1275 | */ | |
984263bc MD |
1276 | if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) |
1277 | bufcountwakeup(); | |
1278 | ||
1279 | /* | |
1280 | * Something we can maybe free or reuse | |
1281 | */ | |
1282 | if (bp->b_bufsize || bp->b_kvasize) | |
1283 | bufspacewakeup(); | |
1284 | ||
69f8c926 MD |
1285 | /* |
1286 | * Clean up temporary flags and unlock the buffer. | |
1287 | */ | |
984263bc MD |
1288 | bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | |
1289 | B_DIRECT | B_NOWDRAIN); | |
69f8c926 | 1290 | BUF_UNLOCK(bp); |
e43a034f | 1291 | crit_exit(); |
984263bc MD |
1292 | } |
1293 | ||
1294 | /* | |
3f779080 HP |
1295 | * bqrelse: |
1296 | * | |
1297 | * Release a buffer back to the appropriate queue but do not try to free | |
1298 | * it. The buffer is expected to be used again soon. | |
984263bc | 1299 | * |
3f779080 HP |
1300 | * bqrelse() is used by bdwrite() to requeue a delayed write, and used by |
1301 | * biodone() to requeue an async I/O on completion. It is also used when | |
1302 | * known good buffers need to be requeued but we think we may need the data | |
1303 | * again soon. | |
984263bc | 1304 | * |
3f779080 | 1305 | * XXX we should be able to leave the B_RELBUF hint set on completion. |
984263bc MD |
1306 | */ |
1307 | void | |
c8e4131d | 1308 | bqrelse(struct buf *bp) |
984263bc | 1309 | { |
e43a034f | 1310 | crit_enter(); |
984263bc MD |
1311 | |
1312 | KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); | |
1313 | ||
b3098c79 | 1314 | if (bp->b_qindex != BQUEUE_NONE) |
984263bc | 1315 | panic("bqrelse: free buffer onto another queue???"); |
77bb9400 | 1316 | if (BUF_REFCNTNB(bp) > 1) { |
984263bc MD |
1317 | /* do not release to free list */ |
1318 | panic("bqrelse: multiple refs"); | |
1319 | BUF_UNLOCK(bp); | |
e43a034f | 1320 | crit_exit(); |
984263bc MD |
1321 | return; |
1322 | } | |
1323 | if (bp->b_flags & B_LOCKED) { | |
1324 | bp->b_flags &= ~B_ERROR; | |
b3098c79 HP |
1325 | bp->b_qindex = BQUEUE_LOCKED; |
1326 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); | |
984263bc MD |
1327 | /* buffers with stale but valid contents */ |
1328 | } else if (bp->b_flags & B_DELWRI) { | |
b3098c79 HP |
1329 | bp->b_qindex = BQUEUE_DIRTY; |
1330 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); | |
984263bc MD |
1331 | } else if (vm_page_count_severe()) { |
1332 | /* | |
1333 | * We are too low on memory, we have to try to free the | |
1334 | * buffer (most importantly: the wired pages making up its | |
1335 | * backing store) *now*. | |
1336 | */ | |
e43a034f | 1337 | crit_exit(); |
984263bc MD |
1338 | brelse(bp); |
1339 | return; | |
1340 | } else { | |
b3098c79 HP |
1341 | bp->b_qindex = BQUEUE_CLEAN; |
1342 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); | |
984263bc MD |
1343 | } |
1344 | ||
1345 | if ((bp->b_flags & B_LOCKED) == 0 && | |
1346 | ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { | |
1347 | bufcountwakeup(); | |
1348 | } | |
1349 | ||
1350 | /* | |
1351 | * Something we can maybe free or reuse. | |
1352 | */ | |
1353 | if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) | |
1354 | bufspacewakeup(); | |
1355 | ||
9188c711 MD |
1356 | /* |
1357 | * Final cleanup and unlock. Clear bits that are only used while a | |
1358 | * buffer is actively locked. | |
1359 | */ | |
984263bc | 1360 | bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); |
9188c711 | 1361 | BUF_UNLOCK(bp); |
e43a034f | 1362 | crit_exit(); |
984263bc MD |
1363 | } |
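/*
 * Illustrative note (not part of the original source): brelse() versus
 * bqrelse() from a caller's point of view.
 *
 *	if (error)
 *		brelse(bp);	(give the buffer up, possibly destroying it)
 *	else
 *		bqrelse(bp);	(keep it cached, we expect to use it again)
 */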
1364 | ||
3f779080 HP |
1365 | /* |
1366 | * vfs_vmio_release: | |
1367 | * | |
1368 | * Return backing pages held by the buffer 'bp' back to the VM system | |
1369 | * if possible. The pages are freed if they are no longer valid, or |
1370 | * an attempt is made to free them if the buffer was used for direct |
1371 | * I/O; otherwise they are sent to the page cache. |
1372 | * | |
1373 | * Pages that were marked busy are left alone and skipped. | |
1374 | * | |
1375 | * The KVA mapping (b_data) for the underlying pages is removed by | |
1376 | * this function. | |
1377 | */ | |
984263bc | 1378 | static void |
493c516a | 1379 | vfs_vmio_release(struct buf *bp) |
984263bc | 1380 | { |
e43a034f | 1381 | int i; |
984263bc MD |
1382 | vm_page_t m; |
1383 | ||
e43a034f | 1384 | crit_enter(); |
54f51aeb HP |
1385 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
1386 | m = bp->b_xio.xio_pages[i]; | |
1387 | bp->b_xio.xio_pages[i] = NULL; | |
984263bc MD |
1388 | /* |
1389 | * In order to keep page LRU ordering consistent, put | |
1390 | * everything on the inactive queue. | |
1391 | */ | |
1392 | vm_page_unwire(m, 0); | |
1393 | /* | |
1394 | * We don't mess with busy pages, it is | |
1395 | * the responsibility of the process that | |
1396 | * busied the pages to deal with them. | |
1397 | */ | |
1398 | if ((m->flags & PG_BUSY) || (m->busy != 0)) | |
1399 | continue; | |
1400 | ||
1401 | if (m->wire_count == 0) { | |
1402 | vm_page_flag_clear(m, PG_ZERO); | |
1403 | /* | |
1404 | * Might as well free the page if we can and it has | |
1405 | * no valid data. We also free the page if the | |
1406 | * buffer was used for direct I/O. | |
1407 | */ | |
3f779080 HP |
1408 | if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && |
1409 | m->hold_count == 0) { | |
984263bc MD |
1410 | vm_page_busy(m); |
1411 | vm_page_protect(m, VM_PROT_NONE); | |
1412 | vm_page_free(m); | |
1413 | } else if (bp->b_flags & B_DIRECT) { | |
1414 | vm_page_try_to_free(m); | |
1415 | } else if (vm_page_count_severe()) { | |
1416 | vm_page_try_to_cache(m); | |
1417 | } | |
1418 | } | |
1419 | } | |
e43a034f | 1420 | crit_exit(); |
54f51aeb | 1421 | pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages); |
984263bc MD |
1422 | if (bp->b_bufsize) { |
1423 | bufspacewakeup(); | |
1424 | bp->b_bufsize = 0; | |
1425 | } | |
54f51aeb | 1426 | bp->b_xio.xio_npages = 0; |
984263bc MD |
1427 | bp->b_flags &= ~B_VMIO; |
1428 | if (bp->b_vp) | |
1429 | brelvp(bp); | |
1430 | } | |
1431 | ||
984263bc | 1432 | /* |
3f779080 | 1433 | * vfs_bio_awrite: |
984263bc MD |
1434 | * |
1435 | * Implement clustered async writes for clearing out B_DELWRI buffers. | |
1436 | * This is much better than the old way of writing only one buffer at |
1437 | * a time. Note that we may not be presented with the buffers in the | |
1438 | * correct order, so we search for the cluster in both directions. | |
6f68d895 MD |
1439 | * |
1440 | * The buffer is locked on call. | |
984263bc MD |
1441 | */ |
1442 | int | |
6f68d895 | 1443 | vfs_bio_awrite(struct buf *bp) |
984263bc MD |
1444 | { |
1445 | int i; | |
1446 | int j; | |
54078292 | 1447 | off_t loffset = bp->b_loffset; |
984263bc | 1448 | struct vnode *vp = bp->b_vp; |
54078292 | 1449 | int nbytes; |
984263bc MD |
1450 | struct buf *bpa; |
1451 | int nwritten; | |
1452 | int size; | |
984263bc | 1453 | |
e43a034f | 1454 | crit_enter(); |
984263bc MD |
1455 | /* |
1456 | * right now we support clustered writing only to regular files. If | |
1457 | * we find a clusterable block we could be in the middle of a cluster | |
1458 | * rather than at the beginning. |
81b5c339 | 1459 | * |
54078292 MD |
1460 | * NOTE: b_bio1 contains the logical loffset and is aliased |
1461 | * to b_loffset. b_bio2 contains the translated block number. | |
984263bc MD |
1462 | */ |
1463 | if ((vp->v_type == VREG) && | |
1464 | (vp->v_mount != 0) && /* Only on nodes that have the size info */ | |
1465 | (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { | |
1466 | ||
1467 | size = vp->v_mount->mnt_stat.f_iosize; | |
984263bc | 1468 | |
54078292 MD |
1469 | for (i = size; i < MAXPHYS; i += size) { |
1470 | if ((bpa = findblk(vp, loffset + i)) && | |
984263bc MD |
1471 | BUF_REFCNT(bpa) == 0 && |
1472 | ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == | |
1473 | (B_DELWRI | B_CLUSTEROK)) && | |
1474 | (bpa->b_bufsize == size)) { | |
54078292 MD |
1475 | if ((bpa->b_bio2.bio_offset == NOOFFSET) || |
1476 | (bpa->b_bio2.bio_offset != | |
1477 | bp->b_bio2.bio_offset + i)) | |
984263bc MD |
1478 | break; |
1479 | } else { | |
1480 | break; | |
1481 | } | |
1482 | } | |
54078292 MD |
1483 | for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) { |
1484 | if ((bpa = findblk(vp, loffset - j)) && | |
984263bc MD |
1485 | BUF_REFCNT(bpa) == 0 && |
1486 | ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == | |
1487 | (B_DELWRI | B_CLUSTEROK)) && | |
1488 | (bpa->b_bufsize == size)) { | |
54078292 MD |
1489 | if ((bpa->b_bio2.bio_offset == NOOFFSET) || |
1490 | (bpa->b_bio2.bio_offset != | |
1491 | bp->b_bio2.bio_offset - j)) | |
984263bc MD |
1492 | break; |
1493 | } else { | |
1494 | break; | |
1495 | } | |
1496 | } | |
54078292 MD |
1497 | j -= size; |
1498 | nbytes = (i + j); | |
984263bc MD |
1499 | /* |
1500 | * this is a possible cluster write | |
1501 | */ | |
54078292 | 1502 | if (nbytes != size) { |
6f68d895 | 1503 | BUF_UNLOCK(bp); |
54078292 MD |
1504 | nwritten = cluster_wbuild(vp, size, |
1505 | loffset - j, nbytes); | |
e43a034f | 1506 | crit_exit(); |
984263bc MD |
1507 | return nwritten; |
1508 | } | |
1509 | } | |
1510 | ||
984263bc MD |
1511 | bremfree(bp); |
1512 | bp->b_flags |= B_ASYNC; | |
1513 | ||
e43a034f | 1514 | crit_exit(); |
984263bc MD |
1515 | /* |
1516 | * default (old) behavior, writing out only one block | |
1517 | * | |
1518 | * XXX returns b_bufsize instead of b_bcount for nwritten? | |
1519 | */ | |
1520 | nwritten = bp->b_bufsize; | |
62cfda27 | 1521 | bwrite(bp); |
984263bc MD |
1522 | |
1523 | return nwritten; | |
1524 | } | |
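/*
 * Typical usage (sketch): a caller such as flushbufqueues() below hands
 * a locked, dirty buffer to vfs_bio_awrite() and must not reference it
 * afterwards, since the routine either folds it into a larger cluster
 * write or writes it out by itself:
 *
 *	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *		nwritten = vfs_bio_awrite(bp);
 *		...
 *	}
 */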
1525 | ||
1526 | /* | |
3f779080 | 1527 | * getnewbuf: |
984263bc MD |
1528 | * |
1529 | * Find and initialize a new buffer header, freeing up existing buffers | |
1530 | * in the bufqueues as necessary. The new buffer is returned locked. | |
1531 | * | |
1532 | * Important: B_INVAL is not set. If the caller wishes to throw the | |
1533 | * buffer away, the caller must set B_INVAL prior to calling brelse(). | |
1534 | * | |
1535 | * We block if: | |
1536 | * We have insufficient buffer headers | |
1537 | * We have insufficient buffer space | |
1538 | * buffer_map is too fragmented ( space reservation fails ) | |
1539 | * If we have to flush dirty buffers ( but we try to avoid this ) | |
1540 | * | |
1541 | * To avoid VFS layer recursion we do not flush dirty buffers ourselves. | |
1542 | * Instead we ask the buf daemon to do it for us. We attempt to | |
1543 | * avoid piecemeal wakeups of the pageout daemon. | |
1544 | */ | |
1545 | ||
1546 | static struct buf * | |
1547 | getnewbuf(int slpflag, int slptimeo, int size, int maxsize) | |
1548 | { | |
1549 | struct buf *bp; | |
1550 | struct buf *nbp; | |
1551 | int defrag = 0; | |
1552 | int nqindex; | |
1553 | static int flushingbufs; | |
1554 | ||
1555 | /* | |
1556 | * We can't afford to block since we might be holding a vnode lock, | |
1557 | * which may prevent system daemons from running. We deal with | |
1558 | * low-memory situations by proactively returning memory and running | |
1559 | * async I/O rather than sync I/O. |
1560 | */ | |
1561 | ||
1562 | ++getnewbufcalls; | |
1563 | --getnewbufrestarts; | |
1564 | restart: | |
1565 | ++getnewbufrestarts; | |
1566 | ||
1567 | /* | |
1568 | * Setup for scan. If we do not have enough free buffers, | |
1569 | * we setup a degenerate case that immediately fails. Note | |
1570 | * that if we are a specially marked process, we are allowed to |
1571 | * dip into our reserves. | |
1572 | * | |
1573 | * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN | |
1574 | * | |
1575 | * We start with EMPTYKVA. If the list is empty we backup to EMPTY. | |
1576 | * However, there are a number of cases (defragging, reusing, ...) | |
1577 | * where we cannot backup. | |
1578 | */ | |
b3098c79 HP |
1579 | nqindex = BQUEUE_EMPTYKVA; |
1580 | nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]); | |
984263bc MD |
1581 | |
1582 | if (nbp == NULL) { | |
1583 | /* | |
1584 | * If no EMPTYKVA buffers and we are either | |
1585 | * defragging or reusing, locate a CLEAN buffer | |
1586 | * to free or reuse. If bufspace usage is low |
1587 | * skip this step so we can allocate a new buffer. | |
1588 | */ | |
1589 | if (defrag || bufspace >= lobufspace) { | |
b3098c79 HP |
1590 | nqindex = BQUEUE_CLEAN; |
1591 | nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); | |
984263bc MD |
1592 | } |
1593 | ||
1594 | /* | |
1595 | * If we could not find or were not allowed to reuse a | |
1596 | * CLEAN buffer, check to see if it is ok to use an EMPTY | |
1597 | * buffer. We can only use an EMPTY buffer if allocating | |
1598 | * its KVA would not otherwise run us out of buffer space. | |
1599 | */ | |
1600 | if (nbp == NULL && defrag == 0 && | |
1601 | bufspace + maxsize < hibufspace) { | |
b3098c79 HP |
1602 | nqindex = BQUEUE_EMPTY; |
1603 | nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]); | |
984263bc MD |
1604 | } |
1605 | } | |
1606 | ||
1607 | /* | |
1608 | * Run the scan, possibly freeing data and/or kva mappings on the fly, |
1609 | * depending on which queue the buffer came from. |
1610 | */ | |
1611 | ||
1612 | while ((bp = nbp) != NULL) { | |
1613 | int qindex = nqindex; | |
1614 | ||
1615 | /* | |
1616 | * Calculate next bp ( we can only use it if we do not block | |
1617 | * or do other fancy things ). | |
1618 | */ | |
1619 | if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { | |
1620 | switch(qindex) { | |
b3098c79 HP |
1621 | case BQUEUE_EMPTY: |
1622 | nqindex = BQUEUE_EMPTYKVA; | |
1623 | if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]))) | |
984263bc MD |
1624 | break; |
1625 | /* fall through */ | |
b3098c79 HP |
1626 | case BQUEUE_EMPTYKVA: |
1627 | nqindex = BQUEUE_CLEAN; | |
1628 | if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]))) | |
984263bc MD |
1629 | break; |
1630 | /* fall through */ | |
b3098c79 | 1631 | case BQUEUE_CLEAN: |
984263bc MD |
1632 | /* |
1633 | * nbp is NULL. | |
1634 | */ | |
1635 | break; | |
1636 | } | |
1637 | } | |
1638 | ||
1639 | /* | |
1640 | * Sanity Checks | |
1641 | */ | |
1642 | KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); |
1643 | ||
1644 | /* | |
1645 | * Note: we no longer distinguish between VMIO and non-VMIO | |
1646 | * buffers. | |
1647 | */ | |
1648 | ||
1649 | KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); | |
1650 | ||
1651 | /* | |
1652 | * If we are defragging then we need a buffer with | |
1653 | * b_kvasize != 0. XXX this situation should no longer | |
1654 | * occur, if defrag is non-zero the buffer's b_kvasize | |
1655 | * should also be non-zero at this point. XXX | |
1656 | */ | |
1657 | if (defrag && bp->b_kvasize == 0) { | |
6ea70f76 | 1658 | kprintf("Warning: defrag empty buffer %p\n", bp); |
984263bc MD |
1659 | continue; |
1660 | } | |
1661 | ||
1662 | /* | |
1663 | * Start freeing the bp. This is somewhat involved. nbp | |
b3098c79 | 1664 | * remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers |
9188c711 MD |
1665 | * on the clean list must be disassociated from their |
1666 | * current vnode. Buffers on the empty[kva] lists have | |
1667 | * already been disassociated. | |
984263bc MD |
1668 | */ |
1669 | ||
d9dba6f6 | 1670 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { |
6ea70f76 | 1671 | kprintf("getnewbuf: warning, locked buf %p, race corrected\n", bp); |
d9dba6f6 MD |
1672 | tsleep(&bd_request, 0, "gnbxxx", hz / 100); |
1673 | goto restart; | |
1674 | } | |
1675 | if (bp->b_qindex != qindex) { | |
6ea70f76 | 1676 | kprintf("getnewbuf: warning, BUF_LOCK blocked unexpectedly on buf %p index %d->%d, race corrected\n", bp, qindex, bp->b_qindex); |
d9dba6f6 MD |
1677 | BUF_UNLOCK(bp); |
1678 | goto restart; | |
1679 | } | |
984263bc MD |
1680 | bremfree(bp); |
1681 | ||
b3098c79 | 1682 | if (qindex == BQUEUE_CLEAN) { |
984263bc MD |
1683 | if (bp->b_flags & B_VMIO) { |
1684 | bp->b_flags &= ~B_ASYNC; | |
1685 | vfs_vmio_release(bp); | |
1686 | } | |
1687 | if (bp->b_vp) | |
1688 | brelvp(bp); | |
1689 | } | |
1690 | ||
1691 | /* | |
1692 | * NOTE: nbp is now entirely invalid. We can only restart | |
1693 | * the scan from this point on. | |
1694 | * | |
1695 | * Get the rest of the buffer freed up. b_kva* is still | |
1696 | * valid after this operation. | |
1697 | */ | |
1698 | ||
54078292 | 1699 | KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08x vnode %p qindex %d unexpectedly still associated!", bp, bp->b_flags, bp->b_vp, qindex)); |
1f1ea522 | 1700 | KKASSERT((bp->b_flags & B_HASHED) == 0); |
984263bc MD |
1701 | if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) |
1702 | (*bioops.io_deallocate)(bp); | |
984263bc | 1703 | |
06ecca5a | 1704 | /* |
e43a034f MD |
1705 | * critical section protection is not required when |
1706 | * scrapping a buffer's contents because it is already | |
1707 | * wired. | |
06ecca5a | 1708 | */ |
984263bc MD |
1709 | if (bp->b_bufsize) |
1710 | allocbuf(bp, 0); | |
1711 | ||
4414f2c9 | 1712 | bp->b_flags = B_BNOCLIP; |
10f3fee5 | 1713 | bp->b_cmd = BUF_CMD_DONE; |
984263bc | 1714 | bp->b_vp = NULL; |
984263bc MD |
1715 | bp->b_error = 0; |
1716 | bp->b_resid = 0; | |
1717 | bp->b_bcount = 0; | |
54f51aeb | 1718 | bp->b_xio.xio_npages = 0; |
984263bc | 1719 | bp->b_dirtyoff = bp->b_dirtyend = 0; |
81b5c339 | 1720 | reinitbufbio(bp); |
984263bc MD |
1721 | |
1722 | LIST_INIT(&bp->b_dep); | |
1723 | ||
1724 | /* | |
1725 | * If we are defragging then free the buffer. | |
1726 | */ | |
1727 | if (defrag) { | |
1728 | bp->b_flags |= B_INVAL; | |
1729 | bfreekva(bp); | |
1730 | brelse(bp); | |
1731 | defrag = 0; | |
1732 | goto restart; | |
1733 | } | |
1734 | ||
1735 | /* | |
1736 | * If we are overcommitted then recover the buffer and its |
1737 | * KVM space. This occurs in rare situations when multiple | |
1738 | * processes are blocked in getnewbuf() or allocbuf(). | |
1739 | */ | |
1740 | if (bufspace >= hibufspace) | |
1741 | flushingbufs = 1; | |
1742 | if (flushingbufs && bp->b_kvasize != 0) { | |
1743 | bp->b_flags |= B_INVAL; | |
1744 | bfreekva(bp); | |
1745 | brelse(bp); | |
1746 | goto restart; | |
1747 | } | |
1748 | if (bufspace < lobufspace) | |
1749 | flushingbufs = 0; | |
1750 | break; | |
1751 | } | |
1752 | ||
1753 | /* | |
1754 | * If we exhausted our list, sleep as appropriate. We may have to | |
1755 | * wakeup various daemons and write out some dirty buffers. | |
1756 | * | |
1757 | * Generally we are sleeping due to insufficient buffer space. | |
1758 | */ | |
1759 | ||
1760 | if (bp == NULL) { | |
1761 | int flags; | |
1762 | char *waitmsg; | |
1763 | ||
1764 | if (defrag) { | |
1765 | flags = VFS_BIO_NEED_BUFSPACE; | |
1766 | waitmsg = "nbufkv"; | |
1767 | } else if (bufspace >= hibufspace) { | |
1768 | waitmsg = "nbufbs"; | |
1769 | flags = VFS_BIO_NEED_BUFSPACE; | |
1770 | } else { | |
1771 | waitmsg = "newbuf"; | |
1772 | flags = VFS_BIO_NEED_ANY; | |
1773 | } | |
1774 | ||
1775 | bd_speedup(); /* heeeelp */ | |
1776 | ||
1777 | needsbuffer |= flags; | |
1778 | while (needsbuffer & flags) { | |
377d4740 | 1779 | if (tsleep(&needsbuffer, slpflag, waitmsg, slptimeo)) |
984263bc MD |
1780 | return (NULL); |
1781 | } | |
1782 | } else { | |
1783 | /* | |
1784 | * We finally have a valid bp. We aren't quite out of the | |
1785 | * woods, we still have to reserve kva space. In order | |
1786 | * to keep fragmentation sane we only allocate kva in | |
1787 | * BKVASIZE chunks. | |
1788 | */ | |
1789 | maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; | |
1790 | ||
1791 | if (maxsize != bp->b_kvasize) { | |
1792 | vm_offset_t addr = 0; | |
a108bf71 | 1793 | int count; |
984263bc MD |
1794 | |
1795 | bfreekva(bp); | |
1796 | ||
a108bf71 | 1797 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
e4846942 | 1798 | vm_map_lock(&buffer_map); |
984263bc | 1799 | |
e4846942 MD |
1800 | if (vm_map_findspace(&buffer_map, |
1801 | vm_map_min(&buffer_map), maxsize, | |
e9bb90e8 | 1802 | maxsize, &addr)) { |
984263bc | 1803 | /* |
3f779080 | 1804 | * Uh oh. Buffer map is too fragmented. We |
984263bc MD |
1805 | * must defragment the map. |
1806 | */ | |
e4846942 | 1807 | vm_map_unlock(&buffer_map); |
a108bf71 | 1808 | vm_map_entry_release(count); |
984263bc MD |
1809 | ++bufdefragcnt; |
1810 | defrag = 1; | |
1811 | bp->b_flags |= B_INVAL; | |
1812 | brelse(bp); | |
1813 | goto restart; | |
1814 | } | |
1815 | if (addr) { | |
e4846942 | 1816 | vm_map_insert(&buffer_map, &count, |
a108bf71 | 1817 | NULL, 0, |
984263bc | 1818 | addr, addr + maxsize, |
1b874851 MD |
1819 | VM_MAPTYPE_NORMAL, |
1820 | VM_PROT_ALL, VM_PROT_ALL, | |
1821 | MAP_NOFAULT); | |
984263bc MD |
1822 | |
1823 | bp->b_kvabase = (caddr_t) addr; | |
1824 | bp->b_kvasize = maxsize; | |
1825 | bufspace += bp->b_kvasize; | |
1826 | ++bufreusecnt; | |
1827 | } | |
e4846942 | 1828 | vm_map_unlock(&buffer_map); |
a108bf71 | 1829 | vm_map_entry_release(count); |
984263bc MD |
1830 | } |
1831 | bp->b_data = bp->b_kvabase; | |
1832 | } | |
1833 | return(bp); | |
1834 | } | |
1835 | ||
1836 | /* | |
3f779080 | 1837 | * buf_daemon: |
984263bc | 1838 | * |
3f779080 | 1839 | * Buffer flushing daemon. Buffers are normally flushed by the |
984263bc MD |
1840 | * update daemon but if it cannot keep up this process starts to |
1841 | * take the load in an attempt to prevent getnewbuf() from blocking. | |
1842 | */ | |
1843 | ||
bc6dffab | 1844 | static struct thread *bufdaemonthread; |
984263bc MD |
1845 | |
1846 | static struct kproc_desc buf_kp = { | |
1847 | "bufdaemon", | |
1848 | buf_daemon, | |
bc6dffab | 1849 | &bufdaemonthread |
984263bc MD |
1850 | }; |
1851 | SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) | |
1852 | ||
1853 | static void | |
c972a82f | 1854 | buf_daemon(void) |
984263bc | 1855 | { |
984263bc MD |
1856 | /* |
1857 | * This process needs to be suspended prior to shutdown sync. | |
1858 | */ | |
bc6dffab MD |
1859 | EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, |
1860 | bufdaemonthread, SHUTDOWN_PRI_LAST); | |
984263bc MD |
1861 | |
1862 | /* | |
1863 | * This process is allowed to take the buffer cache to the limit | |
1864 | */ | |
e43a034f | 1865 | crit_enter(); |
984263bc MD |
1866 | |
1867 | for (;;) { | |
0cfcada1 | 1868 | kproc_suspend_loop(); |
984263bc MD |
1869 | |
1870 | /* | |
1871 | * Do the flush. Limit the amount of in-transit I/O we | |
1872 | * allow to build up, otherwise we would completely saturate | |
1873 | * the I/O system. Wakeup any waiting processes before we | |
1874 | * normally would so they can run in parallel with our drain. | |
1875 | */ | |
1876 | while (numdirtybuffers > lodirtybuffers) { | |
1877 | if (flushbufqueues() == 0) | |
1878 | break; | |
1879 | waitrunningbufspace(); | |
1880 | numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); | |
1881 | } | |
1882 | ||
1883 | /* | |
1884 | * Only clear bd_request if we have reached our low water | |
1885 | * mark. The buf_daemon normally waits 5 seconds and | |
1886 | * then incrementally flushes any dirty buffers that have | |
1887 | * built up, within reason. | |
1888 | * | |
1889 | * If we were unable to hit our low water mark and couldn't | |
1890 | * find any flushable buffers, we sleep half a second. | |
1891 | * Otherwise we loop immediately. | |
1892 | */ | |
1893 | if (numdirtybuffers <= lodirtybuffers) { | |
1894 | /* | |
1895 | * We reached our low water mark, reset the | |
1896 | * request and sleep until we are needed again. | |
1897 | * The sleep is just so the suspend code works. | |
1898 | */ | |
f832287e | 1899 | spin_lock_wr(&needsbuffer_spin); |
984263bc | 1900 | bd_request = 0; |
f832287e MD |
1901 | msleep(&bd_request, &needsbuffer_spin, 0, "psleep", hz); |
1902 | spin_unlock_wr(&needsbuffer_spin); | |
984263bc MD |
1903 | } else { |
1904 | /* | |
1905 | * We couldn't find any flushable dirty buffers but | |
1906 | * still have too many dirty buffers, we | |
1907 | * have to sleep and try again. (rare) | |
1908 | */ | |
377d4740 | 1909 | tsleep(&bd_request, 0, "qsleep", hz / 2); |
984263bc MD |
1910 | } |
1911 | } | |
1912 | } | |
1913 | ||
1914 | /* | |
3f779080 | 1915 | * flushbufqueues: |
984263bc MD |
1916 | * |
1917 | * Try to flush a buffer in the dirty queue. We must be careful to | |
1918 | * free up B_INVAL buffers instead of writing them, which NFS is |
1919 | * particularly sensitive to. | |
1920 | */ | |
1921 | ||
1922 | static int | |
1923 | flushbufqueues(void) | |
1924 | { | |
1925 | struct buf *bp; | |
1926 | int r = 0; | |
1927 | ||
b3098c79 | 1928 | bp = TAILQ_FIRST(&bufqueues[BQUEUE_DIRTY]); |
984263bc MD |
1929 | |
1930 | while (bp) { | |
1931 | KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); | |
70371608 | 1932 | if (bp->b_flags & B_DELWRI) { |
984263bc MD |
1933 | if (bp->b_flags & B_INVAL) { |
1934 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) | |
1935 | panic("flushbufqueues: locked buf"); | |
1936 | bremfree(bp); | |
1937 | brelse(bp); | |
1938 | ++r; | |
1939 | break; | |
1940 | } | |
1941 | if (LIST_FIRST(&bp->b_dep) != NULL && | |
1942 | bioops.io_countdeps && | |
1943 | (bp->b_flags & B_DEFERRED) == 0 && | |
1944 | (*bioops.io_countdeps)(bp, 0)) { | |
b3098c79 | 1945 | TAILQ_REMOVE(&bufqueues[BQUEUE_DIRTY], |
6f68d895 | 1946 | bp, b_freelist); |
b3098c79 | 1947 | TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], |
6f68d895 | 1948 | bp, b_freelist); |
984263bc | 1949 | bp->b_flags |= B_DEFERRED; |
b3098c79 | 1950 | bp = TAILQ_FIRST(&bufqueues[BQUEUE_DIRTY]); |
984263bc MD |
1951 | continue; |
1952 | } | |
6f68d895 MD |
1953 | |
1954 | /* | |
1955 | * Only write it out if we can successfully lock | |
1956 | * it. | |
1957 | */ | |
1958 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { | |
1959 | vfs_bio_awrite(bp); | |
1960 | ++r; | |
1961 | break; | |
1962 | } | |
984263bc MD |
1963 | } |
1964 | bp = TAILQ_NEXT(bp, b_freelist); | |
1965 | } | |
1966 | return (r); | |
1967 | } | |
1968 | ||
984263bc | 1969 | /* |
3f779080 HP |
1970 | * inmem: |
1971 | * | |
1972 | * Returns true if no I/O is needed to access the associated VM object. | |
1f1ea522 | 1973 | * This is like findblk except it also hunts around in the VM system for |
3f779080 | 1974 | * the data. |
06ecca5a | 1975 | * |
3f779080 HP |
1976 | * Note that we ignore vm_page_free() races from interrupts against our |
1977 | * lookup, since if the caller is not protected our return value will not | |
1978 | * be any more valid than otherwise once we exit the critical section. |
984263bc | 1979 | */ |
984263bc | 1980 | int |
54078292 | 1981 | inmem(struct vnode *vp, off_t loffset) |
984263bc MD |
1982 | { |
1983 | vm_object_t obj; | |
1984 | vm_offset_t toff, tinc, size; | |
1985 | vm_page_t m; | |
984263bc | 1986 | |
54078292 | 1987 | if (findblk(vp, loffset)) |
984263bc MD |
1988 | return 1; |
1989 | if (vp->v_mount == NULL) | |
1990 | return 0; | |
7540ab49 MD |
1991 | if ((obj = vp->v_object) == NULL) |
1992 | return 0; | |
984263bc MD |
1993 | |
1994 | size = PAGE_SIZE; | |
1995 | if (size > vp->v_mount->mnt_stat.f_iosize) | |
1996 | size = vp->v_mount->mnt_stat.f_iosize; | |
984263bc MD |
1997 | |
1998 | for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { | |
54078292 MD |
1999 | m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); |
2000 | if (m == NULL) | |
984263bc MD |
2001 | return 0; |
2002 | tinc = size; | |
54078292 MD |
2003 | if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK)) |
2004 | tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK); | |
984263bc | 2005 | if (vm_page_is_valid(m, |
54078292 | 2006 | (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) |
984263bc MD |
2007 | return 0; |
2008 | } | |
2009 | return 1; | |
2010 | } | |
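/*
 * Usage sketch (illustrative only): read-ahead style code can consult
 * inmem() to avoid queueing I/O for data that is already resident.  The
 * next_loffset variable below is hypothetical.
 *
 *	if (!inmem(vp, next_loffset)) {
 *		... issue an asynchronous read for next_loffset ...
 *	}
 */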
2011 | ||
2012 | /* | |
3f779080 | 2013 | * vfs_setdirty: |
984263bc MD |
2014 | * |
2015 | * Sets the dirty range for a buffer based on the status of the dirty | |
2016 | * bits in the pages comprising the buffer. | |
2017 | * | |
2018 | * The range is limited to the size of the buffer. | |
2019 | * | |
2020 | * This routine is primarily used by NFS, but is generalized for the | |
2021 | * B_VMIO case. | |
2022 | */ | |
2023 | static void | |
2024 | vfs_setdirty(struct buf *bp) | |
2025 | { | |
2026 | int i; | |
2027 | vm_object_t object; | |
2028 | ||
2029 | /* | |
2030 | * Degenerate case - empty buffer | |
2031 | */ | |
2032 | ||
2033 | if (bp->b_bufsize == 0) | |
2034 | return; | |
2035 | ||
2036 | /* | |
2037 | * We qualify the scan for modified pages on whether the | |
2038 | * object has been flushed yet. The OBJ_WRITEABLE flag | |
2039 | * is not cleared simply by protecting pages off. | |
2040 | */ | |
2041 | ||
2042 | if ((bp->b_flags & B_VMIO) == 0) | |
2043 | return; | |
2044 | ||
54f51aeb | 2045 | object = bp->b_xio.xio_pages[0]->object; |
984263bc MD |
2046 | |
2047 | if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) | |
6ea70f76 | 2048 | kprintf("Warning: object %p writeable but not mightbedirty\n", object); |
984263bc | 2049 | if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) |
6ea70f76 | 2050 | kprintf("Warning: object %p mightbedirty but not writeable\n", object); |
984263bc MD |
2051 | |
2052 | if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { | |
2053 | vm_offset_t boffset; | |
2054 | vm_offset_t eoffset; | |
2055 | ||
2056 | /* | |
2057 | * test the pages to see if they have been modified directly | |
2058 | * by users through the VM system. | |
2059 | */ | |
54f51aeb HP |
2060 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
2061 | vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO); | |
2062 | vm_page_test_dirty(bp->b_xio.xio_pages[i]); | |
984263bc MD |
2063 | } |
2064 | ||
2065 | /* | |
2066 | * Calculate the encompassing dirty range, boffset and eoffset, | |
2067 | * (eoffset - boffset) bytes. | |
2068 | */ | |
2069 | ||
54f51aeb HP |
2070 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
2071 | if (bp->b_xio.xio_pages[i]->dirty) | |
984263bc MD |
2072 | break; |
2073 | } | |
81b5c339 | 2074 | boffset = (i << PAGE_SHIFT) - (bp->b_loffset & PAGE_MASK); |
984263bc | 2075 | |
54f51aeb HP |
2076 | for (i = bp->b_xio.xio_npages - 1; i >= 0; --i) { |
2077 | if (bp->b_xio.xio_pages[i]->dirty) { | |
984263bc MD |
2078 | break; |
2079 | } | |
2080 | } | |
81b5c339 | 2081 | eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_loffset & PAGE_MASK); |
984263bc MD |
2082 | |
2083 | /* | |
2084 | * Fit it to the buffer. | |
2085 | */ | |
2086 | ||
2087 | if (eoffset > bp->b_bcount) | |
2088 | eoffset = bp->b_bcount; | |
2089 | ||
2090 | /* | |
2091 | * If we have a good dirty range, merge with the existing | |
2092 | * dirty range. | |
2093 | */ | |
2094 | ||
2095 | if (boffset < eoffset) { | |
2096 | if (bp->b_dirtyoff > boffset) | |
2097 | bp->b_dirtyoff = boffset; | |
2098 | if (bp->b_dirtyend < eoffset) | |
2099 | bp->b_dirtyend = eoffset; | |
2100 | } | |
2101 | } | |
2102 | } | |
2103 | ||
1f1ea522 MD |
2104 | /* |
2105 | * findblk: | |
2106 | * | |
2107 | * Locate and return the specified buffer, or NULL if the buffer does | |
2108 | * not exist. Do not attempt to lock the buffer or manipulate it in | |
2109 | * any way. The caller must validate that the correct buffer has been | |
2110 | * obtained after locking it. |
2111 | */ | |
2112 | struct buf * | |
54078292 | 2113 | findblk(struct vnode *vp, off_t loffset) |
1f1ea522 MD |
2114 | { |
2115 | struct buf *bp; | |
2116 | ||
2117 | crit_enter(); | |
54078292 | 2118 | bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); |
1f1ea522 MD |
2119 | crit_exit(); |
2120 | return(bp); | |
2121 | } | |
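/*
 * Usage sketch: because findblk() returns an unlocked buffer, a caller
 * must lock the buffer and then re-validate its identity, exactly as
 * getblk() does below:
 *
 *	if ((bp = findblk(vp, loffset)) != NULL) {
 *		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *			if (bp->b_vp != vp || bp->b_loffset != loffset) {
 *				BUF_UNLOCK(bp);	  ... recycled, retry ...
 *			}
 *		}
 *	}
 */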
2122 | ||
984263bc | 2123 | /* |
3f779080 | 2124 | * getblk: |
984263bc MD |
2125 | * |
2126 | * Get a block given a specified block and offset into a file/device. | |
10f3fee5 MD |
2127 | * B_INVAL may or may not be set on return. The caller should clear |
2128 | * B_INVAL prior to initiating a READ. | |
984263bc | 2129 | * |
77bb9400 MD |
2130 | * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE |
2131 | * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ, | |
2132 | * OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer | |
2133 | * without doing any of those things the system will likely believe | |
2134 | * the buffer to be valid (especially if it is not B_VMIO), and the | |
2135 | * next getblk() will return the buffer with B_CACHE set. | |
2136 | * | |
984263bc MD |
2137 | * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for |
2138 | * an existing buffer. | |
2139 | * | |
2140 | * For a VMIO buffer, B_CACHE is modified according to the backing VM. | |
2141 | * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set | |
2142 | * and then cleared based on the backing VM. If the previous buffer is | |
2143 | * non-0-sized but invalid, B_CACHE will be cleared. | |
2144 | * | |
2145 | * If getblk() must create a new buffer, the new buffer is returned with | |
2146 | * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which | |
2147 | * case it is returned with B_INVAL clear and B_CACHE set based on the | |
2148 | * backing VM. | |
2149 | * | |
62cfda27 | 2150 | * getblk() also forces a bwrite() for any B_DELWRI buffer whose |
984263bc MD |
2151 | * B_CACHE bit is clear. |
2152 | * | |
2153 | * What this means, basically, is that the caller should use B_CACHE to | |
2154 | * determine whether the buffer is fully valid or not and should clear | |
2155 | * B_INVAL prior to issuing a read. If the caller intends to validate | |
2156 | * the buffer by loading its data area with something, the caller needs | |
2157 | * to clear B_INVAL. If the caller does this without issuing an I/O, | |
2158 | * the caller should set B_CACHE ( as an optimization ), else the caller | |
2159 | * should issue the I/O and biodone() will set B_CACHE if the I/O was | |
2160 | * a write attempt or if it was a successful read. If the caller |
2161 | * intends to issue a READ, the caller must clear B_INVAL and B_ERROR | |
2162 | * prior to issuing the READ. biodone() will *not* clear B_INVAL. | |
2163 | */ | |
2164 | struct buf * | |
54078292 | 2165 | getblk(struct vnode *vp, off_t loffset, int size, int slpflag, int slptimeo) |
984263bc MD |
2166 | { |
2167 | struct buf *bp; | |
984263bc MD |
2168 | |
2169 | if (size > MAXBSIZE) | |
fc92d4aa | 2170 | panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); |
7540ab49 MD |
2171 | if (vp->v_object == NULL) |
2172 | panic("getblk: vnode %p has no object!", vp); | |
984263bc | 2173 | |
e43a034f | 2174 | crit_enter(); |
984263bc MD |
2175 | loop: |
2176 | /* | |
2177 | * Block if we are low on buffers. Certain processes are allowed | |
2178 | * to completely exhaust the buffer cache. | |
2179 | * | |
2180 | * If this check ever becomes a bottleneck it may be better to | |
1f1ea522 | 2181 | * move it into the else, when findblk() fails. At the moment |
984263bc MD |
2182 | * it isn't a problem. |
2183 | * | |
2184 | * XXX remove, we cannot afford to block anywhere if holding a vnode | |
2185 | * lock in low-memory situation, so take it to the max. | |
2186 | */ | |
2187 | if (numfreebuffers == 0) { | |
2188 | if (!curproc) | |
2189 | return NULL; | |
2190 | needsbuffer |= VFS_BIO_NEED_ANY; | |
377d4740 | 2191 | tsleep(&needsbuffer, slpflag, "newbuf", slptimeo); |
984263bc MD |
2192 | } |
2193 | ||
54078292 | 2194 | if ((bp = findblk(vp, loffset))) { |
984263bc | 2195 | /* |
a0da602d MD |
2196 | * The buffer was found in the cache, but we need to lock it. |
2197 | * Even with LK_NOWAIT the lockmgr may break our critical | |
2198 | * section, so double-check the validity of the buffer | |
2199 | * once the lock has been obtained. | |
984263bc | 2200 | */ |
984263bc | 2201 | if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { |
f2770c70 MD |
2202 | int lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; |
2203 | if (slpflag & PCATCH) | |
2204 | lkflags |= LK_PCATCH; | |
2205 | if (BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo) == | |
2206 | ENOLCK) { | |
984263bc | 2207 | goto loop; |
f2770c70 | 2208 | } |
e43a034f | 2209 | crit_exit(); |
f2770c70 | 2210 | return (NULL); |
984263bc MD |
2211 | } |
2212 | ||
a0da602d MD |
2213 | /* |
2214 | * Once the buffer has been locked, make sure we didn't race | |
2215 | * a buffer recyclement. Buffers that are no longer hashed | |
2216 | * will have b_vp == NULL, so this takes care of that check | |
2217 | * as well. | |
2218 | */ | |
54078292 | 2219 | if (bp->b_vp != vp || bp->b_loffset != loffset) { |
6ea70f76 | 2220 | kprintf("Warning buffer %p (vp %p loffset %lld) was recycled\n", bp, vp, loffset); |
a9518ecf | 2221 | BUF_UNLOCK(bp); |
a0da602d MD |
2222 | goto loop; |
2223 | } | |
2224 | ||
4baec531 MD |
2225 | /* |
2226 | * All vnode-based buffers must be backed by a VM object. | |
2227 | */ | |
2228 | KKASSERT(bp->b_flags & B_VMIO); | |
10f3fee5 | 2229 | KKASSERT(bp->b_cmd == BUF_CMD_DONE); |
4baec531 | 2230 | |
a0da602d MD |
2231 | /* |
2232 | * Make sure that B_INVAL buffers do not have a cached | |
2233 | * block number translation. | |
2234 | */ | |
54078292 | 2235 | if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) { |
6ea70f76 | 2236 | kprintf("Warning invalid buffer %p (vp %p loffset %lld) did not have cleared bio_offset cache\n", bp, vp, loffset); |
81b5c339 | 2237 | clearbiocache(&bp->b_bio2); |
a0da602d MD |
2238 | } |
2239 | ||
984263bc MD |
2240 | /* |
2241 | * The buffer is locked. B_CACHE is cleared if the buffer is | |
4baec531 | 2242 | * invalid. |
984263bc MD |
2243 | */ |
2244 | if (bp->b_flags & B_INVAL) | |
2245 | bp->b_flags &= ~B_CACHE; | |
984263bc MD |
2246 | bremfree(bp); |
2247 | ||
2248 | /* | |
4baec531 MD |
2249 | * Any size inconsistency with a dirty buffer or a buffer |
2250 | * with a softupdates dependency must be resolved. Resizing |
2251 | * the buffer in such circumstances can lead to problems. | |
984263bc | 2252 | */ |
4baec531 MD |
2253 | if (size != bp->b_bcount) { |
2254 | if (bp->b_flags & B_DELWRI) { | |
2255 | bp->b_flags |= B_NOCACHE; | |
62cfda27 | 2256 | bwrite(bp); |
4baec531 MD |
2257 | } else if (LIST_FIRST(&bp->b_dep)) { |
2258 | bp->b_flags |= B_NOCACHE; | |
62cfda27 | 2259 | bwrite(bp); |
4baec531 MD |
2260 | } else { |
2261 | bp->b_flags |= B_RELBUF; | |
2262 | brelse(bp); | |
984263bc | 2263 | } |
4baec531 | 2264 | goto loop; |
984263bc | 2265 | } |
4baec531 | 2266 | KKASSERT(size <= bp->b_kvasize); |
81b5c339 MD |
2267 | KASSERT(bp->b_loffset != NOOFFSET, |
2268 | ("getblk: no buffer offset")); | |
984263bc MD |
2269 | |
2270 | /* | |
2271 | * A buffer with B_DELWRI set and B_CACHE clear must | |
2272 | * be committed before we can return the buffer in | |
2273 | * order to prevent the caller from issuing a read | |
2274 | * ( due to B_CACHE not being set ) and overwriting | |
2275 | * it. | |
2276 | * | |
2277 | * Most callers, including NFS and FFS, need this to | |
2278 | * operate properly either because they assume they | |
2279 | * can issue a read if B_CACHE is not set, or because | |
2280 | * ( for example ) an uncached B_DELWRI might loop due | |
2281 | * to softupdates re-dirtying the buffer. In the latter | |
2282 | * case, B_CACHE is set after the first write completes, | |
2283 | * preventing further loops. | |
2284 | * | |
2285 | * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE | |
2286 | * above while extending the buffer, we cannot allow the | |
2287 | * buffer to remain with B_CACHE set after the write | |
2288 | * completes or it will represent a corrupt state. To | |
2289 | * deal with this we set B_NOCACHE to scrap the buffer | |
2290 | * after the write. | |
2291 | * | |
2292 | * We might be able to do something fancy, like setting | |
2293 | * B_CACHE in bwrite() except if B_DELWRI is already set, | |
2294 | * so the below call doesn't set B_CACHE, but that gets real | |
2295 | * confusing. This is much easier. | |
2296 | */ | |
2297 | ||
2298 | if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { | |
2299 | bp->b_flags |= B_NOCACHE; | |
62cfda27 | 2300 | bwrite(bp); |
984263bc MD |
2301 | goto loop; |
2302 | } | |
e43a034f | 2303 | crit_exit(); |
984263bc MD |
2304 | } else { |
2305 | /* | |
2306 | * Buffer is not in-core, create new buffer. The buffer | |
2307 | * returned by getnewbuf() is locked. Note that the returned | |
2308 | * buffer is also considered valid (not marked B_INVAL). | |
21ab32bd MD |
2309 | * |
2310 | * Calculating the offset for the I/O requires figuring out | |
2311 | * the block size. We use DEV_BSIZE for VBLK or VCHR and | |
2312 | * the mount's f_iosize otherwise. If the vnode does not | |
2313 | * have an associated mount we assume that the passed size is | |
2314 | * the block size. | |
2315 | * | |
2316 | * Note that vn_isdisk() cannot be used here since it may | |
2317 | * return a failure for numerous reasons. Note that the | |
2318 | * buffer size may be larger then the block size (the caller | |
2319 | * will use block numbers with the proper multiple). Beware | |
2320 | * of using any v_* fields which are part of unions. In | |
2321 | * particular, in DragonFly the mount point overloading | |
1d505369 MD |
2322 | * mechanism uses the namecache only and the underlying |
2323 | * directory vnode is not a special case. | |
984263bc | 2324 | */ |
7540ab49 | 2325 | int bsize, maxsize; |
984263bc | 2326 | |
21ab32bd | 2327 | if (vp->v_type == VBLK || vp->v_type == VCHR) |
984263bc | 2328 | bsize = DEV_BSIZE; |
984263bc MD |
2329 | else if (vp->v_mount) |
2330 | bsize = vp->v_mount->mnt_stat.f_iosize; | |
2331 | else | |
2332 | bsize = size; | |
2333 | ||
7540ab49 | 2334 | maxsize = size + (loffset & PAGE_MASK); |
984263bc MD |
2335 | maxsize = imax(maxsize, bsize); |
2336 | ||
2337 | if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { | |
2338 | if (slpflag || slptimeo) { | |
e43a034f | 2339 | crit_exit(); |
984263bc MD |
2340 | return NULL; |
2341 | } | |
2342 | goto loop; | |
2343 | } | |
2344 | ||
2345 | /* | |
2346 | * This code is used to make sure that a buffer is not | |
2347 | * created while the getnewbuf routine is blocked. | |
2348 | * This can be a problem whether the vnode is locked or not. | |
2349 | * If the buffer is created out from under us, we have to | |
179024a5 | 2350 | * throw away the one we just created. There is no window |
e43a034f MD |
2351 | * race because we are safely running in a critical section |
2352 | * from the point of the duplicate buffer creation through | |
2353 | * to here, and we've locked the buffer. | |
984263bc | 2354 | */ |
54078292 | 2355 | if (findblk(vp, loffset)) { |
984263bc MD |
2356 | bp->b_flags |= B_INVAL; |
2357 | brelse(bp); | |
2358 | goto loop; | |
2359 | } | |
2360 | ||
2361 | /* | |
2362 | * Insert the buffer into the hash, so that it can | |
1f1ea522 MD |
2363 | * be found by findblk(). |
2364 | * | |
2365 | * Make sure the translation layer has been cleared. | |
984263bc | 2366 | */ |
54078292 MD |
2367 | bp->b_loffset = loffset; |
2368 | bp->b_bio2.bio_offset = NOOFFSET; | |
1f1ea522 | 2369 | /* bp->b_bio2.bio_next = NULL; */ |
984263bc MD |
2370 | |
2371 | bgetvp(vp, bp); | |
984263bc MD |
2372 | |
2373 | /* | |
4baec531 | 2374 | * All vnode-based buffers must be backed by a VM object. |
984263bc | 2375 | */ |
4baec531 MD |
2376 | KKASSERT(vp->v_object != NULL); |
2377 | bp->b_flags |= B_VMIO; | |
10f3fee5 | 2378 | KKASSERT(bp->b_cmd == BUF_CMD_DONE); |
984263bc MD |
2379 | |
2380 | allocbuf(bp, size); | |
2381 | ||
e43a034f | 2382 | crit_exit(); |
984263bc MD |
2383 | } |
2384 | return (bp); | |
2385 | } | |
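/*
 * Usage sketch (illustrative, details such as page busying and error
 * handling omitted): the usual synchronous read path built on the
 * B_CACHE contract documented above, essentially what a bread()-style
 * helper does.
 *
 *	bp = getblk(vp, loffset, bsize, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags &= ~(B_ERROR | B_INVAL);
 *		bp->b_cmd = BUF_CMD_READ;
 *		vn_strategy(vp, &bp->b_bio1);
 *		error = biowait(bp);
 *	}
 */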
2386 | ||
2387 | /* | |
3f779080 HP |
2388 | * geteblk: |
2389 | * | |
2390 | * Get an empty, disassociated buffer of given size. The buffer is | |
2391 | * initially set to B_INVAL. | |
06ecca5a | 2392 | * |
3f779080 HP |
2393 | * critical section protection is not required for the allocbuf() |
2394 | * call because races are impossible here. | |
984263bc MD |
2395 | */ |
2396 | struct buf * | |
2397 | geteblk(int size) | |
2398 | { | |
2399 | struct buf *bp; | |
984263bc MD |
2400 | int maxsize; |
2401 | ||
2402 | maxsize = (size + BKVAMASK) & ~BKVAMASK; | |
2403 | ||
e43a034f MD |
2404 | crit_enter(); |
2405 | while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) | |
2406 | ; | |
2407 | crit_exit(); | |
984263bc MD |
2408 | allocbuf(bp, size); |
2409 | bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ | |
2410 | return (bp); | |
2411 | } | |
2412 | ||
2413 | ||
2414 | /* | |
3f779080 | 2415 | * allocbuf: |
984263bc | 2416 | * |
3f779080 HP |
2417 | * This code constitutes the buffer memory from either anonymous system |
2418 | * memory (in the case of non-VMIO operations) or from an associated | |
2419 | * VM object (in the case of VMIO operations). This code is able to | |
2420 | * resize a buffer up or down. | |
984263bc | 2421 | * |
3f779080 HP |
2422 | * Note that this code is tricky, and has many complications to resolve |
2423 | * deadlock or inconsistent data situations. Tread lightly!!! |
2424 | * There are B_CACHE and B_DELWRI interactions that must be dealt with by | |
2425 | * the caller. Calling this code willy nilly can result in the loss of data. | |
06ecca5a | 2426 | * |
3f779080 HP |
2427 | * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with |
2428 | * B_CACHE for the non-VMIO case. | |
2429 | * | |
2430 | * This routine does not need to be called from a critical section but you | |
2431 | * must own the buffer. | |
984263bc | 2432 | */ |
984263bc MD |
2433 | int |
2434 | allocbuf(struct buf *bp, int size) | |
2435 | { | |
2436 | int newbsize, mbsize; | |
2437 | int i; | |
2438 | ||
2439 | if (BUF_REFCNT(bp) == 0) | |
2440 | panic("allocbuf: buffer not busy"); | |
2441 | ||
2442 | if (bp->b_kvasize < size) | |
2443 | panic("allocbuf: buffer too small"); | |
2444 | ||
2445 | if ((bp->b_flags & B_VMIO) == 0) { | |
2446 | caddr_t origbuf; | |
2447 | int origbufsize; | |
2448 | /* | |
2449 | * Just get anonymous memory from the kernel. Don't | |
2450 | * mess with B_CACHE. | |
2451 | */ | |
2452 | mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); | |
984263bc MD |
2453 | if (bp->b_flags & B_MALLOC) |
2454 | newbsize = mbsize; | |
2455 | else | |
984263bc MD |
2456 | newbsize = round_page(size); |
2457 | ||
2458 | if (newbsize < bp->b_bufsize) { | |
984263bc | 2459 | /* |
312dcd01 | 2460 | * Malloced buffers are not shrunk |
984263bc MD |
2461 | */ |
2462 | if (bp->b_flags & B_MALLOC) { | |
2463 | if (newbsize) { | |
2464 | bp->b_bcount = size; | |
2465 | } else { | |
efda3bd0 | 2466 | kfree(bp->b_data, M_BIOBUF); |
984263bc MD |
2467 | if (bp->b_bufsize) { |
2468 | bufmallocspace -= bp->b_bufsize; | |
2469 | bufspacewakeup(); | |
2470 | bp->b_bufsize = 0; | |
2471 | } | |
2472 | bp->b_data = bp->b_kvabase; | |
2473 | bp->b_bcount = 0; | |
2474 | bp->b_flags &= ~B_MALLOC; | |
2475 | } | |
2476 | return 1; | |
2477 | } | |
984263bc MD |
2478 | vm_hold_free_pages( |
2479 | bp, | |
2480 | (vm_offset_t) bp->b_data + newbsize, | |
2481 | (vm_offset_t) bp->b_data + bp->b_bufsize); | |
2482 | } else if (newbsize > bp->b_bufsize) { | |
984263bc MD |
2483 | /* |
2484 | * We only use malloced memory on the first allocation, |
2485 | * and revert to page-allocated memory when the buffer | |
2486 | * grows. | |
2487 | */ | |
4baec531 | 2488 | if ((bufmallocspace < maxbufmallocspace) && |
984263bc MD |
2489 | (bp->b_bufsize == 0) && |
2490 | (mbsize <= PAGE_SIZE/2)) { | |
2491 | ||
efda3bd0 | 2492 | bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK); |
984263bc MD |
2493 | bp->b_bufsize = mbsize; |
2494 | bp->b_bcount = size; | |
2495 | bp->b_flags |= B_MALLOC; | |
2496 | bufmallocspace += mbsize; | |
2497 | return 1; | |
2498 | } | |
984263bc MD |
2499 | origbuf = NULL; |
2500 | origbufsize = 0; | |
984263bc | 2501 | /* |
4baec531 MD |
2502 | * If the buffer is growing on its other-than-first |
2503 | * allocation, then we revert to the page-allocation | |
2504 | * scheme. | |
984263bc MD |
2505 | */ |
2506 | if (bp->b_flags & B_MALLOC) { | |
2507 | origbuf = bp->b_data; | |
2508 | origbufsize = bp->b_bufsize; | |
2509 | bp->b_data = bp->b_kvabase; | |
2510 | if (bp->b_bufsize) { | |
2511 | bufmallocspace -= bp->b_bufsize; | |
2512 | bufspacewakeup(); | |
2513 | bp->b_bufsize = 0; | |
2514 | } | |
2515 | bp->b_flags &= ~B_MALLOC; | |
2516 | newbsize = round_page(newbsize); | |
2517 | } | |
984263bc MD |
2518 | vm_hold_load_pages( |
2519 | bp, | |
2520 | (vm_offset_t) bp->b_data + bp->b_bufsize, | |
2521 | (vm_offset_t) bp->b_data + newbsize); | |
984263bc MD |
2522 | if (origbuf) { |
2523 | bcopy(origbuf, bp->b_data, origbufsize); | |
efda3bd0 | 2524 | kfree(origbuf, M_BIOBUF); |
984263bc | 2525 | } |
984263bc MD |
2526 | } |
2527 | } else { | |
2528 | vm_page_t m; | |
2529 | int desiredpages; | |
2530 | ||
2531 | newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); | |
4baec531 MD |
2532 | desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + |
2533 | newbsize + PAGE_MASK) >> PAGE_SHIFT; | |
2534 | KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); | |
984263bc | 2535 | |
984263bc MD |
2536 | if (bp->b_flags & B_MALLOC) |
2537 | panic("allocbuf: VMIO buffer can't be malloced"); | |
984263bc MD |
2538 | /* |
2539 | * Set B_CACHE initially if buffer is 0 length or will become | |
2540 | * 0-length. | |
2541 | */ | |
2542 | if (size == 0 || bp->b_bufsize == 0) | |
2543 | bp->b_flags |= B_CACHE; | |
2544 | ||
2545 | if (newbsize < bp->b_bufsize) { | |
2546 | /* | |
2547 | * DEV_BSIZE aligned new buffer size is less than the |
2548 | * DEV_BSIZE aligned existing buffer size. Figure out | |
2549 | * if we have to remove any pages. | |
2550 | */ | |
54f51aeb HP |
2551 | if (desiredpages < bp->b_xio.xio_npages) { |
2552 | for (i = desiredpages; i < bp->b_xio.xio_npages; i++) { | |
984263bc MD |
2553 | /* |
2554 | * the page is not freed here -- it | |
2555 | * is the responsibility of | |
2556 | * vnode_pager_setsize | |
2557 | */ | |
54f51aeb | 2558 | m = bp->b_xio.xio_pages[i]; |
984263bc MD |
2559 | KASSERT(m != bogus_page, |
2560 | ("allocbuf: bogus page found")); | |
2561 | while (vm_page_sleep_busy(m, TRUE, "biodep")) | |
2562 | ; | |
2563 | ||
54f51aeb | 2564 | bp->b_xio.xio_pages[i] = NULL; |
984263bc MD |
2565 | vm_page_unwire(m, 0); |
2566 | } | |
2567 | pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + | |
54f51aeb HP |
2568 | (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages)); |
2569 | bp->b_xio.xio_npages = desiredpages; | |
984263bc MD |
2570 | } |
2571 | } else if (size > bp->b_bcount) { | |
2572 | /* | |
2573 | * We are growing the buffer, possibly in a | |
2574 | * byte-granular fashion. | |
2575 | */ | |
2576 | struct vnode *vp; | |
2577 | vm_object_t obj; | |
2578 | vm_offset_t toff; | |
2579 | vm_offset_t tinc; | |
2580 | ||
2581 | /* | |
2582 | * Step 1, bring in the VM pages from the object, | |
2583 | * allocating them if necessary. We must clear | |
2584 | * B_CACHE if these pages are not valid for the | |
2585 | * range covered by the buffer. | |
06ecca5a | 2586 | * |
e43a034f MD |
2587 | * critical section protection is required to protect |
2588 | * against interrupts unbusying and freeing pages | |
2589 | * between our vm_page_lookup() and our | |
2590 | * busycheck/wiring call. | |
984263bc | 2591 | */ |
984263bc | 2592 | vp = bp->b_vp; |
7540ab49 | 2593 | obj = vp->v_object; |
984263bc | 2594 | |
654a39f0 | 2595 | crit_enter(); |
54f51aeb | 2596 | while (bp->b_xio.xio_npages < desiredpages) { |
984263bc MD |
2597 | vm_page_t m; |
2598 | vm_pindex_t pi; | |
2599 | ||
81b5c339 | 2600 | pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages; |
984263bc MD |
2601 | if ((m = vm_page_lookup(obj, pi)) == NULL) { |
2602 | /* | |
2603 | * note: must allocate system pages | |
2604 | * since blocking here could interfere |
2605 | * with paging I/O, no matter which | |
2606 | * process we are. | |
2607 | */ | |
dc1fd4b3 | 2608 | m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); |
984263bc | 2609 | if (m == NULL) { |
659c6a07 | 2610 | vm_wait(); |
54f51aeb HP |
2611 | vm_pageout_deficit += desiredpages - |
2612 | bp->b_xio.xio_npages; | |
984263bc MD |
2613 | } else { |
2614 | vm_page_wire(m); | |
2615 | vm_page_wakeup(m); | |
2616 | bp->b_flags &= ~B_CACHE; | |
54f51aeb HP |
2617 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
2618 | ++bp->b_xio.xio_npages; | |
984263bc MD |
2619 | } |
2620 | continue; | |
2621 | } | |
2622 | ||
2623 | /* | |
2624 | * We found a page. If we have to sleep on it, | |
2625 | * retry because it might have gotten freed out | |
2626 | * from under us. | |
2627 | * | |
2628 | * We can only test PG_BUSY here. Blocking on | |
2629 | * m->busy might lead to a deadlock: | |
2630 | * | |
2631 | * vm_fault->getpages->cluster_read->allocbuf | |
2632 | * | |
2633 | */ | |
2634 | ||
2635 | if (vm_page_sleep_busy(m, FALSE, "pgtblk")) | |
2636 | continue; | |
2637 | ||
2638 | /* | |
2639 | * We have a good page. Should we wakeup the | |
2640 | * page daemon? | |
2641 | */ | |
bc6dffab | 2642 | if ((curthread != pagethread) && |
984263bc | 2643 | ((m->queue - m->pc) == PQ_CACHE) && |
12e4aaff MD |
2644 | ((vmstats.v_free_count + vmstats.v_cache_count) < |
2645 | (vmstats.v_free_min + vmstats.v_cache_min))) { | |
984263bc MD |
2646 | pagedaemon_wakeup(); |
2647 | } | |
2648 | vm_page_flag_clear(m, PG_ZERO); | |
2649 | vm_page_wire(m); | |
54f51aeb HP |
2650 | bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; |
2651 | ++bp->b_xio.xio_npages; | |
984263bc | 2652 | } |
654a39f0 | 2653 | crit_exit(); |
984263bc MD |
2654 | |
2655 | /* | |
2656 | * Step 2. We've loaded the pages into the buffer, | |
2657 | * we have to figure out if we can still have B_CACHE | |
2658 | * set. Note that B_CACHE is set according to the | |
3f779080 | 2659 | * byte-granular range ( bcount and size ), not the |
984263bc MD |
2660 | * aligned range ( newbsize ). |
2661 | * | |
2662 | * The VM test is against m->valid, which is DEV_BSIZE | |
2663 | * aligned. Needless to say, the validity of the data | |
2664 | * needs to also be DEV_BSIZE aligned. Note that this | |
2665 | * fails with NFS if the server or some other client | |
2666 | * extends the file's EOF. If our buffer is resized, | |
2667 | * B_CACHE may remain set! XXX | |
2668 | */ | |
2669 | ||
2670 | toff = bp->b_bcount; | |
81b5c339 | 2671 | tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); |
984263bc MD |
2672 | |
2673 | while ((bp->b_flags & B_CACHE) && toff < size) { | |
2674 | vm_pindex_t pi; | |
2675 | ||
2676 | if (tinc > (size - toff)) | |
2677 | tinc = size - toff; | |
2678 | ||
81b5c339 | 2679 | pi = ((bp->b_loffset & PAGE_MASK) + toff) >> |
984263bc MD |
2680 | PAGE_SHIFT; |
2681 | ||
2682 | vfs_buf_test_cache( | |
2683 | bp, | |
81b5c339 | 2684 | bp->b_loffset, |
984263bc MD |
2685 | toff, |
2686 | tinc, | |
54f51aeb | 2687 | bp->b_xio.xio_pages[pi] |
984263bc MD |
2688 | ); |
2689 | toff += tinc; | |
2690 | tinc = PAGE_SIZE; | |
2691 | } | |
2692 | ||
2693 | /* | |
2694 | * Step 3, fixup the KVM pmap. Remember that | |
81b5c339 MD |
2695 | * bp->b_data is relative to bp->b_loffset, but |
2696 | * bp->b_loffset may be offset into the first page. | |
984263bc MD |
2697 | */ |
2698 | ||
2699 | bp->b_data = (caddr_t) | |
2700 | trunc_page((vm_offset_t)bp->b_data); | |
2701 | pmap_qenter( | |
2702 | (vm_offset_t)bp->b_data, | |
54f51aeb HP |
2703 | bp->b_xio.xio_pages, |
2704 | bp->b_xio.xio_npages | |
984263bc MD |
2705 | ); |
2706 | bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | | |
81b5c339 | 2707 | (vm_offset_t)(bp->b_loffset & PAGE_MASK)); |
984263bc MD |
2708 | } |
2709 | } | |
2710 | if (newbsize < bp->b_bufsize) | |
2711 | bufspacewakeup(); | |
2712 | bp->b_bufsize = newbsize; /* actual buffer allocation */ | |
2713 | bp->b_bcount = size; /* requested buffer size */ | |
2714 | return 1; | |
2715 | } | |
2716 | ||
2717 | /* | |
3f779080 | 2718 | * biowait: |
984263bc MD |
2719 | * |
2720 | * Wait for buffer I/O completion, returning error status. The buffer | |
10f3fee5 MD |
2721 | * is left locked on return. B_EINTR is converted into an EINTR error |
2722 | * and cleared. | |
2723 | * | |
2724 | * NOTE! The original b_cmd is lost on return, since b_cmd will be | |
2725 | * set to BUF_CMD_DONE. | |
984263bc MD |
2726 | */ |
2727 | int | |
c8e4131d | 2728 | biowait(struct buf *bp) |
984263bc | 2729 | { |
e43a034f | 2730 | crit_enter(); |
10f3fee5 MD |
2731 | while (bp->b_cmd != BUF_CMD_DONE) { |
2732 | if (bp->b_cmd == BUF_CMD_READ) | |
377d4740 | 2733 | tsleep(bp, 0, "biord", 0); |
984263bc | 2734 | else |
377d4740 | 2735 | tsleep(bp, 0, "biowr", 0); |
984263bc | 2736 | } |
e43a034f | 2737 | crit_exit(); |
984263bc MD |
2738 | if (bp->b_flags & B_EINTR) { |
2739 | bp->b_flags &= ~B_EINTR; | |
2740 | return (EINTR); | |
2741 | } | |
2742 | if (bp->b_flags & B_ERROR) { | |
2743 | return (bp->b_error ? bp->b_error : EIO); | |
2744 | } else { | |
2745 | return (0); | |
2746 | } | |
2747 | } | |
2748 | ||
81b5c339 MD |
2749 | /* |
2750 | * This associates a tracking count with an I/O. vn_strategy() and | |
2751 | * dev_dstrategy() do this automatically but there are a few cases | |
2752 | * where a vnode or device layer is bypassed when a block translation | |
2753 | * is cached. In such cases bio_start_transaction() may be called on | |
2754 | * the bypassed layers so the system gets an I/O in progress indication | |
2755 | * for those higher layers. | |
2756 | */ | |
2757 | void | |
2758 | bio_start_transaction(struct bio *bio, struct bio_track *track) | |
2759 | { | |
2760 | bio->bio_track = track; | |
2761 | atomic_add_int(&track->bk_active, 1); | |
2762 | } | |
2763 | ||
2764 | /* | |
2765 | * Initiate I/O on a vnode. | |
2766 | */ | |
2767 | void | |
2768 | vn_strategy(struct vnode *vp, struct bio *bio) | |
2769 | { | |
2770 | struct bio_track *track; | |
2771 | ||
10f3fee5 MD |
2772 | KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE); |
2773 | if (bio->bio_buf->b_cmd == BUF_CMD_READ) | |
81b5c339 MD |
2774 | track = &vp->v_track_read; |
2775 | else | |
2776 | track = &vp->v_track_write; | |
2777 | bio->bio_track = track; | |
2778 | atomic_add_int(&track->bk_active, 1); | |
2779 | vop_strategy(*vp->v_ops, vp, bio); | |
2780 | } | |
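/*
 * Usage sketch (illustrative; my_done is a hypothetical callback): an
 * asynchronous transfer may install a bio_done function instead of
 * sleeping in biowait().  biodone() below stops its chain walk at the
 * first bio with a done function, which then becomes responsible for
 * any further buffer management (e.g. brelse()).
 *
 *	bp->b_cmd = BUF_CMD_WRITE;
 *	bp->b_bio1.bio_done = my_done;
 *	vn_strategy(vp, &bp->b_bio1);
 */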
2781 | ||
2782 | ||
984263bc | 2783 | /* |
3f779080 | 2784 | * biodone: |
984263bc MD |
2785 | * |
2786 | * Finish I/O on a buffer, optionally calling a completion function. | |
2787 | * This is usually called from an interrupt so process blocking is | |
2788 | * not allowed. | |
2789 | * | |
2790 | * biodone is also responsible for setting B_CACHE in a B_VMIO bp. | |
2791 | * In a non-VMIO bp, B_CACHE will be set on the next getblk() | |
2792 | * assuming B_INVAL is clear. | |
2793 | * | |
2794 | * For the VMIO case, we set B_CACHE if the op was a read and no | |
2795 | * read error occurred, or if the op was a write. B_CACHE is never |
2796 | * set if the buffer is invalid or otherwise uncacheable. | |
2797 | * | |
2798 | * biodone does not mess with B_INVAL, allowing the I/O routine or the | |
2799 | * initiator to leave B_INVAL set to brelse the buffer out of existence |
2800 | * in the biodone routine. | |
2801 | */ | |
2802 | void | |
81b5c339 | 2803 | biodone(struct bio *bio) |
984263bc | 2804 | { |
81b5c339 | 2805 | struct buf *bp = bio->bio_buf; |
10f3fee5 | 2806 | buf_cmd_t cmd; |
984263bc | 2807 | |
e43a034f | 2808 | crit_enter(); |
984263bc | 2809 | |
81b5c339 MD |
2810 | KASSERT(BUF_REFCNTNB(bp) > 0, |
2811 | ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp))); | |
10f3fee5 MD |
2812 | KASSERT(bp->b_cmd != BUF_CMD_DONE, |
2813 | ("biodone: bp %p already done!", bp)); | |
984263bc | 2814 | |
984263bc MD |
2815 | runningbufwakeup(bp); |
2816 | ||
81b5c339 | 2817 | /* |
10f3fee5 | 2818 | * Run up the chain of BIO's. Leave b_cmd intact for the duration. |
81b5c339 MD |
2819 | */ |
2820 | while (bio) { | |
2821 | biodone_t *done_func; | |
2822 | struct bio_track *track; | |
984263bc | 2823 | |
81b5c339 MD |
2824 | /* |
2825 | * BIO tracking. Most but not all BIOs are tracked. | |
2826 | */ | |
2827 | if ((track = bio->bio_track) != NULL) { | |
2828 | atomic_subtract_int(&track->bk_active, 1); | |
2829 | if (track->bk_active < 0) { | |
2830 | panic("biodone: bad active count bio %p\n", | |
2831 | bio); | |
2832 | } | |
2833 | if (track->bk_waitflag) { | |
2834 | track->bk_waitflag = 0; | |
2835 | wakeup(track); | |
2836 | } | |
2837 | bio->bio_track = NULL; | |
2838 | } | |
2839 | ||
2840 | /* | |
2841 | * A bio_done function terminates the loop. The function | |
2842 | * will be responsible for any further chaining and/or | |
2843 | * buffer management. | |
10f3fee5 MD |
2844 | * |
2845 | * WARNING! The done function can deallocate the buffer! | |
81b5c339 MD |
2846 | */ |
2847 | if ((done_func = bio->bio_done) != NULL) { | |
2848 | bio->bio_done = NULL; | |
2849 | done_func(bio); | |
2850 | crit_exit(); | |
2851 | return; | |
2852 | } | |
2853 | bio = bio->bio_prev; | |
984263bc MD |
2854 | } |
2855 | ||
10f3fee5 MD |
2856 | cmd = bp->b_cmd; |
2857 | bp->b_cmd = BUF_CMD_DONE; | |
2858 | ||
81b5c339 | 2859 | /* |
10f3fee5 | 2860 | * Only reads and writes are processed past this point. |
81b5c339 | 2861 | */ |
10f3fee5 | 2862 | if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) { |
81b5c339 | 2863 | brelse(bp); |
e43a034f | 2864 | crit_exit(); |
984263bc MD |
2865 | return; |
2866 | } | |
81b5c339 | 2867 | |
69f8c926 MD |
2868 | /* |
2869 | * Warning: softupdates may re-dirty the buffer. | |
2870 | */ | |
984263bc MD |
2871 | if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) |
2872 | (*bioops.io_complete)(bp); | |
2873 | ||
2874 | if (bp->b_flags & B_VMIO) { | |
2875 | int i; | |
2876 | vm_ooffset_t foff; | |
2877 | vm_page_t m; | |
2878 | vm_object_t obj; | |
2879 | int iosize; | |
2880 | struct vnode *vp = bp->b_vp; | |
2881 | ||
7540ab49 | 2882 | obj = vp->v_object; |
984263bc MD |
2883 | |
2884 | #if defined(VFS_BIO_DEBUG) | |
3c37c940 | 2885 | if (vp->v_auxrefs == 0) |
8a6722ed | 2886 | panic("biodone: zero vnode hold count"); |
7540ab49 | 2887 | if ((vp->v_flag & VOBJBUF) == 0) |
984263bc | 2888 | panic("biodone: vnode is not setup for merged cache"); |
984263bc MD |
2889 | #endif |
2890 | ||
81b5c339 MD |
2891 | foff = bp->b_loffset; |
2892 | KASSERT(foff != NOOFFSET, ("biodone: no buffer offset")); | |
7540ab49 | 2893 | KASSERT(obj != NULL, ("biodone: missing VM object")); |
984263bc | 2894 | |
984263bc | 2895 | #if defined(VFS_BIO_DEBUG) |
54f51aeb | 2896 | if (obj->paging_in_progress < bp->b_xio.xio_npages) { |
6ea70f76 | 2897 | kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n", |
54f51aeb | 2898 | obj->paging_in_progress, bp->b_xio.xio_npages); |
984263bc MD |
2899 | } |
2900 | #endif | |
2901 | ||
2902 | /* | |
2903 | * Set B_CACHE if the op was a normal read and no error | |
2904 | * occured. B_CACHE is set for writes in the b*write() | |
2905 | * routines. | |
2906 | */ | |
2907 | iosize = bp->b_bcount - bp->b_resid; | |
10f3fee5 | 2908 | if (cmd == BUF_CMD_READ && (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) { |
984263bc MD |
2909 | bp->b_flags |= B_CACHE; |
2910 | } | |
2911 | ||
54f51aeb | 2912 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
984263bc MD |
2913 | int bogusflag = 0; |
2914 | int resid; | |
2915 | ||
2916 | resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; | |
2917 | if (resid > iosize) | |
2918 | resid = iosize; | |
2919 | ||
2920 | /* | |
06ecca5a MD |
2921 | * Clean up bogus pages, restoring the originals. Since |
2922 | * the originals should still be wired, we don't have | |
2923 | * to worry about interrupt/freeing races destroying | |
2924 | * the VM object association. | |
984263bc | 2925 | */ |
54f51aeb | 2926 | m = bp->b_xio.xio_pages[i]; |
984263bc MD |
2927 | if (m == bogus_page) { |
2928 | bogusflag = 1; | |
2929 | m = vm_page_lookup(obj, OFF_TO_IDX(foff)); | |
2930 | if (m == NULL) | |
2931 | panic("biodone: page disappeared"); | |
54f51aeb HP |
2932 | bp->b_xio.xio_pages[i] = m; |
2933 | pmap_qenter(trunc_page((vm_offset_t)bp->b_data), | |
2934 | bp->b_xio.xio_pages, bp->b_xio.xio_npages); | |
984263bc MD |
2935 | } |
2936 | #if defined(VFS_BIO_DEBUG) | |
2937 | if (OFF_TO_IDX(foff) != m->pindex) { | |
6ea70f76 | 2938 | kprintf( |
984263bc MD |
2939 | "biodone: foff(%lu)/m->pindex(%d) mismatch\n", |
2940 | (unsigned long)foff, m->pindex); | |
2941 | } | |
2942 | #endif | |
2943 | ||
2944 | /* | |
2945 | * In the write case, the valid and clean bits are | |
2946 | * already changed correctly ( see bdwrite() ), so we | |
2947 | * only need to do this here in the read case. | |
2948 | */ | |
10f3fee5 | 2949 | if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) { |
984263bc MD |
2950 | vfs_page_set_valid(bp, foff, i, m); |
2951 | } | |
2952 | vm_page_flag_clear(m, PG_ZERO); | |
2953 | ||
2954 | /* | |
2955 | * When debugging new filesystems or buffer I/O methods, this | |
2956 | * is the most common error that pops up. If you see this, you | |
2957 | * have not set the page busy flag correctly. | |
2958 | */ | |
2959 | if (m->busy == 0) { | |
6ea70f76 | 2960 | kprintf("biodone: page busy < 0, " |
984263bc MD |
2961 | "pindex: %d, foff: 0x(%x,%x), " |
2962 | "resid: %d, index: %d\n", | |
2963 | (int) m->pindex, (int)(foff >> 32), | |
2964 | (int) foff & 0xffffffff, resid, i); | |
2965 | if (!vn_isdisk(vp, NULL)) | |
6ea70f76 | 2966 | kprintf(" iosize: %ld, loffset: %lld, flags: 0x%08x, npages: %d\n", |
984263bc | 2967 | bp->b_vp->v_mount->mnt_stat.f_iosize, |
54078292 | 2968 | bp->b_loffset, |
54f51aeb | 2969 | bp->b_flags, bp->b_xio.xio_npages); |
984263bc | 2970 | else |
6ea70f76 | 2971 | kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n", |
54078292 | 2972 | bp->b_loffset, |
54f51aeb | 2973 | bp->b_flags, bp->b_xio.xio_npages); |
6ea70f76 | 2974 | kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", |
984263bc | 2975 | m->valid, m->dirty, m->wire_count); |
fc92d4aa | 2976 | panic("biodone: page busy < 0"); |
984263bc MD |
2977 | } |
2978 | vm_page_io_finish(m); | |
2979 | vm_object_pip_subtract(obj, 1); | |
2980 | foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; | |
2981 | iosize -= resid; | |
2982 | } | |
2983 | if (obj) | |
2984 | vm_object_pip_wakeupn(obj, 0); | |
2985 | } | |
2986 | ||
2987 | /* | |
2988 | * For asynchronous completions, release the buffer now. The brelse | |
2989 | * will do a wakeup there if necessary - so no need to do a wakeup | |
2990 | * here in the async case. The sync case always needs to do a wakeup. | |
2991 | */ | |
2992 | ||
2993 | if (bp->b_flags & B_ASYNC) { | |
2994 | if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) | |
2995 | brelse(bp); | |
2996 | else | |
2997 | bqrelse(bp); | |
2998 | } else { | |
2999 | wakeup(bp); | |
3000 | } | |
e43a034f | 3001 | crit_exit(); |
984263bc MD |
3002 | } |
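
/*
 * Illustrative sketch (hypothetical, not from the original sources):
 * one way a consumer can hook into the chained completion logic
 * above.  A private done function is installed on its bio before the
 * I/O is issued; when the device layer calls biodone(), the loop
 * above stops at that bio and hands responsibility for the rest of
 * the chain (and ultimately the buffer) to the callback.  The names
 * my_read_done/my_issue_read are made up for this example; only
 * biodone(), vn_strategy() and the bio fields used above are real.
 */
#if 0
static void
my_read_done(struct bio *bio)
{
        /* layer-private bookkeeping would go here */

        /*
         * Continue completion processing on the parent bio.  After
         * this call the buffer may already have been released.
         */
        biodone(bio->bio_prev);
}

static void
my_issue_read(struct vnode *vp, struct bio *nbio)
{
        /* nbio is assumed to already be pushed onto the buffer's chain */
        nbio->bio_done = my_read_done;
        vn_strategy(vp, nbio);
}
#endif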
3003 | ||
3004 | /* | |
3f779080 HP |
3005 | * vfs_unbusy_pages: |
3006 | * | |
3007 | * This routine is called in lieu of biodone() in the case of | |
3008 | * incomplete I/O. This keeps the busy status for pages | |
3009 | * consistent. | |
984263bc MD |
3010 | */ |
3011 | void | |
493c516a | 3012 | vfs_unbusy_pages(struct buf *bp) |
984263bc MD |
3013 | { |
3014 | int i; | |
3015 | ||
3016 | runningbufwakeup(bp); | |
3017 | if (bp->b_flags & B_VMIO) { | |
3018 | struct vnode *vp = bp->b_vp; | |
3019 | vm_object_t obj; | |
3020 | ||
7540ab49 | 3021 | obj = vp->v_object; |
984263bc | 3022 | |
54f51aeb HP |
3023 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
3024 | vm_page_t m = bp->b_xio.xio_pages[i]; | |
984263bc | 3025 | |
06ecca5a MD |
3026 | /* |
3027 | * When restoring bogus changes, the original pages |
3028 | * should still be wired, so we are in no danger of |
3029 | * losing the object association and do not particularly |
e43a034f | 3030 | * need critical section protection. |
06ecca5a | 3031 | */ |
984263bc | 3032 | if (m == bogus_page) { |
81b5c339 | 3033 | m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i); |
984263bc | 3034 | if (!m) { |
fc92d4aa | 3035 | panic("vfs_unbusy_pages: page missing"); |
984263bc | 3036 | } |
54f51aeb HP |
3037 | bp->b_xio.xio_pages[i] = m; |
3038 | pmap_qenter(trunc_page((vm_offset_t)bp->b_data), | |
3039 | bp->b_xio.xio_pages, bp->b_xio.xio_npages); | |
984263bc MD |
3040 | } |
3041 | vm_object_pip_subtract(obj, 1); | |
3042 | vm_page_flag_clear(m, PG_ZERO); | |
3043 | vm_page_io_finish(m); | |
3044 | } | |
3045 | vm_object_pip_wakeupn(obj, 0); | |
3046 | } | |
3047 | } | |
3048 | ||
3049 | /* | |
3050 | * vfs_page_set_valid: | |
3051 | * | |
3052 | * Set the valid bits in a page based on the supplied offset. The | |
3053 | * range is restricted to the buffer's size. | |
3054 | * | |
3055 | * This routine is typically called after a read completes. | |
3056 | */ | |
3057 | static void | |
3058 | vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) | |
3059 | { | |
3060 | vm_ooffset_t soff, eoff; | |
3061 | ||
3062 | /* | |
3063 | * Start and end offsets in buffer. eoff - soff may not cross a | |
3064 | * page boundary or cross the end of the buffer. The end of the | |
3065 | * buffer, in this case, is our file EOF, not the allocation size | |
3066 | * of the buffer. | |
3067 | */ | |
3068 | soff = off; | |
3069 | eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; | |
81b5c339 MD |
3070 | if (eoff > bp->b_loffset + bp->b_bcount) |
3071 | eoff = bp->b_loffset + bp->b_bcount; | |
984263bc MD |
3072 | |
3073 | /* | |
3074 | * Set valid range. This is typically the entire buffer and thus the | |
3075 | * entire page. | |
3076 | */ | |
3077 | if (eoff > soff) { | |
3078 | vm_page_set_validclean( | |
3079 | m, | |
3080 | (vm_offset_t) (soff & PAGE_MASK), | |
3081 | (vm_offset_t) (eoff - soff) | |
3082 | ); | |
3083 | } | |
3084 | } | |
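
/*
 * Worked example (illustration only), assuming PAGE_SIZE is 4096:
 * for a buffer with b_loffset = 0x3800 and b_bcount = 0x400, the
 * call for its first page gets off = 0x3800, so soff = 0x3800 and
 * eoff = min((0x3800 + 4096) & ~4095, 0x3800 + 0x400) = 0x3c00.
 * The page is therefore marked valid/clean only over bytes
 * [0x800, 0xc00) within the page, i.e. just the span actually
 * covered by the buffer.
 */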
3085 | ||
3086 | /* | |
3f779080 HP |
3087 | * vfs_busy_pages: |
3088 | * | |
3089 | * This routine is called before a device strategy routine. | |
3090 | * It is used to tell the VM system that paging I/O is in | |
3091 | * progress, and treat the pages associated with the buffer | |
3092 | * almost as being PG_BUSY. Also the object's 'paging_in_progress' | |
3093 | * count is maintained to make sure that the object doesn't become | |
3094 | * inconsistent. | |
3095 | * | |
3096 | * Since I/O has not been initiated yet, certain buffer flags | |
3097 | * such as B_ERROR or B_INVAL may be in an inconsistent state | |
3098 | * and should be ignored. | |
984263bc MD |
3099 | */ |
3100 | void | |
10f3fee5 | 3101 | vfs_busy_pages(struct vnode *vp, struct buf *bp) |
984263bc MD |
3102 | { |
3103 | int i, bogus; | |
fde7ac71 | 3104 | struct lwp *lp = curthread->td_lwp; |
984263bc | 3105 | |
a8f169e2 | 3106 | /* |
10f3fee5 MD |
3107 | * The buffer's I/O command must already be set. If reading, |
3108 | * B_CACHE must be 0 (double check against callers only doing | |
3109 | * I/O when B_CACHE is 0). | |
a8f169e2 | 3110 | */ |
10f3fee5 MD |
3111 | KKASSERT(bp->b_cmd != BUF_CMD_DONE); |
3112 | KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0); | |
a8f169e2 | 3113 | |
984263bc | 3114 | if (bp->b_flags & B_VMIO) { |
984263bc MD |
3115 | vm_object_t obj; |
3116 | vm_ooffset_t foff; | |
3117 | ||
7540ab49 | 3118 | obj = vp->v_object; |
81b5c339 MD |
3119 | foff = bp->b_loffset; |
3120 | KASSERT(bp->b_loffset != NOOFFSET, | |
3121 | ("vfs_busy_pages: no buffer offset")); | |
984263bc MD |
3122 | vfs_setdirty(bp); |
3123 | ||
3124 | retry: | |
54f51aeb HP |
3125 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
3126 | vm_page_t m = bp->b_xio.xio_pages[i]; | |
984263bc MD |
3127 | if (vm_page_sleep_busy(m, FALSE, "vbpage")) |
3128 | goto retry; | |
3129 | } | |
3130 | ||
3131 | bogus = 0; | |
54f51aeb HP |
3132 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
3133 | vm_page_t m = bp->b_xio.xio_pages[i]; | |
984263bc MD |
3134 | |
3135 | vm_page_flag_clear(m, PG_ZERO); | |
3136 | if ((bp->b_flags & B_CLUSTER) == 0) { | |
3137 | vm_object_pip_add(obj, 1); | |
3138 | vm_page_io_start(m); | |
3139 | } | |
3140 | ||
3141 | /* | |
10f3fee5 MD |
3142 | * When readying a vnode-backed buffer for a write |
3143 | * we must zero-fill any invalid portions of the | |
3144 | * backing VM pages. | |
3145 | * | |
3146 | * When readying a vnode-backed buffer for a read | |
3147 | * we must replace any dirty pages with a bogus | |
3148 | * page so we do not destroy dirty data when | |
3149 | * filling in gaps. Dirty pages might not | |
3150 | * necessarily be marked dirty yet, so use m->valid | |
3151 | * as a reasonable test. | |
3152 | * | |
3153 | * Bogus page replacement is, uh, bogus. We need | |
3154 | * to find a better way. | |
984263bc | 3155 | */ |
984263bc | 3156 | vm_page_protect(m, VM_PROT_NONE); |
10f3fee5 | 3157 | if (bp->b_cmd == BUF_CMD_WRITE) { |
984263bc | 3158 | vfs_page_set_valid(bp, foff, i, m); |
a8f169e2 | 3159 | } else if (m->valid == VM_PAGE_BITS_ALL) { |
54f51aeb | 3160 | bp->b_xio.xio_pages[i] = bogus_page; |
984263bc MD |
3161 | bogus++; |
3162 | } | |
3163 | foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; | |
3164 | } | |
3165 | if (bogus) | |
54f51aeb HP |
3166 | pmap_qenter(trunc_page((vm_offset_t)bp->b_data), |
3167 | bp->b_xio.xio_pages, bp->b_xio.xio_npages); | |
984263bc | 3168 | } |
05edc21a MD |
3169 | |
3170 | /* | |
3171 | * This is the easiest place to put the process accounting for the I/O | |
3172 | * for now. | |
3173 | */ | |
fde7ac71 | 3174 | if (lp != NULL) { |
10f3fee5 | 3175 | if (bp->b_cmd == BUF_CMD_READ) |
fde7ac71 | 3176 | lp->lwp_ru.ru_inblock++; |
081e0330 | 3177 | else |
fde7ac71 | 3178 | lp->lwp_ru.ru_oublock++; |
05edc21a | 3179 | } |
984263bc MD |
3180 | } |
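
/*
 * Illustrative sketch (hypothetical): the usual issue path that pairs
 * with the routine above.  vfs_busy_pages() is called after b_cmd is
 * set and before the buffer is handed to the device; if the transfer
 * later has to be abandoned without going through biodone(),
 * vfs_unbusy_pages() is the routine that restores the page state
 * instead.  The wrapper name my_issue_write is made up.
 */
#if 0
static void
my_issue_write(struct vnode *vp, struct buf *bp)
{
        bp->b_cmd = BUF_CMD_WRITE;
        vfs_busy_pages(vp, bp);
        vn_strategy(vp, &bp->b_bio1);
}
#endif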
3181 | ||
3182 | /* | |
3f779080 HP |
3183 | * vfs_clean_pages: |
3184 | * | |
3185 | * Tell the VM system that the pages associated with this buffer | |
3186 | * are clean. This is used for delayed writes where the data is | |
3187 | * going to go to disk eventually without additional VM intervention. | |
984263bc | 3188 | * |
3f779080 HP |
3189 | * Note that while we only really need to clean through to b_bcount, we |
3190 | * just go ahead and clean through to b_bufsize. | |
984263bc MD |
3191 | */ |
3192 | static void | |
493c516a | 3193 | vfs_clean_pages(struct buf *bp) |
984263bc MD |
3194 | { |
3195 | int i; | |
3196 | ||
3197 | if (bp->b_flags & B_VMIO) { | |
3198 | vm_ooffset_t foff; | |
3199 | ||
81b5c339 MD |
3200 | foff = bp->b_loffset; |
3201 | KASSERT(foff != NOOFFSET, ("vfs_clean_pages: no buffer offset")); | |
54f51aeb HP |
3202 | for (i = 0; i < bp->b_xio.xio_npages; i++) { |
3203 | vm_page_t m = bp->b_xio.xio_pages[i]; | |
984263bc MD |
3204 | vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; |
3205 | vm_ooffset_t eoff = noff; | |
3206 | ||
81b5c339 MD |
3207 | if (eoff > bp->b_loffset + bp->b_bufsize) |
3208 | eoff = bp->b_loffset + bp->b_bufsize; | |
984263bc MD |
3209 | vfs_page_set_valid(bp, foff, i, m); |
3210 | /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ | |
3211 | foff = noff; | |
3212 | } | |
3213 | } | |
3214 | } | |
3215 | ||
3216 | /* | |
3f779080 | 3217 | * vfs_bio_set_validclean: |
984263bc MD |
3218 | * |
3219 | * Set the range within the buffer to valid and clean. The range is | |
81b5c339 MD |
3220 | * relative to the beginning of the buffer, b_loffset. Note that |
3221 | * b_loffset itself may be offset from the beginning of the first page. | |
984263bc MD |
3222 | */ |
3223 | ||
3224 | void | |
3225 | vfs_bio_set_validclean(struct buf *bp, int base, int size) | |
3226 | { | |
3227 | if (bp->b_flags & B_VMIO) { | |
3228 | int i; | |
3229 | int n; | |
3230 | ||
3231 | /* | |
3232 | * Fixup base to be relative to beginning of first page. | |
3233 | * Set initial n to be the maximum number of bytes in the | |
3234 | * first page that can be validated. | |
3235 | */ | |
3236 | ||
81b5c339 | 3237 | base += (bp->b_loffset & PAGE_MASK); |
984263bc MD |
3238 | n = PAGE_SIZE - (base & PAGE_MASK); |
3239 | ||
54f51aeb HP |
3240 | for (i = base / PAGE_SIZE; size > 0 && i < bp->b_xio.xio_npages; ++i) { |
3241 | vm_page_t m = bp->b_xio.xio_pages[i]; | |
984263bc MD |
3242 | |
3243 | if (n > size) | |
3244 | n = size; | |
3245 | ||
3246 | vm_page_set_validclean(m, base & PAGE_MASK, n); | |
3247 | base += n; | |
3248 | size -= n; | |
3249 | n = PAGE_SIZE; | |
3250 | } | |
3251 | } | |
3252 | } | |
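
/*
 * Worked example (illustration only), assuming PAGE_SIZE is 4096:
 * with b_loffset = 0x1200, base = 0x100 and size = 0x2000, base is
 * first adjusted to 0x300 (0x100 + (0x1200 & PAGE_MASK)) and the
 * first-page span becomes n = 4096 - 0x300 = 0xd00.  The loop then
 * marks 0xd00 bytes valid/clean at offset 0x300 of page 0, a full
 * 0x1000 bytes of page 1, and the remaining 0x300 bytes at the
 * start of page 2.
 */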
3253 | ||
3254 | /* | |
3f779080 | 3255 | * vfs_bio_clrbuf: |
984263bc | 3256 | * |
3f779080 | 3257 | * Clear a buffer. This routine essentially fakes an I/O, so we need |
984263bc MD |
3258 | * to clear B_ERROR and B_INVAL. |
3259 | * | |
3260 | * Note that while we only theoretically need to clear through b_bcount, | |
3261 | * we go ahead and clear through b_bufsize. | |
3262 | */ | |
3263 | ||
3264 | void | |
3265 | vfs_bio_clrbuf(struct buf *bp) | |
3266 | { | |
3267 | int i, mask = 0; | |
3268 | caddr_t sa, ea; | |
3269 | if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { | |
3270 | bp->b_flags &= ~(B_INVAL|B_ERROR); | |
54f51aeb | 3271 | if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && |
81b5c339 | 3272 | (bp->b_loffset & PAGE_MASK) == 0) { |
984263bc | 3273 | mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; |
54f51aeb | 3274 | if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) { |
984263bc MD |
3275 | bp->b_resid = 0; |
3276 | return; | |
3277 | } | |
54f51aeb HP |
3278 | if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) && |
3279 | ((bp->b_xio.xio_pages[0]->valid & mask) == 0)) { | |
984263bc | 3280 | bzero(bp->b_data, bp->b_bufsize); |
54f51aeb | 3281 | bp->b_xio.xio_pages[0]->valid |= mask; |
984263bc MD |
3282 | bp->b_resid = 0; |
3283 | return; | |
3284 | } | |
3285 | } | |
3286 | ea = sa = bp->b_data; | |
54f51aeb | 3287 | for (i = 0; i < bp->b_xio.xio_npages; i++, sa = ea) {
984263bc MD |
3288 | int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; |
3289 | ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); | |
3290 | ea = (caddr_t)(vm_offset_t)ulmin( | |
3291 | (u_long)(vm_offset_t)ea, | |
3292 | (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); | |
3293 | mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; | |
54f51aeb | 3294 | if ((bp->b_xio.xio_pages[i]->valid & mask) == mask) |
984263bc | 3295 | continue; |
54f51aeb HP |
3296 | if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) { |
3297 | if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) { | |
984263bc MD |
3298 | bzero(sa, ea - sa); |
3299 | } | |
3300 | } else { | |
3301 | for (; sa < ea; sa += DEV_BSIZE, j++) { | |
54f51aeb HP |
3302 | if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) && |
3303 | (bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0) | |
984263bc MD |
3304 | bzero(sa, DEV_BSIZE); |
3305 | } | |
3306 | } | |
54f51aeb HP |
3307 | bp->b_xio.xio_pages[i]->valid |= mask; |
3308 | vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO); | |
984263bc MD |
3309 | } |
3310 | bp->b_resid = 0; | |
3311 | } else { | |
3312 | clrbuf(bp); | |
3313 | } | |
3314 | } | |
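
/*
 * Worked example (illustration only), assuming PAGE_SIZE is 4096 and
 * DEV_BSIZE is 512: for a single-page VMIO buffer with b_bufsize =
 * 2048 and a page-aligned b_loffset, the fast path above builds
 * mask = (1 << (2048 / 512)) - 1 = 0x0f, covering the four DEV_BSIZE
 * chunks the buffer spans.  If the page's valid bits already include
 * 0x0f nothing is zeroed; otherwise only the chunks whose valid bits
 * are clear get bzero()'d before being marked valid.
 */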
3315 | ||
3316 | /* | |
3f779080 HP |
3317 | * vm_hold_load_pages: |
3318 | * | |
3319 | * Load pages into the buffer's address space. The pages are | |
3320 | * allocated from the kernel object in order to reduce interference | |
3321 | * with any VM paging I/O activity. The range of loaded | |
3322 | * pages will be wired. | |
3323 | * | |
3324 | * If a page cannot be allocated, the pagedaemon is woken up and | |
3325 | * the allocation is retried until the full (to - from) range is obtained. | |
3326 | * | |
984263bc MD |
3327 | */ |
3328 | void | |
493c516a | 3329 | vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) |
984263bc MD |
3330 | { |
3331 | vm_offset_t pg; | |
3332 | vm_page_t p; | |
3333 | int index; | |
3334 | ||
3335 | to = round_page(to); | |
3336 | from = round_page(from); | |
3337 | index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; | |
3338 | ||
3339 | for (pg = from; pg < to; pg += PAGE_SIZE, index++) { | |
3340 | ||
3341 | tryagain: | |
3342 | ||
3343 | /* | |
3f779080 | 3344 | * Note: must allocate system pages since blocking here |
984263bc MD |
3345 | * could interfere with paging I/O, no matter which | |
3346 | * process we are. | |
3347 | */ | |
c439ad8f | 3348 | p = vm_page_alloc(&kernel_object, |
e4846942 MD |
3349 | (pg >> PAGE_SHIFT), |
3350 | VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); | |
984263bc MD |
3351 | if (!p) { |
3352 | vm_pageout_deficit += (to - from) >> PAGE_SHIFT; | |
659c6a07 | 3353 | vm_wait(); |
984263bc MD |
3354 | goto tryagain; |
3355 | } | |
3356 | vm_page_wire(p); | |
3357 | p->valid = VM_PAGE_BITS_ALL; | |
3358 | vm_page_flag_clear(p, PG_ZERO); | |
3359 | pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); | |
54f51aeb | 3360 | bp->b_xio.xio_pages[index] = p; |
984263bc MD |
3361 | vm_page_wakeup(p); |
3362 | } | |
54f51aeb | 3363 | bp->b_xio.xio_npages = index; |
984263bc MD |
3364 | } |
3365 | ||
3f779080 HP |
3366 | /* |
3367 | * vm_hold_free_pages: | |
3368 | * | |
3369 | * Return pages associated with the buffer back to the VM system. | |
3370 | * | |
3371 | * The range of pages underlying the buffer's address space will | |
3372 | * be unmapped and un-wired. | |
3373 | */ | |
984263bc | 3374 | void |
493c516a | 3375 | vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) |
984263bc MD |
3376 | { |
3377 | vm_offset_t pg; | |
3378 | vm_page_t p; | |
3379 | int index, newnpages; | |
3380 | ||
3381 | from = round_page(from); | |
3382 | to = round_page(to); | |
3383 | newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; | |
3384 | ||
3385 | for (pg = from; pg < to; pg += PAGE_SIZE, index++) { | |
54f51aeb HP |
3386 | p = bp->b_xio.xio_pages[index]; |
3387 | if (p && (index < bp->b_xio.xio_npages)) { | |
984263bc | 3388 | if (p->busy) { |
6ea70f76 | 3389 | kprintf("vm_hold_free_pages: doffset: %lld, loffset: %lld\n", |
54078292 | 3390 | bp->b_bio2.bio_offset, bp->b_loffset); |
984263bc | 3391 | } |
54f51aeb | 3392 | bp->b_xio.xio_pages[index] = NULL; |
984263bc MD |
3393 | pmap_kremove(pg); |
3394 | vm_page_busy(p); | |
3395 | vm_page_unwire(p, 0); | |
3396 | vm_page_free(p); | |
3397 | } | |
3398 | } | |
54f51aeb | 3399 | bp->b_xio.xio_npages = newnpages; |
984263bc MD |
3400 | } |
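
/*
 * Illustrative sketch (hypothetical): how the two routines above pair
 * up when growing and then shrinking the data area of a non-VMIO
 * buffer.  The range is expressed relative to b_data, mirroring the
 * existing callers in this file, and b_bufsize is assumed to be page
 * aligned here; the wrapper name my_resize_by_one_page is made up.
 */
#if 0
static void
my_resize_by_one_page(struct buf *bp)
{
        vm_offset_t base = (vm_offset_t)bp->b_data + bp->b_bufsize;

        /* grow: allocate, wire and map one more page after b_bufsize */
        vm_hold_load_pages(bp, base, base + PAGE_SIZE);

        /* ... use the enlarged mapping ... */

        /* shrink: unmap, unwire and free that page again */
        vm_hold_free_pages(bp, base, base + PAGE_SIZE);
}
#endif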
3401 | ||
3402 | /* | |
3f779080 | 3403 | * vmapbuf: |
984263bc | 3404 | * |
3591bbc6 MD |
3405 | * Map a user buffer into KVM via a pbuf. On return the buffer's |
3406 | * b_data, b_bufsize, and b_bcount will be set, and its XIO page array | |
3407 | * initialized. | |
984263bc MD |
3408 | */ |
3409 | int | |
3591bbc6 | 3410 | vmapbuf(struct buf *bp, caddr_t udata, int bytes) |
984263bc | 3411 | { |
3591bbc6 | 3412 | caddr_t addr; |
0a3895d4 MD |
3413 | vm_offset_t va; |
3414 | vm_page_t m; | |
3415 | int vmprot; | |
3416 | int error; | |
984263bc MD |
3417 | int pidx; |
3418 | int i; | |
984263bc | 3419 | |
10f3fee5 | 3420 | /* |
3591bbc6 | 3421 | * bp had better have a command and it better be a pbuf. |
10f3fee5 MD |
3422 | */ |
3423 | KKASSERT(bp->b_cmd != BUF_CMD_DONE); | |
3591bbc6 | 3424 | KKASSERT(bp->b_flags & B_PAGING); |
10f3fee5 | 3425 | |
3591bbc6 | 3426 | if (bytes < 0) |
984263bc | 3427 | return (-1); |
3591bbc6 MD |
3428 | |
3429 | /* | |
3430 | * Map the user data into KVM. Mappings have to be page-aligned. | |
3431 | */ | |
3432 | addr = (caddr_t)trunc_page((vm_offset_t)udata); | |
3433 | pidx = 0; | |
3434 | ||
3435 | vmprot = VM_PROT_READ; | |
3436 | if (bp->b_cmd == BUF_CMD_READ) | |
3437 | vmprot |= VM_PROT_WRITE; | |
3438 | ||
3439 | while (addr < udata + bytes) { | |
984263bc MD |
3440 | /* |
3441 | * Do the vm_fault if needed; do the copy-on-write thing | |
3442 | * when reading stuff off device into memory. | |
0a3895d4 MD |
3443 | * |
3444 | * vm_fault_page*() returns a held VM page. | |
984263bc | 3445 | */ |
0a3895d4 MD |
3446 | va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata; |
3447 | va = trunc_page(va); | |
3448 | ||
3449 | m = vm_fault_page_quick(va, vmprot, &error); | |
3450 | if (m == NULL) { | |
984263bc | 3451 | for (i = 0; i < pidx; ++i) { |
54f51aeb HP |
3452 | vm_page_unhold(bp->b_xio.xio_pages[i]); |
3453 | bp->b_xio.xio_pages[i] = NULL; | |
984263bc MD |
3454 | } |
3455 | return(-1); | |
3456 | } | |
54f51aeb | 3457 | bp->b_xio.xio_pages[pidx] = m; |
3591bbc6 MD |
3458 | addr += PAGE_SIZE; |
3459 | ++pidx; | |
984263bc | 3460 | } |
3591bbc6 MD |
3461 | |
3462 | /* | |
3463 | * Map the page array and set the buffer fields to point to | |
3464 | * the mapped data buffer. | |
3465 | */ | |
984263bc MD |
3466 | if (pidx > btoc(MAXPHYS)) |
3467 | panic("vmapbuf: mapped more than MAXPHYS"); | |
3591bbc6 MD |
3468 | pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx); |
3469 | ||
54f51aeb | 3470 | bp->b_xio.xio_npages = pidx; |
3591bbc6 MD |
3471 | bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK); |
3472 | bp->b_bcount = bytes; | |
3473 | bp->b_bufsize = bytes; | |
984263bc MD |
3474 | return(0); |
3475 | } | |
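
/*
 * Illustrative sketch (hypothetical): a minimal physio-style sequence
 * built around vmapbuf().  bp is assumed to be a pbuf (B_PAGING)
 * obtained elsewhere, udata/bytes describe the user buffer, and both
 * waiting for completion and releasing the pbuf are omitted.  The
 * wrapper name my_user_read is made up.
 */
#if 0
static int
my_user_read(struct vnode *vp, struct buf *bp, caddr_t udata, int bytes)
{
        bp->b_cmd = BUF_CMD_READ;       /* must be set before vmapbuf() */
        if (vmapbuf(bp, udata, bytes) < 0)
                return (EFAULT);
        vn_strategy(vp, &bp->b_bio1);
        /* ... wait for biodone() on the bio and check B_ERROR ... */
        vunmapbuf(bp);                  /* tear the KVM mapping back down */
        return (0);
}
#endif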
3476 | ||
3477 | /* | |
3f779080 HP |
3478 | * vunmapbuf: |
3479 | * | |
3480 | * Free the I/O map PTEs associated with this I/O operation. | |
3481 | * We also invalidate the TLB entries and restore the original b_addr. | |
984263bc MD |
3482 | */ |
3483 | void | |