| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1990 University of Utah. | |
| 3 | * Copyright (c) 1991 The Regents of the University of California. | |
| 4 | * All rights reserved. | |
| 5 | * Copyright (c) 1993, 1994 John S. Dyson | |
| 6 | * Copyright (c) 1995, David Greenman | |
| 7 | * | |
| 8 | * This code is derived from software contributed to Berkeley by | |
| 9 | * the Systems Programming Group of the University of Utah Computer | |
| 10 | * Science Department. | |
| 11 | * | |
| 12 | * Redistribution and use in source and binary forms, with or without | |
| 13 | * modification, are permitted provided that the following conditions | |
| 14 | * are met: | |
| 15 | * 1. Redistributions of source code must retain the above copyright | |
| 16 | * notice, this list of conditions and the following disclaimer. | |
| 17 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 18 | * notice, this list of conditions and the following disclaimer in the | |
| 19 | * documentation and/or other materials provided with the distribution. | |
| 20 | * 3. All advertising materials mentioning features or use of this software | |
| 21 | * must display the following acknowledgement: | |
| 22 | * This product includes software developed by the University of | |
| 23 | * California, Berkeley and its contributors. | |
| 24 | * 4. Neither the name of the University nor the names of its contributors | |
| 25 | * may be used to endorse or promote products derived from this software | |
| 26 | * without specific prior written permission. | |
| 27 | * | |
| 28 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 38 | * SUCH DAMAGE. | |
| 39 | * | |
| 40 | * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 | |
| 41 | * $FreeBSD: src/sys/vm/vnode_pager.c,v 1.116.2.7 2002/12/31 09:34:51 dillon Exp $ | |
| e92ca23a | 42 | * $DragonFly: src/sys/vm/vnode_pager.c,v 1.43 2008/06/19 23:27:39 dillon Exp $ |
| 984263bc MD |
43 | */ |
| 44 | ||
| 45 | /* | |
| 46 | * Page to/from files (vnodes). | |
| 47 | */ | |
| 48 | ||
| 49 | /* | |
| 50 | * TODO: | |
| 51 | * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will | |
| 52 | * greatly re-simplify the vnode_pager. | |
| 53 | */ | |
| 54 | ||
| 55 | #include <sys/param.h> | |
| 56 | #include <sys/systm.h> | |
| 5fd012e0 | 57 | #include <sys/kernel.h> |
| 984263bc MD |
58 | #include <sys/proc.h> |
| 59 | #include <sys/vnode.h> | |
| 60 | #include <sys/mount.h> | |
| 61 | #include <sys/buf.h> | |
| 62 | #include <sys/vmmeter.h> | |
| 63 | #include <sys/conf.h> | |
| 06ecca5a | 64 | #include <sys/sfbuf.h> |
| cdd46d2e | 65 | #include <sys/thread2.h> |
| 984263bc MD |
66 | |
| 67 | #include <vm/vm.h> | |
| 68 | #include <vm/vm_object.h> | |
| 69 | #include <vm/vm_page.h> | |
| 70 | #include <vm/vm_pager.h> | |
| 71 | #include <vm/vm_map.h> | |
| 72 | #include <vm/vnode_pager.h> | |
| 73 | #include <vm/vm_extern.h> | |
| 74 | ||
| 1388df65 RG |
75 | static void vnode_pager_dealloc (vm_object_t); |
| 76 | static int vnode_pager_getpages (vm_object_t, vm_page_t *, int, int); | |
| 77 | static void vnode_pager_putpages (vm_object_t, vm_page_t *, int, boolean_t, int *); | |
| 78 | static boolean_t vnode_pager_haspage (vm_object_t, vm_pindex_t, int *, int *); | |
| 984263bc MD |
79 | |
| 80 | struct pagerops vnodepagerops = { | |
| 81 | NULL, | |
| 82 | vnode_pager_alloc, | |
| 83 | vnode_pager_dealloc, | |
| 84 | vnode_pager_getpages, | |
| 85 | vnode_pager_putpages, | |
| 86 | vnode_pager_haspage, | |
| 87 | NULL | |
| 88 | }; | |
| 89 | ||
| 35f59bfa MD |
90 | static struct krate vbadrate = { 1 }; |
| 91 | static struct krate vresrate = { 1 }; | |
| 92 | ||
| 984263bc MD |
93 | int vnode_pbuf_freecnt = -1; /* start out unlimited */ |
| 94 | ||
| 95 | /* | |
| 96 | * Allocate (or lookup) pager for a vnode. | |
| 97 | * Handle is a vnode pointer. | |
| 98 | */ | |
| 99 | vm_object_t | |
| 57f7b636 | 100 | vnode_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset) |
| 984263bc MD |
101 | { |
| 102 | vm_object_t object; | |
| 103 | struct vnode *vp; | |
| 104 | ||
| 105 | /* | |
| 106 | * Pageout to vnode, no can do yet. | |
| 107 | */ | |
| 108 | if (handle == NULL) | |
| 109 | return (NULL); | |
| 110 | ||
| 111 | /* | |
| 112 | * XXX hack - This initialization should be put somewhere else. | |
| 113 | */ | |
| 114 | if (vnode_pbuf_freecnt < 0) { | |
| 115 | vnode_pbuf_freecnt = nswbuf / 2 + 1; | |
| 116 | } | |
| 117 | ||
| 118 | vp = (struct vnode *) handle; | |
| 119 | ||
| 120 | /* | |
| 121 | * Prevent race condition when allocating the object. This | |
| 122 | * can happen with NFS vnodes since the nfsnode isn't locked. | |
| 123 | */ | |
| 124 | while (vp->v_flag & VOLOCK) { | |
| 125 | vp->v_flag |= VOWANT; | |
| 377d4740 | 126 | tsleep(vp, 0, "vnpobj", 0); |
| 984263bc MD |
127 | } |
| 128 | vp->v_flag |= VOLOCK; | |
| 129 | ||
| 130 | /* | |
| 131 | * If the object is being terminated, wait for it to | |
| 132 | * go away. | |
| 133 | */ | |
| 134 | while (((object = vp->v_object) != NULL) && | |
| 135 | (object->flags & OBJ_DEAD)) { | |
| 9e12ff11 | 136 | vm_object_dead_sleep(object, "vadead"); |
| 984263bc MD |
137 | } |
| 138 | ||
| 3c37c940 | 139 | if (vp->v_sysref.refcnt <= 0) |
| 984263bc MD |
140 | panic("vnode_pager_alloc: no vnode reference"); |
| 141 | ||
| 142 | if (object == NULL) { | |
| 143 | /* | |
| 144 | * And an object of the appropriate size | |
| 145 | */ | |
| 146 | object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); | |
| 147 | object->flags = 0; | |
| 984263bc MD |
148 | object->handle = handle; |
| 149 | vp->v_object = object; | |
| 57f7b636 | 150 | vp->v_filesize = size; |
| 984263bc MD |
151 | } else { |
| 152 | object->ref_count++; | |
| 973c11b9 MD |
153 | if (vp->v_filesize != size) { |
| 154 | kprintf("vnode_pager_alloc: Warning, filesize " | |
| 155 | "mismatch %lld/%lld\n", | |
| 156 | (long long)vp->v_filesize, | |
| 157 | (long long)size); | |
| 158 | } | |
| 984263bc | 159 | } |
| 3c37c940 | 160 | vref(vp); |
| 984263bc MD |
161 | |
| 162 | vp->v_flag &= ~VOLOCK; | |
| 163 | if (vp->v_flag & VOWANT) { | |
| 164 | vp->v_flag &= ~VOWANT; | |
| 165 | wakeup(vp); | |
| 166 | } | |
| 167 | return (object); | |
| 168 | } | |
| 169 | ||
| 170 | static void | |
| 57e43348 | 171 | vnode_pager_dealloc(vm_object_t object) |
| 984263bc | 172 | { |
| 5f910b2f | 173 | struct vnode *vp = object->handle; |
| 984263bc MD |
174 | |
| 175 | if (vp == NULL) | |
| 176 | panic("vnode_pager_dealloc: pager already dealloced"); | |
| 177 | ||
| 178 | vm_object_pip_wait(object, "vnpdea"); | |
| 179 | ||
| 180 | object->handle = NULL; | |
| 181 | object->type = OBJT_DEAD; | |
| 182 | vp->v_object = NULL; | |
| 57f7b636 | 183 | vp->v_filesize = NOOFFSET; |
| 984263bc MD |
184 | vp->v_flag &= ~(VTEXT | VOBJBUF); |
| 185 | } | |
| 186 | ||
| 54078292 MD |
187 | /* |
| 188 | * Return whether the vnode pager has the requested page. Return the | |
| 189 | * number of disk-contiguous pages before and after the requested page, | |
| 190 | * not including the requested page. | |
| 191 | */ | |
| 984263bc | 192 | static boolean_t |
| 57e43348 | 193 | vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, |
| 54078292 | 194 | int *after) |
| 984263bc MD |
195 | { |
| 196 | struct vnode *vp = object->handle; | |
| 54078292 MD |
197 | off_t loffset; |
| 198 | off_t doffset; | |
| 199 | int voff; | |
| 984263bc | 200 | int bsize; |
| 54078292 | 201 | int error; |
| 984263bc MD |
202 | |
| 203 | /* | |
| 204 | * If no vp or vp is doomed or marked transparent to VM, we do not | |
| 205 | * have the page. | |
| 206 | */ | |
| 5fd012e0 | 207 | if ((vp == NULL) || (vp->v_flag & VRECLAIMED)) |
| 984263bc MD |
208 | return FALSE; |
| 209 | ||
| 210 | /* | |
| 211 | * If filesystem no longer mounted or offset beyond end of file we do | |
| 212 | * not have the page. | |
| 213 | */ | |
| 54078292 MD |
214 | loffset = IDX_TO_OFF(pindex); |
| 215 | ||
| 57f7b636 | 216 | if (vp->v_mount == NULL || loffset >= vp->v_filesize) |
| 984263bc MD |
217 | return FALSE; |
| 218 | ||
| 219 | bsize = vp->v_mount->mnt_stat.f_iosize; | |
| 54078292 MD |
220 | voff = loffset % bsize; |
| 221 | ||
| bc823b32 MD |
222 | /* |
| 223 | * BMAP returns byte counts before and after, where after | |
| 224 | * is inclusive of the base page. haspage must return page | |
| 225 | * counts before and after where after does not include the | |
| 226 | * base page. | |
| 227 | * | |
| 228 | * BMAP is allowed to return a *after of 0 for backwards | |
| 229 | * compatibility. The base page is still considered valid if | |
| 230 | * no error is returned. | |
| 231 | */ | |
| e92ca23a | 232 | error = VOP_BMAP(vp, loffset - voff, &doffset, after, before, 0); |
| bc823b32 MD |
233 | if (error) { |
| 234 | if (before) | |
| 235 | *before = 0; | |
| 236 | if (after) | |
| 237 | *after = 0; | |
| 984263bc | 238 | return TRUE; |
| bc823b32 | 239 | } |
| 54078292 | 240 | if (doffset == NOOFFSET) |
| 984263bc | 241 | return FALSE; |
| 984263bc | 242 | |
| 54078292 MD |
243 | if (before) { |
| 244 | *before = (*before + voff) >> PAGE_SHIFT; | |
| 245 | } | |
| 246 | if (after) { | |
| 247 | *after -= voff; | |
| 57f7b636 MD |
248 | if (loffset + *after > vp->v_filesize) |
| 249 | *after = vp->v_filesize - loffset; | |
| 54078292 MD |
250 | *after >>= PAGE_SHIFT; |
| 251 | if (*after < 0) | |
| 252 | *after = 0; | |
| 984263bc MD |
253 | } |
| 254 | return TRUE; | |
| 255 | } | |
| 256 | ||
| 257 | /* | |
| 258 | * Lets the VM system know about a change in size for a file. | |
| 259 | * We adjust our own internal size and flush any cached pages in | |
| 260 | * the associated object that are affected by the size change. | |
| 261 | * | |
| 57f7b636 | 262 | * NOTE: This routine may be invoked as a result of a pager put |
| 984263bc | 263 | * operation (possibly at object termination time), so we must be careful. |
| 57f7b636 MD |
264 | * |
| 265 | * NOTE: vp->v_filesize is initialized to NOOFFSET (-1), be sure that | |
| 266 | * we do not blow up on the case. nsize will always be >= 0, however. | |
| 984263bc MD |
267 | */ |
| 268 | void | |
| 57e43348 | 269 | vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) |
| 984263bc MD |
270 | { |
| 271 | vm_pindex_t nobjsize; | |
| c4b46cb4 | 272 | vm_pindex_t oobjsize; |
| 984263bc MD |
273 | vm_object_t object = vp->v_object; |
| 274 | ||
| 275 | if (object == NULL) | |
| 276 | return; | |
| 277 | ||
| 278 | /* | |
| 279 | * Hasn't changed size | |
| 280 | */ | |
| 57f7b636 | 281 | if (nsize == vp->v_filesize) |
| 984263bc MD |
282 | return; |
| 283 | ||
| c4b46cb4 MD |
284 | /* |
| 285 | * Has changed size. Adjust the VM object's size and v_filesize | |
| 286 | * before we start scanning pages to prevent new pages from being | |
| 287 | * allocated during the scan. | |
| 288 | */ | |
| 984263bc | 289 | nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); |
| c4b46cb4 MD |
290 | oobjsize = object->size; |
| 291 | object->size = nobjsize; | |
| 984263bc MD |
292 | |
| 293 | /* | |
| 294 | * File has shrunk. Toss any cached pages beyond the new EOF. | |
| 295 | */ | |
| 57f7b636 | 296 | if (nsize < vp->v_filesize) { |
| 135bd6a8 | 297 | vp->v_filesize = nsize; |
| c4b46cb4 MD |
298 | if (nobjsize < oobjsize) { |
| 299 | vm_object_page_remove(object, nobjsize, oobjsize, | |
| 300 | FALSE); | |
| 984263bc MD |
301 | } |
| 302 | /* | |
| 8d429613 MD |
303 | * This gets rid of garbage at the end of a page that is now |
| 304 | * only partially backed by the vnode. Since we are setting | |
| 305 | * the entire page valid & clean after we are done we have | |
| 306 | * to be sure that the portion of the page within the file | |
| 307 | * bounds is already valid. If it isn't then making it | |
| 308 | * valid would create a corrupt block. | |
| 984263bc MD |
309 | */ |
| 310 | if (nsize & PAGE_MASK) { | |
| 311 | vm_offset_t kva; | |
| 312 | vm_page_t m; | |
| 313 | ||
| 17cde63e MD |
314 | do { |
| 315 | m = vm_page_lookup(object, OFF_TO_IDX(nsize)); | |
| 316 | } while (m && vm_page_sleep_busy(m, TRUE, "vsetsz")); | |
| 317 | ||
| 984263bc MD |
318 | if (m && m->valid) { |
| 319 | int base = (int)nsize & PAGE_MASK; | |
| 320 | int size = PAGE_SIZE - base; | |
| 06ecca5a | 321 | struct sf_buf *sf; |
| 984263bc MD |
322 | |
| 323 | /* | |
| 324 | * Clear out partial-page garbage in case | |
| 325 | * the page has been mapped. | |
| 326 | */ | |
| 17cde63e | 327 | vm_page_busy(m); |
| 4f1640d6 | 328 | sf = sf_buf_alloc(m, SFB_CPUPRIVATE); |
| 06ecca5a | 329 | kva = sf_buf_kva(sf); |
| 984263bc | 330 | bzero((caddr_t)kva + base, size); |
| 06ecca5a | 331 | sf_buf_free(sf); |
| 984263bc MD |
332 | |
| 333 | /* | |
| 334 | * XXX work around SMP data integrity race | |
| 335 | * by unmapping the page from user processes. | |
| 336 | * The garbage we just cleared may be mapped | |
| 337 | * to a user process running on another cpu | |
| 338 | * and this code is not running through normal | |
| 339 | * I/O channels which handle SMP issues for | |
| 340 | * us, so unmap page to synchronize all cpus. | |
| 341 | * | |
| 342 | * XXX should vm_pager_unmap_page() have | |
| 343 | * dealt with this? | |
| 344 | */ | |
| 345 | vm_page_protect(m, VM_PROT_NONE); | |
| 346 | ||
| 347 | /* | |
| 348 | * Clear out partial-page dirty bits. This | |
| 349 | * has the side effect of setting the valid | |
| 350 | * bits, but that is ok. There are a bunch | |
| 351 | * of places in the VM system where we expected | |
| 352 | * m->dirty == VM_PAGE_BITS_ALL. The file EOF | |
| 353 | * case is one of them. If the page is still | |
| 354 | * partially dirty, make it fully dirty. | |
| 355 | * | |
| 356 | * note that we do not clear out the valid | |
| 357 | * bits. This would prevent bogus_page | |
| 358 | * replacement from working properly. | |
| 359 | */ | |
| 360 | vm_page_set_validclean(m, base, size); | |
| 361 | if (m->dirty != 0) | |
| 362 | m->dirty = VM_PAGE_BITS_ALL; | |
| 17cde63e | 363 | vm_page_wakeup(m); |
| 984263bc MD |
364 | } |
| 365 | } | |
| 135bd6a8 MD |
366 | } else { |
| 367 | vp->v_filesize = nsize; | |
| 984263bc | 368 | } |
| 984263bc MD |
369 | } |
| 370 | ||
| a55afca2 MD |
371 | /* |
| 372 | * Release a page busied for a getpages operation. The page may have become | |
| 373 | * wired (typically due to being used by the buffer cache) or otherwise been | |
| 374 | * soft-busied and cannot be freed in that case. A held page can still be | |
| 375 | * freed. | |
| 376 | */ | |
| 984263bc | 377 | void |
| 57e43348 | 378 | vnode_pager_freepage(vm_page_t m) |
| 984263bc | 379 | { |
| a55afca2 MD |
380 | if (m->busy || m->wire_count) { |
| 381 | vm_page_activate(m); | |
| 382 | vm_page_wakeup(m); | |
| 383 | } else { | |
| 384 | vm_page_free(m); | |
| 385 | } | |
| 984263bc MD |
386 | } |
| 387 | ||
| 388 | /* | |
| 984263bc MD |
389 | * EOPNOTSUPP is no longer legal. For local media VFS's that do not |
| 390 | * implement their own VOP_GETPAGES, their VOP_GETPAGES should call to | |
| 391 | * vnode_pager_generic_getpages() to implement the previous behaviour. | |
| 392 | * | |
| 393 | * All other FS's should use the bypass to get to the local media | |
| 394 | * backing vp's VOP_GETPAGES. | |
| 395 | */ | |
| 396 | static int | |
| 57e43348 | 397 | vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) |
| 984263bc MD |
398 | { |
| 399 | int rtval; | |
| 400 | struct vnode *vp; | |
| 401 | int bytes = count * PAGE_SIZE; | |
| 402 | ||
| 403 | vp = object->handle; | |
| 984263bc | 404 | rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0); |
| a05863a7 MD |
405 | if (rtval == EOPNOTSUPP) |
| 406 | panic("vnode_pager: vfs's must implement vop_getpages\n"); | |
| 984263bc MD |
407 | return rtval; |
| 408 | } | |
| 409 | ||
| 984263bc MD |
410 | /* |
| 411 | * This is now called from local media FS's to operate against their | |
| 412 | * own vnodes if they fail to implement VOP_GETPAGES. | |
| a05863a7 MD |
413 | * |
| 414 | * With all the caching local media devices do these days there is really | |
| 415 | * very little point to attempting to restrict the I/O size to contiguous | |
| 416 | * blocks on-disk, especially if our caller thinks we need all the specified | |
| 417 | * pages. Just construct and issue a READ. | |
| 984263bc MD |
418 | */ |
| 419 | int | |
| 57e43348 | 420 | vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount, |
| a05863a7 | 421 | int reqpage) |
| 984263bc | 422 | { |
| a05863a7 MD |
423 | struct iovec aiov; |
| 424 | struct uio auio; | |
| 425 | off_t foff; | |
| 426 | int error; | |
| 984263bc | 427 | int count; |
| a05863a7 MD |
428 | int i; |
| 429 | int ioflags; | |
| 984263bc MD |
430 | |
| 431 | /* | |
| a05863a7 | 432 | * Do not do anything if the vnode is bad. |
| 984263bc | 433 | */ |
| a05863a7 MD |
434 | if (vp->v_mount == NULL) |
| 435 | return VM_PAGER_BAD; | |
| 984263bc MD |
436 | |
| 437 | /* | |
| a05863a7 MD |
438 | * Calculate the number of pages. Since we are paging in whole |
| 439 | * pages, adjust bytecount to be an integral multiple of the page | |
| 440 | * size. It will be clipped to the file EOF later on. | |
| 984263bc | 441 | */ |
| a05863a7 MD |
442 | bytecount = round_page(bytecount); |
| 443 | count = bytecount / PAGE_SIZE; | |
| 984263bc MD |
444 | |
| 445 | /* | |
| 446 | * If we have a completely valid page available to us, we can | |
| 447 | * clean up and return. Otherwise we have to re-read the | |
| 448 | * media. | |
| 8d429613 MD |
449 | * |
| 450 | * Note that this does not work with NFS, so NFS has its own | |
| 451 | * getpages routine. The problem is that NFS can have partially | |
| 452 | * valid pages associated with the buffer cache due to the piecemeal | |
| 453 | * write support. If we were to fall through and re-read the media | |
| 454 | * as we do here, dirty data could be lost. | |
| 984263bc | 455 | */ |
| 984263bc MD |
456 | if (m[reqpage]->valid == VM_PAGE_BITS_ALL) { |
| 457 | for (i = 0; i < count; i++) { | |
| 458 | if (i != reqpage) | |
| 459 | vnode_pager_freepage(m[i]); | |
| 460 | } | |
| 461 | return VM_PAGER_OK; | |
| 462 | } | |
| 984263bc MD |
463 | |
| 464 | /* | |
| a05863a7 MD |
465 | * Discard pages past the file EOF. If the requested page is past |
| 466 | * the file EOF we just leave its valid bits set to 0, the caller | |
| 467 | * expects to maintain ownership of the requested page. If the | |
| 468 | * entire range is past file EOF discard everything and generate | |
| 469 | * a pagein error. | |
| 984263bc | 470 | */ |
| a05863a7 MD |
471 | foff = IDX_TO_OFF(m[0]->pindex); |
| 472 | if (foff >= vp->v_filesize) { | |
| 473 | for (i = 0; i < count; i++) { | |
| 474 | if (i != reqpage) | |
| 475 | vnode_pager_freepage(m[i]); | |
| 984263bc | 476 | } |
| a05863a7 | 477 | return VM_PAGER_ERROR; |
| 984263bc MD |
478 | } |
| 479 | ||
| a05863a7 MD |
480 | if (foff + bytecount > vp->v_filesize) { |
| 481 | bytecount = vp->v_filesize - foff; | |
| 482 | i = round_page(bytecount) / PAGE_SIZE; | |
| 483 | while (count > i) { | |
| 484 | --count; | |
| 485 | if (count != reqpage) | |
| 486 | vnode_pager_freepage(m[count]); | |
| 984263bc | 487 | } |
| 984263bc MD |
488 | } |
| 489 | ||
| 490 | /* | |
| a05863a7 MD |
491 | * The size of the transfer is bytecount. bytecount will be an |
| 492 | * integral multiple of the page size unless it has been clipped | |
| 493 | * to the file EOF. The transfer cannot exceed the file EOF. | |
| 494 | * | |
| 495 | * When dealing with real devices we must round-up to the device | |
| 496 | * sector size. | |
| 984263bc | 497 | */ |
| a05863a7 MD |
498 | if (vp->v_type == VBLK || vp->v_type == VCHR) { |
| 499 | int secmask = vp->v_rdev->si_bsize_phys - 1; | |
| 984263bc | 500 | KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large\n", secmask + 1)); |
| a05863a7 | 501 | bytecount = (bytecount + secmask) & ~secmask; |
| 984263bc MD |
502 | } |
| 503 | ||
| 984263bc | 504 | /* |
| ca28958c MD |
505 | * Severe hack to avoid deadlocks with the buffer cache |
| 506 | */ | |
| 507 | for (i = 0; i < count; ++i) { | |
| 508 | vm_page_t mt = m[i]; | |
| 509 | ||
| 510 | vm_page_io_start(mt); | |
| 511 | vm_page_wakeup(mt); | |
| 512 | } | |
| 513 | ||
| 514 | /* | |
| a05863a7 | 515 | * Issue the I/O without any read-ahead |
| 984263bc | 516 | */ |
| a05863a7 MD |
517 | ioflags = IO_VMIO; |
| 518 | /*ioflags |= IO_SEQMAX << IO_SEQSHIFT;*/ | |
| 519 | ||
| 520 | aiov.iov_base = (caddr_t) 0; | |
| 521 | aiov.iov_len = bytecount; | |
| 522 | auio.uio_iov = &aiov; | |
| 523 | auio.uio_iovcnt = 1; | |
| 524 | auio.uio_offset = foff; | |
| 525 | auio.uio_segflg = UIO_NOCOPY; | |
| 526 | auio.uio_rw = UIO_READ; | |
| 527 | auio.uio_resid = bytecount; | |
| 528 | auio.uio_td = NULL; | |
| 12e4aaff MD |
529 | mycpu->gd_cnt.v_vnodein++; |
| 530 | mycpu->gd_cnt.v_vnodepgsin += count; | |
| 984263bc | 531 | |
| a05863a7 | 532 | error = VOP_READ(vp, &auio, ioflags, proc0.p_ucred); |
| 984263bc MD |
533 | |
| 534 | /* | |
| ca28958c MD |
535 | * Severe hack to avoid deadlocks with the buffer cache |
| 536 | */ | |
| 537 | for (i = 0; i < count; ++i) { | |
| 538 | vm_page_t mt = m[i]; | |
| 539 | ||
| 540 | while (vm_page_sleep_busy(mt, FALSE, "getpgs")) | |
| 541 | ; | |
| 542 | vm_page_busy(mt); | |
| 543 | vm_page_io_finish(mt); | |
| 544 | } | |
| 545 | ||
| 546 | /* | |
| a05863a7 MD |
547 | * Calculate the actual number of bytes read and clean up the |
| 548 | * page list. | |
| 984263bc | 549 | */ |
| a05863a7 | 550 | bytecount -= auio.uio_resid; |
| 984263bc | 551 | |
| a05863a7 | 552 | for (i = 0; i < count; ++i) { |
| a55afca2 MD |
553 | vm_page_t mt = m[i]; |
| 554 | ||
| a05863a7 | 555 | if (i != reqpage) { |
| a55afca2 | 556 | if (error == 0 && mt->valid) { |
| 984263bc MD |
557 | if (mt->flags & PG_WANTED) |
| 558 | vm_page_activate(mt); | |
| 559 | else | |
| 560 | vm_page_deactivate(mt); | |
| 561 | vm_page_wakeup(mt); | |
| 562 | } else { | |
| 563 | vnode_pager_freepage(mt); | |
| 564 | } | |
| a55afca2 MD |
565 | } else if (mt->valid == 0) { |
| 566 | if (error == 0) { | |
| 567 | kprintf("page failed but no I/O error page %p object %p pindex %d\n", mt, mt->object, (int) mt->pindex); | |
| 568 | /* whoops, something happened */ | |
| 569 | error = EINVAL; | |
| 570 | } | |
| 571 | } else if (mt->valid != VM_PAGE_BITS_ALL) { | |
| 572 | /* | |
| 573 | * Zero-extend the requested page if necessary (if | |
| 574 | * the filesystem is using a small block size). | |
| 575 | */ | |
| 576 | vm_page_zero_invalid(mt, TRUE); | |
| 984263bc MD |
577 | } |
| 578 | } | |
| 579 | if (error) { | |
| 086c1d7e | 580 | kprintf("vnode_pager_getpages: I/O read error\n"); |
| 984263bc MD |
581 | } |
| 582 | return (error ? VM_PAGER_ERROR : VM_PAGER_OK); | |
| 583 | } | |
| 584 | ||
| 585 | /* | |
| 586 | * EOPNOTSUPP is no longer legal. For local media VFS's that do not | |
| 587 | * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to | |
| 588 | * vnode_pager_generic_putpages() to implement the previous behaviour. | |
| 589 | * | |
| 590 | * All other FS's should use the bypass to get to the local media | |
| 591 | * backing vp's VOP_PUTPAGES. | |
| 592 | */ | |
| 593 | static void | |
| 57e43348 MD |
594 | vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, |
| 595 | boolean_t sync, int *rtvals) | |
| 984263bc MD |
596 | { |
| 597 | int rtval; | |
| 598 | struct vnode *vp; | |
| 599 | int bytes = count * PAGE_SIZE; | |
| 600 | ||
| 601 | /* | |
| 602 | * Force synchronous operation if we are extremely low on memory | |
| 603 | * to prevent a low-memory deadlock. VOP operations often need to | |
| 604 | * allocate more memory to initiate the I/O ( i.e. do a BMAP | |
| 605 | * operation ). The swapper handles the case by limiting the amount | |
| 606 | * of asynchronous I/O, but that sort of solution doesn't scale well | |
| 607 | * for the vnode pager without a lot of work. | |
| 608 | * | |
| 609 | * Also, the backing vnode's iodone routine may not wake the pageout | |
| 610 | * daemon up. This should be probably be addressed XXX. | |
| 611 | */ | |
| 612 | ||
| 12e4aaff | 613 | if ((vmstats.v_free_count + vmstats.v_cache_count) < vmstats.v_pageout_free_min) |
| 984263bc MD |
614 | sync |= OBJPC_SYNC; |
| 615 | ||
| 616 | /* | |
| 617 | * Call device-specific putpages function | |
| 618 | */ | |
| 619 | ||
| 620 | vp = object->handle; | |
| 621 | rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0); | |
| 622 | if (rtval == EOPNOTSUPP) { | |
| 086c1d7e | 623 | kprintf("vnode_pager: *** WARNING *** stale FS putpages\n"); |
| 984263bc MD |
624 | rtval = vnode_pager_generic_putpages( vp, m, bytes, sync, rtvals); |
| 625 | } | |
| 626 | } | |
| 627 | ||
| 628 | ||
| 629 | /* | |
| 630 | * This is now called from local media FS's to operate against their | |
| 631 | * own vnodes if they fail to implement VOP_PUTPAGES. | |
| 632 | * | |
| 633 | * This is typically called indirectly via the pageout daemon and | |
| 634 | * clustering has already typically occured, so in general we ask the | |
| 635 | * underlying filesystem to write the data out asynchronously rather | |
| 636 | * then delayed. | |
| 637 | */ | |
| 638 | int | |
| 57e43348 | 639 | vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount, |
| a05863a7 | 640 | int flags, int *rtvals) |
| 984263bc MD |
641 | { |
| 642 | int i; | |
| 643 | vm_object_t object; | |
| 644 | int count; | |
| 645 | ||
| 646 | int maxsize, ncount; | |
| 647 | vm_ooffset_t poffset; | |
| 648 | struct uio auio; | |
| 649 | struct iovec aiov; | |
| 650 | int error; | |
| 651 | int ioflags; | |
| 652 | ||
| 653 | object = vp->v_object; | |
| 654 | count = bytecount / PAGE_SIZE; | |
| 655 | ||
| 656 | for (i = 0; i < count; i++) | |
| 657 | rtvals[i] = VM_PAGER_AGAIN; | |
| 658 | ||
| 659 | if ((int) m[0]->pindex < 0) { | |
| 086c1d7e | 660 | kprintf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n", |
| 984263bc MD |
661 | (long)m[0]->pindex, m[0]->dirty); |
| 662 | rtvals[0] = VM_PAGER_BAD; | |
| 663 | return VM_PAGER_BAD; | |
| 664 | } | |
| 665 | ||
| 666 | maxsize = count * PAGE_SIZE; | |
| 667 | ncount = count; | |
| 668 | ||
| 669 | poffset = IDX_TO_OFF(m[0]->pindex); | |
| 670 | ||
| 671 | /* | |
| 672 | * If the page-aligned write is larger then the actual file we | |
| 673 | * have to invalidate pages occuring beyond the file EOF. However, | |
| 674 | * there is an edge case where a file may not be page-aligned where | |
| 675 | * the last page is partially invalid. In this case the filesystem | |
| 676 | * may not properly clear the dirty bits for the entire page (which | |
| 677 | * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). | |
| 678 | * With the page locked we are free to fix-up the dirty bits here. | |
| 679 | * | |
| 680 | * We do not under any circumstances truncate the valid bits, as | |
| 681 | * this will screw up bogus page replacement. | |
| 93afe6be MD |
682 | * |
| 683 | * The caller has already read-protected the pages. The VFS must | |
| 684 | * use the buffer cache to wrap the pages. The pages might not | |
| 685 | * be immediately flushed by the buffer cache but once under its | |
| 686 | * control the pages themselves can wind up being marked clean | |
| 687 | * and their covering buffer cache buffer can be marked dirty. | |
| 984263bc | 688 | */ |
| 57f7b636 MD |
689 | if (maxsize + poffset > vp->v_filesize) { |
| 690 | if (vp->v_filesize > poffset) { | |
| 984263bc MD |
691 | int pgoff; |
| 692 | ||
| 57f7b636 | 693 | maxsize = vp->v_filesize - poffset; |
| 984263bc MD |
694 | ncount = btoc(maxsize); |
| 695 | if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { | |
| 696 | vm_page_clear_dirty(m[ncount - 1], pgoff, | |
| 697 | PAGE_SIZE - pgoff); | |
| 698 | } | |
| 699 | } else { | |
| 700 | maxsize = 0; | |
| 701 | ncount = 0; | |
| 702 | } | |
| 703 | if (ncount < count) { | |
| 704 | for (i = ncount; i < count; i++) { | |
| 705 | rtvals[i] = VM_PAGER_BAD; | |
| 706 | } | |
| 707 | } | |
| 708 | } | |
| 709 | ||
| 710 | /* | |
| 711 | * pageouts are already clustered, use IO_ASYNC to force a bawrite() | |
| 712 | * rather then a bdwrite() to prevent paging I/O from saturating | |
| 713 | * the buffer cache. Dummy-up the sequential heuristic to cause | |
| 714 | * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set, | |
| 715 | * the system decides how to cluster. | |
| 716 | */ | |
| 717 | ioflags = IO_VMIO; | |
| 718 | if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) | |
| 719 | ioflags |= IO_SYNC; | |
| 720 | else if ((flags & VM_PAGER_CLUSTER_OK) == 0) | |
| 721 | ioflags |= IO_ASYNC; | |
| 722 | ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0; | |
| 723 | ioflags |= IO_SEQMAX << IO_SEQSHIFT; | |
| 724 | ||
| 725 | aiov.iov_base = (caddr_t) 0; | |
| 726 | aiov.iov_len = maxsize; | |
| 727 | auio.uio_iov = &aiov; | |
| 728 | auio.uio_iovcnt = 1; | |
| 729 | auio.uio_offset = poffset; | |
| 730 | auio.uio_segflg = UIO_NOCOPY; | |
| 731 | auio.uio_rw = UIO_WRITE; | |
| 732 | auio.uio_resid = maxsize; | |
| dadab5e9 | 733 | auio.uio_td = NULL; |
| 8a8d5d85 | 734 | error = VOP_WRITE(vp, &auio, ioflags, proc0.p_ucred); |
| 12e4aaff MD |
735 | mycpu->gd_cnt.v_vnodeout++; |
| 736 | mycpu->gd_cnt.v_vnodepgsout += ncount; | |
| 984263bc MD |
737 | |
| 738 | if (error) { | |
| 35f59bfa MD |
739 | krateprintf(&vbadrate, |
| 740 | "vnode_pager_putpages: I/O error %d\n", error); | |
| 984263bc MD |
741 | } |
| 742 | if (auio.uio_resid) { | |
| 35f59bfa MD |
743 | krateprintf(&vresrate, |
| 744 | "vnode_pager_putpages: residual I/O %d at %lu\n", | |
| 745 | auio.uio_resid, (u_long)m[0]->pindex); | |
| 984263bc | 746 | } |
| 8a63550c | 747 | for (i = 0; i < ncount; i++) |
| 984263bc | 748 | rtvals[i] = VM_PAGER_OK; |
| 984263bc MD |
749 | return rtvals[0]; |
| 750 | } | |
| 751 | ||
| 752 | struct vnode * | |
| dadab5e9 | 753 | vnode_pager_lock(vm_object_t object) |
| 984263bc | 754 | { |
| dadab5e9 | 755 | struct thread *td = curthread; /* XXX */ |
| 5fd012e0 | 756 | int error; |
| 984263bc MD |
757 | |
| 758 | for (; object != NULL; object = object->backing_object) { | |
| 759 | if (object->type != OBJT_VNODE) | |
| 760 | continue; | |
| 761 | if (object->flags & OBJ_DEAD) | |
| 762 | return NULL; | |
| 763 | ||
| 5fd012e0 MD |
764 | for (;;) { |
| 765 | struct vnode *vp = object->handle; | |
| ab6f251b | 766 | error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE); |
| 5fd012e0 MD |
767 | if (error == 0) { |
| 768 | if (object->handle != vp) { | |
| 769 | vput(vp); | |
| 770 | continue; | |
| 771 | } | |
| 772 | return (vp); | |
| 773 | } | |
| dadab5e9 MD |
774 | if ((object->flags & OBJ_DEAD) || |
| 775 | (object->type != OBJT_VNODE)) { | |
| 984263bc | 776 | return NULL; |
| dadab5e9 | 777 | } |
| 086c1d7e | 778 | kprintf("vnode_pager_lock: vp %p error %d lockstatus %d, retrying\n", vp, error, lockstatus(&vp->v_lock, td)); |
| 5fd012e0 | 779 | tsleep(object->handle, 0, "vnpgrl", hz); |
| 984263bc | 780 | } |
| 984263bc MD |
781 | } |
| 782 | return NULL; | |
| 783 | } |