/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pager.c	8.6 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_pager.c,v 1.54.2.2 2001/11/18 07:11:00 dillon Exp $
 * $DragonFly: src/sys/vm/vm_pager.c,v 1.11 2004/07/14 03:10:17 hmp Exp $
 */

/*
 * Paging space routine stubs.  Emulates a matchmaker-like interface
 * for the built-in pagers.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/ucred.h>
#include <sys/malloc.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/buf2.h>

MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data");

extern struct pagerops defaultpagerops;
extern struct pagerops swappagerops;
extern struct pagerops vnodepagerops;
extern struct pagerops devicepagerops;
extern struct pagerops physpagerops;

int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */

static int dead_pager_getpages (vm_object_t, vm_page_t *, int, int);
static vm_object_t dead_pager_alloc (void *, vm_ooffset_t, vm_prot_t,
	vm_ooffset_t);
static void dead_pager_putpages (vm_object_t, vm_page_t *, int, int, int *);
static boolean_t dead_pager_haspage (vm_object_t, vm_pindex_t, int *, int *);
static void dead_pager_dealloc (vm_object_t);

static int
dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int req)
{
	return VM_PAGER_FAIL;
}

static vm_object_t
dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
	vm_ooffset_t off)
{
	return NULL;
}

static void
dead_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags,
	int *rtvals)
{
	int i;

	for (i = 0; i < count; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}
}

static boolean_t
dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next)
{
	if (prev)
		*prev = 0;
	if (next)
		*next = 0;
	return FALSE;
}

static void
dead_pager_dealloc(vm_object_t object)
{
	return;
}

static struct pagerops deadpagerops = {
	NULL,			/* pgo_init */
	dead_pager_alloc,	/* pgo_alloc */
	dead_pager_dealloc,	/* pgo_dealloc */
	dead_pager_getpages,	/* pgo_getpages */
	dead_pager_putpages,	/* pgo_putpages */
	dead_pager_haspage,	/* pgo_haspage */
	NULL
};

struct pagerops *pagertab[] = {
	&defaultpagerops,	/* OBJT_DEFAULT */
	&swappagerops,		/* OBJT_SWAP */
	&vnodepagerops,		/* OBJT_VNODE */
	&devicepagerops,	/* OBJT_DEVICE */
	&physpagerops,		/* OBJT_PHYS */
	&deadpagerops		/* OBJT_DEAD */
};

int npagers = sizeof(pagertab) / sizeof(pagertab[0]);

/*
 * Kernel address space for mapping pages.
 * Used by pagers that need KVA for I/O.
 *
 * XXX for best efficiency this needs to be large enough to hold the
 * maximum number of pending async cleaning requests (NPENDINGIO == 64)
 * times the maximum swap cluster size (MAXPHYS == 64K).
 */
#define PAGER_MAP_SIZE	(8 * 1024 * 1024)
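/*
 * Worked out from the figures above: 64 pending requests times a 64KB
 * swap cluster is 4MB, so the 8MB map leaves roughly a 2x margin for
 * pbuf KVA and other transient pager mappings.
 */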

int pager_map_size = PAGER_MAP_SIZE;
vm_map_t pager_map;
static int bswneeded;
static vm_offset_t swapbkva;	/* swap buffers kva */

void
vm_pager_init(void)
{
	struct pagerops **pgops;

	/*
	 * Initialize known pagers
	 */
	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) {
		if ((*pgops)->pgo_init != NULL)
			(*(*pgops)->pgo_init) ();
	}
}

void
vm_pager_bufferinit(void)
{
	struct buf *bp;
	int i;

	bp = swbuf;
	/*
	 * Now set up swap and physical I/O buffer headers.
	 */
	for (i = 0; i < nswbuf; i++, bp++) {
		TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
		BUF_LOCKINIT(bp);
		LIST_INIT(&bp->b_dep);
		bp->b_xflags = 0;
	}

	cluster_pbuf_freecnt = nswbuf / 2;

	swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS);
	if (!swapbkva)
		panic("Not enough pager_map VM space for physical buffers");
}

/*
 * Allocate an instance of a pager of the given type.
 * Size, protection and offset parameters are passed in for pagers that
 * need to perform page-level validation (e.g. the device pager).
 */
vm_object_t
vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
	vm_prot_t prot, vm_ooffset_t off)
{
	struct pagerops *ops;

	ops = pagertab[type];
	if (ops)
		return ((*ops->pgo_alloc) (handle, size, prot, off));
	return (NULL);
}
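
/*
 * Illustrative sketch only: a vnode pager object for a hypothetical
 * vnode 'vp' covering 'size' bytes could be obtained with
 *
 *	vm_object_t object;
 *
 *	object = vm_pager_allocate(OBJT_VNODE, vp, size, VM_PROT_ALL, 0);
 *	if (object == NULL)
 *		(no pager object could be allocated)
 *
 * The call simply dispatches through pagertab[] to the matching
 * pgo_alloc routine, the vnode pager's in this case.
 */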

void
vm_pager_deallocate(vm_object_t object)
{
	(*pagertab[object->type]->pgo_dealloc) (object);
}

/*
 * vm_pager_strategy:
 *
 * Execute the pager's strategy routine directly, or fail the buffer
 * with ENXIO if the pager does not implement one.  Called with no
 * specific spl.
 */
void
vm_pager_strategy(vm_object_t object, struct buf *bp)
{
	if (pagertab[object->type]->pgo_strategy) {
		(*pagertab[object->type]->pgo_strategy)(object, bp);
	} else {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		biodone(bp);
	}
}

/*
 * vm_pager_get_pages() - inline, see vm/vm_pager.h
 * vm_pager_put_pages() - inline, see vm/vm_pager.h
 * vm_pager_has_page() - inline, see vm/vm_pager.h
 * vm_pager_page_inserted() - inline, see vm/vm_pager.h
 * vm_pager_page_removed() - inline, see vm/vm_pager.h
 */

#if 0
/*
 * vm_pager_sync:
 *
 * Called by the pageout daemon before going back to sleep.
 * Gives pagers a chance to clean up any completed async paging
 * operations.
 */
void
vm_pager_sync(void)
{
	struct pagerops **pgops;

	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) {
		if ((*pgops)->pgo_sync != NULL)
			(*(*pgops)->pgo_sync) ();
	}
}

#endif

/*
 * Look up a VM object on the given pager list by its backing handle.
 * Returns the object, or NULL if no match is found.
 */
vm_object_t
vm_pager_object_lookup(struct pagerlst *pg_list, void *handle)
{
	vm_object_t object;

	for (object = TAILQ_FIRST(pg_list); object != NULL;
	     object = TAILQ_NEXT(object, pager_object_list)) {
		if (object->handle == handle)
			return (object);
	}
	return (NULL);
}

/*
 * initialize a physical buffer
 */
static void
initpbuf(struct buf *bp)
{
	bp->b_qindex = QUEUE_NONE;
	bp->b_data = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva;
	bp->b_kvabase = bp->b_data;
	bp->b_kvasize = MAXPHYS;
	bp->b_xflags = 0;
	bp->b_flags = 0;
	bp->b_error = 0;
	xio_init(&bp->b_xio);
	BUF_LOCK(bp, LK_EXCLUSIVE);
}

/*
 * allocate a physical buffer
 *
 * There are a limited number (nswbuf) of physical buffers.  We need
 * to make sure that no single subsystem is able to hog all of them,
 * so each subsystem implements a counter which is typically initialized
 * to 1/2 nswbuf.  getpbuf() decrements this counter on allocation and
 * increments it on release, and blocks if the counter hits zero.  A
 * subsystem may initialize the counter to -1 to disable the feature,
 * but it must still be sure to match up all uses of getpbuf() with
 * relpbuf() using the same variable.  An illustrative usage sketch
 * follows the function below.
 *
 * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
 * relatively soon when the rest of the subsystems get smart about it. XXX
 */
struct buf *
getpbuf(int *pfreecnt)
{
	int s;
	struct buf *bp;

	s = splvm();

	for (;;) {
		if (pfreecnt) {
			while (*pfreecnt == 0) {
				tsleep(pfreecnt, 0, "wswbuf0", 0);
			}
		}

		/* get a bp from the swap buffer header pool */
		if ((bp = TAILQ_FIRST(&bswlist)) != NULL)
			break;

		bswneeded = 1;
		tsleep(&bswneeded, 0, "wswbuf1", 0);
		/* loop in case someone else grabbed one */
	}
	TAILQ_REMOVE(&bswlist, bp, b_freelist);
	if (pfreecnt)
		--*pfreecnt;
	splx(s);

	initpbuf(bp);
	return bp;
}
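
/*
 * Illustrative sketch of the pfreecnt protocol described above.  The
 * counter name is hypothetical; each subsystem declares its own and
 * typically initializes it from nswbuf:
 *
 *	static int my_pbuf_freecnt = -1;
 *
 *	my_pbuf_freecnt = nswbuf / 2;		(at subsystem init)
 *	...
 *	bp = getpbuf(&my_pbuf_freecnt);		(may block)
 *	(set up bp and issue the I/O)
 *	relpbuf(bp, &my_pbuf_freecnt);		(always the same counter)
 *
 * trypbuf() below takes the same counter but returns NULL rather than
 * blocking when no buffer is available.
 */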

/*
 * allocate a physical buffer, if one is available.
 *
 * Note that there is no NULL hack here - all subsystems using this
 * call understand how to use pfreecnt.
 */
struct buf *
trypbuf(int *pfreecnt)
{
	int s;
	struct buf *bp;

	s = splvm();
	if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) {
		splx(s);
		return NULL;
	}
	TAILQ_REMOVE(&bswlist, bp, b_freelist);

	--*pfreecnt;

	splx(s);

	initpbuf(bp);

	return bp;
}

/*
 * release a physical buffer
 *
 * NOTE: pfreecnt can be NULL, but this 'feature' will be removed
 * relatively soon when the rest of the subsystems get smart about it. XXX
 */
void
relpbuf(struct buf *bp, int *pfreecnt)
{
	int s;

	s = splvm();

	if (bp->b_vp)
		pbrelvp(bp);

	BUF_UNLOCK(bp);

	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);

	if (bswneeded) {
		bswneeded = 0;
		wakeup(&bswneeded);
	}
	if (pfreecnt) {
		if (++*pfreecnt == 1)
			wakeup(pfreecnt);
	}
	splx(s);
}

/********************************************************
 *		CHAINING FUNCTIONS			*
 ********************************************************
 *
 *	These functions support recursion of I/O operations
 *	on bp's, typically by chaining one or more 'child' bp's
 *	to the parent.  Synchronous, asynchronous, and semi-synchronous
 *	chaining is possible.
 */
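
/*
 * Illustrative sketch of the chaining pattern (the parent bp, vnode, and
 * loop condition are hypothetical; see the functions below):
 *
 *	while (parent bp has more work) {
 *		nbp = getchainbuf(bp, vp, B_ASYNC);
 *		(fill in nbp->b_data, b_bcount, b_blkno, ...)
 *		flushchainbuf(nbp);
 *	}
 *	waitchainbuf(bp, 0, 1);	(synchronous: wait, then biodone the parent)
 *
 * or, to complete the parent asynchronously when the last child finishes:
 *
 *	autochaindone(bp);
 */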

/*
 * vm_pager_chain_iodone:
 *
 * I/O completion routine for a child bp.  Currently we fudge a bit
 * on dealing with b_resid.  Since users of these routines may issue
 * multiple children simultaneously, sequencing of the error can be lost.
 */
static void
vm_pager_chain_iodone(struct buf *nbp)
{
	struct buf *bp;

	if ((bp = nbp->b_chain.parent) != NULL) {
		if (nbp->b_flags & B_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = nbp->b_error;
		} else if (nbp->b_resid != 0) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		} else {
			bp->b_resid -= nbp->b_bcount;
		}
		nbp->b_chain.parent = NULL;
		--bp->b_chain.count;
		if (bp->b_flags & B_WANT) {
			bp->b_flags &= ~B_WANT;
			wakeup(bp);
		}
		if (!bp->b_chain.count && (bp->b_xflags & BX_AUTOCHAINDONE)) {
			bp->b_xflags &= ~BX_AUTOCHAINDONE;
			if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
				bp->b_flags |= B_ERROR;
				bp->b_error = EINVAL;
			}
			biodone(bp);
		}
	}
	nbp->b_flags |= B_DONE;
	nbp->b_flags &= ~B_ASYNC;
	relpbuf(nbp, NULL);
}

/*
 * getchainbuf:
 *
 * Obtain a physical buffer and chain it to its parent buffer.  When
 * I/O completes, any waiter sleeping on the parent buffer (B_WANT) is
 * woken up, and errors are automatically propagated to the parent.
 *
 * Since these are brand new buffers, we do not have to clear B_INVAL
 * and B_ERROR because they are already clear.
 */
struct buf *
getchainbuf(struct buf *bp, struct vnode *vp, int flags)
{
	struct buf *nbp = getpbuf(NULL);

	nbp->b_chain.parent = bp;
	++bp->b_chain.count;

	if (bp->b_chain.count > 4)
		waitchainbuf(bp, 4, 0);

	nbp->b_flags = B_CALL | (bp->b_flags & B_ORDERED) | flags;
	nbp->b_iodone = vm_pager_chain_iodone;

	if (vp)
		pbgetvp(vp, nbp);
	return(nbp);
}

void
flushchainbuf(struct buf *nbp)
{
	if (nbp->b_bcount) {
		nbp->b_bufsize = nbp->b_bcount;
		if ((nbp->b_flags & B_READ) == 0)
			nbp->b_dirtyend = nbp->b_bcount;
		BUF_KERNPROC(nbp);
		VOP_STRATEGY(nbp->b_vp, nbp);
	} else {
		biodone(nbp);
	}
}

void
waitchainbuf(struct buf *bp, int count, int done)
{
	int s;

	s = splbio();
	while (bp->b_chain.count > count) {
		bp->b_flags |= B_WANT;
		tsleep(bp, 0, "bpchain", 0);
	}
	if (done) {
		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		}
		biodone(bp);
	}
	splx(s);
}

void
autochaindone(struct buf *bp)
{
	int s;

	s = splbio();
	if (bp->b_chain.count == 0)
		biodone(bp);
	else
		bp->b_xflags |= BX_AUTOCHAINDONE;
	splx(s);
}